forked from mindspore-Ecosystem/mindspore

add judgement for CL

This commit is contained in:
parent 6a1e6b01f7
commit d831f8df5e
build.sh: 8 changed lines
@@ -510,12 +510,8 @@ gene_ocl_program() {
 
 build_opencl() {
   cd ${BASEPATH}
-  if [[ ! -d "third_party/OpenCL-Headers" ]]; then
-    git submodule update --init third_party/OpenCL-Headers
-  fi
-  if [[ ! -d "third_party/OpenCL-CLHPP" ]]; then
-    git submodule update --init third_party/OpenCL-CLHPP
-  fi
+  git submodule update --init third_party/OpenCL-Headers
+  git submodule update --init third_party/OpenCL-CLHPP
   if [[ "${OPENCL_OFFLINE_COMPILE}" == "on" ]]; then
     gene_ocl_program
   else
@@ -13,19 +13,9 @@
 __constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void DepthwiseConv2d_NC4HW4(
-__global FLT4* src_data,
-__global FLT4* filters,
-__global FLT4* biases,
-float relu_clip1,
-__global FLT4* dst_data,
-int2 kernel_size,
-int2 stride,
-int2 padding,
-int2 dilation,
-int4 src_size,
-int4 dst_size
-) {
+__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
+                                     float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                     int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -42,31 +32,21 @@ __global FLT4* dst_data,
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
         FLT4 f = filters[fx_c];
-        FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
+        FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
         r += TO_ACCUM_TYPE(src_final * f);
-      };
+      }
       fx_c++;
     }
   }
   FLT4 bias_val = biases[Z];
   FLT4 res0 = TO_FLT4(r) + bias_val;
   res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
+  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
 }
 
-__kernel void DepthwiseConv2d_NHWC4(
-__global FLT4* src_data,
-__global FLT4* filters,
-__global FLT4* biases,
-float relu_clip1,
-__global FLT4* dst_data,
-int2 kernel_size,
-int2 stride,
-int2 padding,
-int2 dilation,
-int4 src_size,
-int4 dst_size
-) {
+__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
+                                    float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                    int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -83,9 +63,9 @@ __global FLT4* dst_data,
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
         FLT4 f = filters[fx_c];
-        FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
         r += TO_ACCUM_TYPE(src_final * f);
-      };
+      }
       fx_c++;
     }
   }
@@ -93,4 +73,4 @@ __global FLT4* dst_data,
   FLT4 res0 = TO_FLT4(r) + bias_val;
   res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
   dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
 }
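Note that the reformatted signatures above keep the original argument order, so host code that binds kernel arguments by index is unaffected. Below is a minimal host-side sketch of that binding for DepthwiseConv2d_NHWC4, using only the standard OpenCL C API; the helper name BindDepthwiseArgs and the buffer/shape variables are hypothetical placeholders for illustration, not part of this commit.

#include <CL/cl.h>

/* Hedged sketch: binds the arguments of DepthwiseConv2d_NHWC4 by position.
 * src_buf, filter_buf, bias_buf, dst_buf and the shape values are assumed
 * to have been created by the caller. */
static cl_int BindDepthwiseArgs(cl_kernel kernel, cl_mem src_buf, cl_mem filter_buf, cl_mem bias_buf,
                                cl_float relu_clip1, cl_mem dst_buf, cl_int2 kernel_size, cl_int2 stride,
                                cl_int2 padding, cl_int2 dilation, cl_int4 src_size, cl_int4 dst_size) {
  cl_int err = CL_SUCCESS;
  cl_uint i = 0;
  err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &src_buf);      /* __global FLT4 *src_data */
  err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &filter_buf);   /* __global FLT4 *filters  */
  err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &bias_buf);     /* __global FLT4 *biases   */
  err |= clSetKernelArg(kernel, i++, sizeof(cl_float), &relu_clip1); /* float relu_clip1        */
  err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &dst_buf);      /* __global FLT4 *dst_data */
  err |= clSetKernelArg(kernel, i++, sizeof(cl_int2), &kernel_size);
  err |= clSetKernelArg(kernel, i++, sizeof(cl_int2), &stride);
  err |= clSetKernelArg(kernel, i++, sizeof(cl_int2), &padding);
  err |= clSetKernelArg(kernel, i++, sizeof(cl_int2), &dilation);
  err |= clSetKernelArg(kernel, i++, sizeof(cl_int4), &src_size);
  err |= clSetKernelArg(kernel, i++, sizeof(cl_int4), &dst_size);
  return err;
}

The same index order applies to DepthwiseConv2d_NC4HW4, since both kernels share the same parameter list.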
@@ -1,9 +1,8 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define FLT4 half4
 #define FLT16 half16
-__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight,
-                     __global FLT4 *buffer, __global FLT4 *bias, int2 offset_ci,
-                     int2 offset_co, int has_bias) {
+__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, __global FLT4 *buffer, __global FLT4 *bias,
+                     int2 offset_ci, int2 offset_co, int has_bias) {
   int2 gid = (int2)(get_global_id(0), get_global_id(1));
   int2 lid = (int2)(get_local_id(0), get_local_id(1));
   FLT4 s = (FLT4)(0.0f);
@@ -29,4 +28,4 @@ __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight,
     buffer[gid.x] = s;
     // memory pollution? or protected by opencl
   }
 }
@@ -31,7 +31,6 @@ __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, cons
 
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-
 __kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
                                   const int4 output_shape, const int2 stride, const int2 kernel_size,
                                   const int2 padding) {
@@ -63,4 +62,4 @@ __kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only imag
   }
   float4 result = convert_float4(r / window_size);
   write_imagef(output, (int2)(X, Y * output_shape.w + Z), result);
 }
@@ -51,4 +51,4 @@ __kernel void Concat3input(__global float *input0, __global float *input1, __glo
       output[index_output] = input2[input_idx];
     }
   }
 }
@@ -1,191 +1,150 @@
 #define CI_TILE 4
 #define CO_TILE 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-//#define __global
-//#pragma OPENCL EXTENSION cl_arm_printf : enable
-__kernel void convolution_NHWC_OHWI(__global float *input,
-                                    __global float *weight,
-                                    __global float *bias,
+// #define __global
+// #pragma OPENCL EXTENSION cl_arm_printf : enable
+__kernel void convolution_NHWC_OHWI(__global float *input, __global float *weight, __global float *bias,
                                     __global float *output,
                                     const int4 input_shape,    // NHWC
                                     const int4 output_shape,   // NHWC
                                     const int4 kernel_stride,  // kernelHW_strideHW
-                                    const int4 pad)  // top bottom left right
-{
+                                    const int4 pad) {
   int ow = get_global_id(0);
   int oh = get_global_id(1);
   int co_slice = get_global_id(2);
 
   int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
   int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
   int KH = kernel_stride.x, KW = kernel_stride.y;
   int strideH = kernel_stride.z, strideW = kernel_stride.w;
   int padTop = pad.x, padLeft = pad.z;
   int CI_SLICES = UP_DIV(CI, CI_TILE);
   int CO_SLICES = UP_DIV(CO, CO_TILE);
 
-  if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)
-    return;
+  if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) return;
 
   float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
-  for (int kh = 0; kh < KH; ++kh)
-  {
+  for (int kh = 0; kh < KH; ++kh) {
     int ih = kh + oh * strideH - padTop;
-    for (int kw = 0; kw < KW; ++kw)
-    {
+    for (int kw = 0; kw < KW; ++kw) {
       int iw = kw + ow * strideW - padLeft;
-      for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice)
-      {
-        for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
-        {
+      for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) {
+        for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) {
           int ci = ci_slice * CI_TILE + ci_inner;
-          if (ci >= CI)
-            break;
+          if (ci >= CI) break;
 
           int input_idx = ih * IW * CI + iw * CI + ci;
           float value = 0;
           if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
             value = 0;
           else
             value = input[input_idx];
 
           int CO_OFFSET = KH * KW * CI;
-          int weight_idx = (co_slice * CO_TILE) * CO_OFFSET +
-                           kh * KW * CI +
-                           kw * CI +
-                           ci;
+          int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + kh * KW * CI + kw * CI + ci;
           acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
           acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
           acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
           acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
         }
       }
     }
   }
   int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE);
-  if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0)
-  {
+  if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) {
     output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
     output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
     output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
     output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3];
-  }
-  else if (CO % CO_TILE == 1)
-  {
+  } else if (CO % CO_TILE == 1) {
     output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
-  }
-  else if (CO % CO_TILE == 2)
-  {
+  } else if (CO % CO_TILE == 2) {
     output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
     output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
-  }
-  else if (CO % CO_TILE == 3)
-  {
+  } else if (CO % CO_TILE == 3) {
     output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
     output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
     output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
   }
 }
 
-//#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-//#define FLT4 half4
+// #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+// #define FLT4 half4
 #define FLT4 float4
-__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input,
-                                              __global FLT4 *weight,
-                                              __global FLT4 *bias,
+__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, __global FLT4 *weight, __global FLT4 *bias,
                                               __global FLT4 *output,
                                               const int4 input_shape,    // NHWC
                                               const int4 output_shape,   // NHWC
                                               const int4 kernel_stride,  // kernelHW_strideHW
-                                              const int4 pad)  // top bottom left right
-{
+                                              const int4 pad) {
   int oh = get_global_id(0);        // [0, OH)
   int ow = get_global_id(1);        // [0, OW)
   int co_slice = get_global_id(2);  // [0, UP_DIV(CO, CO_TILE) )
 
   int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
   int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
   int CI_SLICES = UP_DIV(CI, CI_TILE);
   int CO_SLICES = UP_DIV(CO, CO_TILE);
   int KH = kernel_stride.x, KW = kernel_stride.y;
   int strideH = kernel_stride.z, strideW = kernel_stride.w;
   int padTop = pad.x, padLeft = pad.z;
 
-  if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES)
-    return;
-  if (2 * co_slice + 1 >= CO_SLICES)
-  {
+  if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES) return;
+  if (2 * co_slice + 1 >= CO_SLICES) {
     FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
     __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
-    for (int kh = 0; kh < KH; ++kh)
-    {
+    for (int kh = 0; kh < KH; ++kh) {
       int ih = kh + oh * strideH - padTop;
-      for (int kw = 0; kw < KW; ++kw)
-      {
+      for (int kw = 0; kw < KW; ++kw) {
         int iw = kw + ow * strideW - padLeft;
-        if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
-        {
-          for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
-          {
+        if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
+          for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
            FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice];
            out0_c4 += w0_ic1_oc4[0] * in_c4.x;
            out0_c4 += w0_ic1_oc4[1] * in_c4.y;
            out0_c4 += w0_ic1_oc4[2] * in_c4.z;
            out0_c4 += w0_ic1_oc4[3] * in_c4.w;
            w0_ic1_oc4 += 4;
          }
-        }
-        else
-        {
+        } else {
          w0_ic1_oc4 += 4 * CI_SLICES;
        }
      }
    }
    output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
-  }
-  else
-  {
+  } else {
    FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
    FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
    __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
    __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE;
-    for (int kh = 0; kh < KH; ++kh)
-    {
+    for (int kh = 0; kh < KH; ++kh) {
      int ih = kh + oh * strideH - padTop;
-      for (int kw = 0; kw < KW; ++kw)
-      {
+      for (int kw = 0; kw < KW; ++kw) {
        int iw = kw + ow * strideW - padLeft;
-        if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
-        {
+        if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
          int idx = ih * IW * CI_SLICES + iw * CI_SLICES;
-          for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
-          {
+          for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
            FLT4 in_c4 = input[idx + ci_slice];
 
            out0_c4 += w0_ic1_oc4[0] * in_c4.x;
            out0_c4 += w0_ic1_oc4[1] * in_c4.y;
            out0_c4 += w0_ic1_oc4[2] * in_c4.z;
            out0_c4 += w0_ic1_oc4[3] * in_c4.w;
            w0_ic1_oc4 += 4;
 
            out1_c4 += w1_ic1_oc4[0] * in_c4.x;
            out1_c4 += w1_ic1_oc4[1] * in_c4.y;
            out1_c4 += w1_ic1_oc4[2] * in_c4.z;
            out1_c4 += w1_ic1_oc4[3] * in_c4.w;
            w1_ic1_oc4 += 4;
          }
-        }
-        else
-        {
+        } else {
          w0_ic1_oc4 += 4 * CI_SLICES;
          w1_ic1_oc4 += 4 * CI_SLICES;
        }
      }
    }
    output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
    output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1];
  }
 }
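Both convolution kernels derive their channel-slice count with UP_DIV (ceiling division by the tile width of 4) and map get_global_id(0..2) to (ow, oh, co_slice). A hedged host-side sketch of the matching launch geometry for convolution_NHWC_OHWI follows; UpDiv and EnqueueConvNHWC are illustrative names, and the queue/kernel objects are assumed to exist already.

#include <CL/cl.h>

/* Hedged sketch: computes the NDRange for convolution_NHWC_OHWI from the
 * output shape, mirroring the kernel-side UP_DIV macro with CO_TILE = 4. */
static size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }

static cl_int EnqueueConvNHWC(cl_command_queue queue, cl_kernel kernel, size_t OW, size_t OH, size_t CO) {
  /* get_global_id(0) = ow, get_global_id(1) = oh, get_global_id(2) = co_slice */
  size_t global[3] = {OW, OH, UpDiv(CO, 4)};
  return clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
}

For example, CO = 7 gives UpDiv(7, 4) = 2 slices; the last slice then takes the CO % CO_TILE == 3 branch in the kernel above and writes only three of its four accumulated channels.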
@@ -8,18 +8,9 @@
 #define TO_FLT4 convert_float4
 #endif
 __constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void DepthwiseConv2d_IMG_NC4HW4(
-__read_only image2d_t src_data,
-__global FLT4* filter,
-__global FLT4* bias,
-float relu_clip1,
-__write_only image2d_t dst_data,
-int2 kernel_size,
-int2 stride,
-int2 padding,
-int2 dilation,
-int4 src_size,
-int4 dst_size) {
+__kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
+                                         float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                         int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -36,32 +27,23 @@ __write_only image2d_t dst_data,
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
         FLT4 f = filter[fx_c];
-        //FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
-        FLT4 src_final =read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c)));
+        // FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
+        FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c)));
         r += TO_FLT4(src_final * f);
-      };
+      }
       fx_c++;
     }
   }
   FLT4 bias_val = bias[Z];
   FLT4 res0 = TO_FLT4(r) + bias_val;
   res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  //dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
+  // dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
   write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0);
 }
 
-__kernel void DepthwiseConv2d_IMG_NHWC4(
-__read_only image2d_t src_data,
-__global FLT4* filter,
-__global FLT4* bias,
-float relu_clip1,
-__write_only image2d_t dst_data,
-int2 kernel_size,
-int2 stride,
-int2 padding,
-int2 dilation,
-int4 src_size,
-int4 dst_size) {
+__kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
+                                        float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                        int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -78,32 +60,23 @@ __write_only image2d_t dst_data,
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
         FLT4 f = filter[fx_c];
-        //FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
-        FLT4 src_final =read_imagef(src_data, sampler_zero, (int2)(Z+x_c*src_size.z, y_c));
+        // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z + x_c * src_size.z, y_c));
         r += TO_FLT4(src_final * f);
-      };
+      }
       fx_c++;
     }
   }
   FLT4 bias_val = bias[Z];
   FLT4 res0 = TO_FLT4(r) + bias_val;
   res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  //dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
-  write_imagef(dst_data, (int2)(X*dst_size.z+Z, Y), res0);
+  // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
+  write_imagef(dst_data, (int2)(X * dst_size.z + Z, Y), res0);
 }
 
-__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(
-__read_only image2d_t src_data,
-__global FLT4* filter,
-__global FLT4* bias,
-float relu_clip1,
-__write_only image2d_t dst_data,
-int2 kernel_size,
-int2 stride,
-int2 padding,
-int2 dilation,
-int4 src_size,
-int4 dst_size) {
+__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
+                                            float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
+                                            int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -120,30 +93,21 @@ __write_only image2d_t dst_data,
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
         FLT4 f = filter[fx_c];
-        //FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
         FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
         r += TO_FLT4(src_final * f);
-      };
+      }
     }
   }
   FLT4 bias_val = bias[Z];
   FLT4 res0 = TO_FLT4(r) + bias_val;
   res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  //dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
+  // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
   write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0);
 }
-__kernel void DepthwiseConv2d_BUF_NC4HW4(
-__global FLT4* src_data,
-__global FLT4* filter,
-__global FLT4* bias,
-float relu_clip1,
-__global FLT4* dst_data,
-int2 kernel_size,
-int2 stride,
-int2 padding,
-int2 dilation,
-int4 src_size,
-int4 dst_size) {
+__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
+                                         float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                         int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -160,30 +124,21 @@ __global FLT4* dst_data,
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
         FLT4 f = filter[fx_c];
-        FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
+        FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
         r += TO_FLT4(src_final * f);
-      };
+      }
       fx_c++;
     }
   }
   FLT4 bias_val = bias[Z];
   FLT4 res0 = TO_FLT4(r) + bias_val;
   res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
-  dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
+  dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
 }
 
-__kernel void DepthwiseConv2d_BUF_NHWC4(
-__global FLT4* src_data,
-__global FLT4* filter,
-__global FLT4* bias,
-float relu_clip1,
-__global FLT4* dst_data,
-int2 kernel_size,
-int2 stride,
-int2 padding,
-int2 dilation,
-int4 src_size,
-int4 dst_size) {
+__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
+                                        float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                        int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -200,9 +155,9 @@ __global FLT4* dst_data,
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
         FLT4 f = filter[fx_c];
-        FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
         r += TO_FLT4(src_final * f);
-      };
+      }
       fx_c++;
     }
   }
@@ -212,18 +167,9 @@ __global FLT4* dst_data,
   dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
 }
 
-__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(
-__global FLT4* src_data,
-__global FLT4* filter,
-__global FLT4* bias,
-float relu_clip1,
-__global FLT4* dst_data,
-int2 kernel_size,
-int2 stride,
-int2 padding,
-int2 dilation,
-int4 src_size,
-int4 dst_size) {
+__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
+                                            float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
+                                            int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   int Z = get_global_id(2);
@@ -240,13 +186,13 @@ __global FLT4* dst_data,
       bool outside_x = x_c < 0 || x_c >= src_size.x;
       if (!outside_x && !outside_y) {
         FLT4 f = filter[fx_c];
-        FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
+        FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
         r += TO_FLT4(src_final * f);
-      };
+      }
     }
   }
   FLT4 bias_val = bias[Z];
   FLT4 res0 = TO_FLT4(r) + bias_val;
   res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
   dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
 }
@@ -1,8 +1,7 @@
 #define FLT4 float4
 #define FLT16 float16
-__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight,
-                     __global FLT4 *buffer, __global FLT4 *bias, int2 offset_ci,
-                     int2 offset_co, int has_bias) {
+__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, __global FLT4 *buffer, __global FLT4 *bias,
+                     int2 offset_ci, int2 offset_co, int has_bias) {
   int2 gid = (int2)(get_global_id(0), get_global_id(1));
   int2 lid = (int2)(get_local_id(0), get_local_id(1));
   FLT4 s = (FLT4)(0.0f);
@@ -28,4 +27,4 @@ __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight,
     buffer[gid.x] = s;
     // memory pollution? or protected by opencl
   }
 }
@@ -1,16 +1,13 @@
 #define SLICES 4
 
-int DivideRoundUp(int n, int div)
-{
+int DivideRoundUp(int n, int div) {
   int q = n / div;
   return n % div == 0 ? q : q + 1;
 }
 
-__kernel void SoftMax(__global float4 *input,
-                      __global float4 *output,
-                      const int4 input_shape) {
+__kernel void SoftMax(__global float4 *input, __global float4 *output, const int4 input_shape) {
   int X = get_global_id(0);  // width
   int Y = get_global_id(1);  // height
   int H = input_shape.y;
   int W = input_shape.z;
   int C = input_shape.w;
@@ -32,4 +29,4 @@ __kernel void SoftMax(__global float4 *input,
     float4 result = convert_float4(t);
     output[(Y * W + X * H) * C + d] = result;
   }
 }