reconstruct code for GPU ops

This commit is contained in:
Pengyongrong 2020-11-10 01:01:39 -08:00
parent ae62d15bc0
commit d210921115
17 changed files with 401 additions and 616 deletions

View File

@ -17,22 +17,6 @@ __kernel void ArithmeticSelf_ElementAbs_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise absolute value on an NC4HW4-laid-out image: out[i] = |in[i]|.
// output_shape: presumably (N, H, W, ceil(C/4)) — confirm against host code.
// Global size covers (N*H, W, C/4) and may be rounded up, hence the guard.
__kernel void ArithmeticSelf_ElementAbs_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(Y, Z * output_shape.y + X);
  FLT4 result = READ_IMAGE(input0, smp_none, pos);
  // fabs() operates component-wise on vector types; unlike the former
  // per-component ternary it also maps -0.0 to +0.0 and preserves NaN.
  result = fabs(result);
  WRITE_IMAGE(output, pos, result);
}
__kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -49,22 +33,6 @@ __kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise cosine on an NC4HW4-laid-out image: out[i] = cos(in[i]).
__kernel void ArithmeticSelf_ElementCos_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // cos() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, cos(v));
}
__kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -81,22 +49,6 @@ __kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise sine on an NC4HW4-laid-out image: out[i] = sin(in[i]).
__kernel void ArithmeticSelf_ElementSin_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // sin() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, sin(v));
}
__kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -113,22 +65,6 @@ __kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise negation on an NC4HW4-laid-out image: out[i] = -in[i].
__kernel void ArithmeticSelf_ElementNeg_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // Unary minus negates every component of the vector at once.
  WRITE_IMAGE(output, pos, -v);
}
__kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -145,22 +81,6 @@ __kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise natural exponential on an NC4HW4-laid-out image: out[i] = e^in[i].
__kernel void ArithmeticSelf_ElementExp_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // exp() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, exp(v));
}
__kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -177,22 +97,6 @@ __kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __wr
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise natural logarithm on an NC4HW4-laid-out image.
// Inputs that are not strictly positive produce HUGE_VALF (+infinity)
// instead of log()'s -inf (for 0) or NaN (for negatives).
// output_shape: presumably (N, H, W, ceil(C/4)) — confirm against host code.
__kernel void ArithmeticSelf_ElementLog_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                               int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
  // Per-component guard: only strictly positive values go through log().
  result.x = result.x > 0 ? log(result.x) : HUGE_VALF;
  result.y = result.y > 0 ? log(result.y) : HUGE_VALF;
  result.z = result.z > 0 ? log(result.z) : HUGE_VALF;
  result.w = result.w > 0 ? log(result.w) : HUGE_VALF;
  WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
}
__kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -209,22 +113,6 @@ __kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, _
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise square on an NC4HW4-laid-out image: out[i] = in[i] * in[i].
__kernel void ArithmeticSelf_ElementSquare_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                  int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // Vector multiply squares every component at once.
  WRITE_IMAGE(output, pos, v * v);
}
__kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -241,22 +129,6 @@ __kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __w
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise square root on an NC4HW4-laid-out image.
// Inputs that are not strictly positive produce HUGE_VALF (+infinity).
// NOTE(review): the guard is `> 0`, so an input of exactly 0 yields
// HUGE_VALF rather than sqrt(0) == 0 — confirm this is intentional.
__kernel void ArithmeticSelf_ElementSqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
  // Per-component guard: only strictly positive values go through sqrt().
  result.x = result.x > 0 ? sqrt(result.x) : HUGE_VALF;
  result.y = result.y > 0 ? sqrt(result.y) : HUGE_VALF;
  result.z = result.z > 0 ? sqrt(result.z) : HUGE_VALF;
  result.w = result.w > 0 ? sqrt(result.w) : HUGE_VALF;
  WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
}
__kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -273,22 +145,6 @@ __kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise reciprocal square root (1/sqrt(x)) on an NC4HW4-laid-out image.
// Inputs that are not strictly positive produce HUGE_VALF (+infinity); for an
// input of 0 this agrees with the mathematical limit 1/sqrt(0) = +inf.
__kernel void ArithmeticSelf_ElementRsqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                 int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
  // Per-component guard, then explicit 1.0f / sqrt(x).
  // NOTE(review): OpenCL's rsqrt() builtin could replace the division but may
  // differ slightly in precision — left as-is to preserve exact results.
  result.x = result.x > 0 ? 1.0f / sqrt(result.x) : HUGE_VALF;
  result.y = result.y > 0 ? 1.0f / sqrt(result.y) : HUGE_VALF;
  result.z = result.z > 0 ? 1.0f / sqrt(result.z) : HUGE_VALF;
  result.w = result.w > 0 ? 1.0f / sqrt(result.w) : HUGE_VALF;
  WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
}
__kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -305,22 +161,6 @@ __kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise logical NOT on an NC4HW4-laid-out image, expressed in floats:
// a non-zero component maps to 0.0, a zero component maps to 1.0.
// Note: NaN compares false on both `> 0` and `< 0`, so NaN also maps to 1.0.
__kernel void ArithmeticSelf_ElementLogicalNot_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                      int4 output_shape) {
  int X = get_global_id(0);  // N*H
  int Y = get_global_id(1);  // W
  int Z = get_global_id(2);  // c/4
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
    return;
  }
  FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)));
  // `false`/`true` convert to 0/1 when assigned to a float component.
  result.x = result.x > 0 || result.x < 0 ? false : true;
  result.y = result.y > 0 || result.y < 0 ? false : true;
  result.z = result.z > 0 || result.z < 0 ? false : true;
  result.w = result.w > 0 || result.w < 0 ? false : true;
  WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result);
}
__kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -337,22 +177,6 @@ __kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise floor on an NC4HW4-laid-out image: out[i] = floor(in[i]).
__kernel void ArithmeticSelf_ElementFloor_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                 int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // floor() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, floor(v));
}
__kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -369,22 +193,6 @@ __kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __w
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise ceiling on an NC4HW4-laid-out image: out[i] = ceil(in[i]).
__kernel void ArithmeticSelf_ElementCeil_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // ceil() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, ceil(v));
}
__kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __write_only image2d_t output,
int4 output_shape) {
int X = get_global_id(0); // N*H
@ -401,18 +209,3 @@ __kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __
WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}
// Element-wise rounding on an NC4HW4-laid-out image: out[i] = round(in[i])
// (rounds to nearest integer, halfway cases away from zero).
__kernel void ArithmeticSelf_ElementRound_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output,
                                                 int4 output_shape) {
  int nh = get_global_id(0);  // combined N*H index
  int w = get_global_id(1);   // W index
  int c4 = get_global_id(2);  // channel-group (C/4) index
  // Skip work-items outside the tensor bounds (global size may be rounded up).
  if (nh >= output_shape.x * output_shape.y || w >= output_shape.z || c4 >= output_shape.w) {
    return;
  }
  int2 pos = (int2)(w, c4 * output_shape.y + nh);
  FLT4 v = READ_IMAGE(input0, smp_none, pos);
  // round() applies component-wise to vector types.
  WRITE_IMAGE(output, pos, round(v));
}

View File

@ -1,131 +1,48 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#define C4NUM 4
__kernel void SparseToDenseScalarDim0(__read_only image2d_t input, __write_only image2d_t output, float weight,
int2 input_shape, float default_value) {
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, 0));
FLT4 result = {default_value, default_value, default_value, default_value};
int integer = index_input.x / C4NUM;
int decimal = (int)(index_input.x) % C4NUM;
if (decimal == 0) {
result.x = weight;
} else if (decimal == 1) {
result.y = weight;
} else if (decimal == 2) {
result.z = weight;
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void SparseToDenseScalar(__read_only image2d_t input, __global float *output, float weight, int2 inputshape,
int4 outputshape, float default_value, int stride_w, int inshapeindex1_dim) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= inputshape.x || Y >= inputshape.y) {
return;
}
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X));
int index = 0;
if (inshapeindex1_dim == 1) {
index = ((int)index_input.x) * stride_w;
} else if (inshapeindex1_dim == 2) {
index = ((int)index_input.x) * stride_w + ((int)index_input.y);
} else if (inshapeindex1_dim == 3) {
index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + ((int)index_input.z);
} else {
result.w = weight;
index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w +
((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w;
}
WRITE_IMAGE(output, (int2)(0, integer), result);
return;
output[index] = weight;
}
__kernel void SparseToDenseScalarDim1(__read_only image2d_t input, __write_only image2d_t output, float weight,
int2 input_shape, float default_value) {
for (int i = 0; i < input_shape.x; ++i) {
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
int Y = result.x;
result.x = weight;
WRITE_IMAGE(output, (int2)(0, Y), result);
__kernel void SparseToDenseVector(__read_only image2d_t input, __global float *output, __global float *weight_vector,
int2 inputshape, int4 outputshape, float default_value, int stride_w,
int inshapeindex1_dim) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= inputshape.x || Y >= inputshape.y) {
return;
}
}
__kernel void SparseToDenseVectorDim1(__read_only image2d_t input, __write_only image2d_t output,
__global float *weight, int2 input_shape, float default_value) {
int index_weight = 0;
for (int i = 0; i < input_shape.x; ++i) {
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i));
int Y = result.x;
result.x = weight[index_weight++];
WRITE_IMAGE(output, (int2)(0, Y), result);
}
}
__kernel void SparseToDenseScalarDim2Shape2(__read_only image2d_t input, __write_only image2d_t output, float weight,
int2 input_shape, float default_value) {
FLT temp[8] = {default_value, default_value, default_value, default_value,
default_value, default_value, default_value, default_value};
FLT result_temp[8] = {default_value, default_value, default_value, default_value,
default_value, default_value, default_value, default_value};
int index = 0; // 0~4
int X = 0;
FLT4 index_begin = READ_IMAGE(input, smp_zero, (int2)(0, 0));
int Y = (int)index_begin.x; // N
temp[index] = index_begin.y; // c/4
for (int i = 1; i < input_shape.x && index < C4NUM; ++i) {
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, i));
if ((((int)temp[index]) / C4NUM == ((int)index_input.y) / C4NUM) && (Y == (int)index_input.x)) {
index++;
if (index < C4NUM) {
temp[index] = index_input.y;
}
} else {
for (int j = 0; j <= index && index < C4NUM; ++j) {
int decimal = (int)temp[j] % C4NUM;
result_temp[decimal] = weight;
X = ((int)temp[0]) / C4NUM;
}
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
WRITE_IMAGE(output, (int2)(X, Y), result);
index = 0;
Y = (int)index_input.x;
temp[0] = index_input.y;
temp[1] = temp[2] = temp[3] = default_value;
result_temp[0] = result_temp[1] = result_temp[2] = result_temp[3] = default_value;
}
}
// judge the last element for input
X = ((int)temp[0]) / C4NUM;
for (int i = 0; i <= index && index < C4NUM; ++i) {
int decimal = (int)temp[i] % C4NUM;
result_temp[decimal] = weight;
}
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
WRITE_IMAGE(output, (int2)(X, Y), result);
}
__kernel void SparseToDenseVectorDim2Shape2(__read_only image2d_t input, __write_only image2d_t output,
__global float *weight, int2 input_shape, float default_value) {
FLT temp[8] = {default_value, default_value, default_value, default_value,
default_value, default_value, default_value, default_value};
FLT result_temp[8] = {default_value, default_value, default_value, default_value,
default_value, default_value, default_value, default_value};
int index = 0; // 0~4
int weight_index = 0;
int X = 0;
FLT4 index_begin = READ_IMAGE(input, smp_zero, (int2)(0, 0));
int Y = (int)index_begin.x; // N
temp[index] = index_begin.y; // c/4
for (int i = 1; i < input_shape.x && index < C4NUM; ++i) {
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, i));
if ((((int)temp[index]) / C4NUM == ((int)index_input.y) / C4NUM) && (Y == (int)index_input.x)) {
index++;
if (index < C4NUM) {
temp[index] = index_input.y;
}
} else {
for (int j = 0; j <= index && index < C4NUM; ++j) {
int decimal = (int)temp[j] % C4NUM;
result_temp[decimal] = weight[weight_index++];
X = ((int)temp[0]) / C4NUM;
}
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
WRITE_IMAGE(output, (int2)(X, Y), result);
index = 0;
Y = (int)index_input.x;
temp[0] = index_input.y;
temp[1] = temp[2] = temp[3] = default_value;
result_temp[0] = result_temp[1] = result_temp[2] = result_temp[3] = default_value;
}
}
// judge the last element for input
X = ((int)temp[0]) / C4NUM;
for (int i = 0; i <= index && index < C4NUM; ++i) {
int decimal = (int)temp[i] % C4NUM;
result_temp[decimal] = weight[weight_index++];
}
FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]};
WRITE_IMAGE(output, (int2)(X, Y), result);
FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X));
int index = 0;
if (inshapeindex1_dim == 1) {
index = ((int)index_input.x) * stride_w;
} else if (inshapeindex1_dim == 2) {
index = ((int)index_input.x) * stride_w + (int)index_input.y;
} else if (inshapeindex1_dim == 3) {
index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + (int)index_input.z;
} else {
index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w +
((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w;
}
output[index] = weight_vector[X];
}

View File

@ -87,24 +87,17 @@ void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, Arithme
}
}
int ArithmeticSelfOpenCLKernel::Init() {
int ArithmeticSelfOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) {
MS_LOG(ERROR) << " only support dim = 4 or 2 but your dim = " << in_tensors_[0]->shape().size();
return RET_ERROR;
}
auto param = reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_);
return RET_OK;
}
std::string kernel_name = "ArithmeticSelf";
GetKernelName(&kernel_name, param);
kernel_name += "_NHWC4";
MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
std::set<std::string> build_options;
std::string source = arithmeticself_source;
std::string program_name = "ArithmeticSelf";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
return mindspore::lite::RET_OK;
void ArithmeticSelfOpenCLKernel::SetConstArgs() {
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
}
void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
@ -121,11 +114,8 @@ void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<s
local->push_back(z);
}
int ArithmeticSelfOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
void ArithmeticSelfOpenCLKernel::SetGlobalLocal() {
auto output_shape = out_tensors_[0]->shape();
cl_int4 output_shape_ = {};
uint32_t OH = 1, OW = 1, OC = 1;
if (output_shape.size() == 4) {
output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
@ -142,49 +132,48 @@ int ArithmeticSelfOpenCLKernel::Run() {
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
ArithmeticSelfGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
}
int ArithmeticSelfOpenCLKernel::Prepare() {
auto param = reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_);
std::string kernel_name = "ArithmeticSelf";
GetKernelName(&kernel_name, param);
kernel_name += "_NHWC4";
MS_LOG(DEBUG) << "execute kernel name : " << kernel_name;
std::set<std::string> build_options;
std::string source = arithmeticself_source;
std::string program_name = "ArithmeticSelf";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
SetGlobalLocal();
SetConstArgs();
return RET_OK;
}
int ArithmeticSelfOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return mindspore::lite::RET_OK;
return RET_OK;
}
kernel::LiteKernel *OpenCLArithmeticSelfKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs,
OpParameter *opParameter, const lite::InnerContext *ctx,
const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) ArithmeticSelfOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << " new ArithmeticSelfOpenCLKernel failed ";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != mindspore::lite::RET_OK) {
MS_LOG(ERROR) << " Init kernel failed, name: ArithmeticSelf ";
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLArithmeticSelfKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>)
} // namespace mindspore::kernel

View File

@ -32,13 +32,17 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel {
~ArithmeticSelfOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;
private:
void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param);
cl_int4 output_shape_ = {};
cl::Kernel kernel_;
};

View File

@ -30,16 +30,7 @@ using mindspore::schema::PrimitiveType_BatchNorm;
namespace mindspore::kernel {
int BatchNormOpenCLKernel::Init() {
std::string kernel_name = "Batch_normalization_NHWC4";
std::set<std::string> build_options;
std::string source = batchnorm_source;
std::string program_name = "Batch_normalization";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
int BatchNormOpenCLKernel::CheckSpecs() { return RET_OK; }
void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
@ -55,13 +46,17 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t
local->push_back(z);
}
int BatchNormOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
void BatchNormOpenCLKernel::SetConstArgs() {
int arg_cn = 6;
auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
auto input0_shape = in_tensors_[0]->shape();
auto output_shape = out_tensors_[0]->shape();
cl_int4 input_shape_ = {input0_shape[0], input0_shape[1], input0_shape[2], UP_DIV(input0_shape[3], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
}
void BatchNormOpenCLKernel::SetGlobalLocal() {
auto output_shape = out_tensors_[0]->shape();
uint32_t OH = output_shape[1];
uint32_t OW = output_shape[2];
uint32_t OC = UP_DIV(output_shape[3], C4NUM);
@ -70,6 +65,25 @@ int BatchNormOpenCLKernel::Run() {
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
BatchNormGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
}
int BatchNormOpenCLKernel::Prepare() {
std::string kernel_name = "Batch_normalization_NHWC4";
std::set<std::string> build_options;
std::string source = batchnorm_source;
std::string program_name = "Batch_normalization";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetConstArgs();
SetGlobalLocal();
return RET_OK;
}
int BatchNormOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); // scale
@ -77,32 +91,11 @@ int BatchNormOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
kernel::LiteKernel *OpenCLBatchnormKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) BatchNormOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << " new BatchnormOpenCLKernel failed ";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << " Init kernel failed, name: Batchnorm ";
delete kernel;
return nullptr;
}
return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, OpenCLBatchnormKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, OpenCLBatchnormKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, OpenCLKernelCreator<BatchNormOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, OpenCLKernelCreator<BatchNormOpenCLKernel>)
} // namespace mindspore::kernel

View File

@ -31,9 +31,12 @@ class BatchNormOpenCLKernel : public OpenCLKernel {
~BatchNormOpenCLKernel() override = default;
int Init() override;
int Run() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
private:
cl::Kernel kernel_;

View File

@ -1,3 +1,4 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
@ -42,18 +43,13 @@ int CastOpenCLKernel::GetKernelName(std::string *kernel_name, CastParameter *par
return RET_OK;
}
int CastOpenCLKernel::Init() {
auto param = reinterpret_cast<CastParameter *>(this->op_parameter_);
std::string kernel_name = "Cast";
GetKernelName(&kernel_name, param);
kernel_name += "_NHWC4";
std::set<std::string> build_options;
std::string source = cast_source;
std::string program_name = "cast";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
int CastOpenCLKernel::CheckSpecs() { return RET_OK; }
void CastOpenCLKernel::SetConstArgs() {
auto input_shape = in_tensors_[0]->shape();
cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
int arg_cn = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
}
void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
@ -70,11 +66,8 @@ void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *lo
local->push_back(z);
}
int CastOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
void CastOpenCLKernel::SetGlobalLocal() {
auto input_shape = in_tensors_[0]->shape();
cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)};
uint32_t OH = input_shape[1];
uint32_t OW = input_shape[2];
uint32_t OC = UP_DIV(input_shape[3], C4NUM);
@ -83,34 +76,35 @@ int CastOpenCLKernel::Run() {
std::vector<size_t> local = {1, 1, 1}; // init local
std::vector<size_t> global = {OH, OW, OC};
CastGetWorkGroup(global, &local, max_global[0]);
OpenCLKernel::AlignGlobalLocal(global, local);
}
// Build the "Cast..._NHWC4" OpenCL kernel for this op's source/destination
// types, then cache the constant kernel args and the ND-range so Run() only
// has to bind the tensors and enqueue.
int CastOpenCLKernel::Prepare() {
  auto *param = reinterpret_cast<CastParameter *>(this->op_parameter_);
  std::string kernel_name = "Cast";
  GetKernelName(&kernel_name, param);  // appends the dtype-pair suffix
  kernel_name += "_NHWC4";
  std::string program_name = "cast";
  std::string source = cast_source;
  std::set<std::string> build_options;
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  SetConstArgs();
  SetGlobalLocal();
  return RET_OK;
}
int CastOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
// Factory for CastOpenCLKernel: allocate, Init(), and on any failure release
// the kernel (or opParameter if allocation itself failed) and return nullptr.
kernel::LiteKernel *OpenCLCastKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                            const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                            const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                            const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) CastOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new CastOpenCLKernel failed ";
    free(opParameter);  // opParameter ownership was never transferred
    return nullptr;
  }
  if (kernel->Init() != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: Cast ";
    delete kernel;  // kernel owns opParameter from here on
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLCastKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLCastKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLKernelCreator<CastOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLKernelCreator<CastOpenCLKernel>);
} // namespace mindspore::kernel

View File

@ -31,8 +31,11 @@ class CastOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~CastOpenCLKernel() override = default;
int Prepare() override;
int Init() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -49,7 +49,21 @@ int ConcatOpenCLKernel::RunAxis0() {
return RET_OK;
}
int ConcatOpenCLKernel::Init() {
// Split a budget of max_size work-items across (x, y, z) local dimensions:
// x is capped at 2, y at 8 (both chosen as divisors of the global size),
// and z takes the remaining budget, at most ceil(global[2] / 2).
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  constexpr int max_divider = 8;
  constexpr int max_x = 2;
  constexpr int max_y = 8;
  const int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
  const int yz_budget = max_size / x;
  const int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz_budget), max_y);
  const int z = std::min(yz_budget / y, static_cast<int>(UP_DIV(global[2], 2)));
  local->clear();
  local->push_back(static_cast<size_t>(x));
  local->push_back(static_cast<size_t>(y));
  local->push_back(static_cast<size_t>(z));
}
int ConcatOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->shape().size() != 4) {
MS_LOG(ERROR) << " only support dim = 4 ";
return RET_ERROR;
@ -65,6 +79,32 @@ int ConcatOpenCLKernel::Init() {
return RET_ERROR;
}
return RET_OK;
}
void ConcatOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
auto output_shape = out_tensors_[0]->shape();
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
int arg_cn = 2 * in_tensors_.size() + 1;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
}
// Configure the 3D ND-range over the output: dim 0 folds N and H together,
// dim 1 is W, dim 2 is the channel dimension.
void ConcatOpenCLKernel::SetGlobalLocal() {
  auto output_shape = out_tensors_[0]->shape();
  // Device work-item limits, used to bound the local work-group size.
  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
  std::vector<size_t> local = {1, 1, 1};
  uint32_t OH = output_shape[0] * output_shape[1];  // N * H
  uint32_t OW = output_shape[2];                    // W
  // NOTE(review): the pre-refactor Run() sized this dimension as
  // UP_DIV(output_shape[3], C4NUM) (slices of 4 channels); here the raw
  // channel count is used — confirm which size the OpenCL kernel expects.
  uint32_t OC = output_shape[3];
  std::vector<size_t> global = {OH, OW, OC};
  ConcatGetWorkGroup(global, &local, max_global[0]);
  OpenCLKernel::AlignGlobalLocal(global, local);
}
int ConcatOpenCLKernel::Prepare() {
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
std::string kernel_name = "Concat";
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(param->axis_);
@ -80,38 +120,17 @@ int ConcatOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
MS_LOG(DEBUG) << kernel_name << " Init Done!";
SetGlobalLocal();
SetConstArgs();
return RET_OK;
}
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 2, max_y = 8;
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
local->clear();
local->push_back(x);
local->push_back(y);
local->push_back(z);
}
int ConcatOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
if (param->axis_ == 0) {
return RunAxis0();
}
auto output_shape = out_tensors_[0]->shape();
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1};
uint32_t OH = output_shape_.s[0] * output_shape_.s[1];
uint32_t OW = output_shape_.s[2];
uint32_t OC = output_shape_.s[3];
std::vector<size_t> global = {OH, OW, OC};
ConcatGetWorkGroup(global, &local, max_global[0]);
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
int arg_cn = 0;
for (int i = 0; i < in_tensors_.size(); ++i) {
@ -123,35 +142,14 @@ int ConcatOpenCLKernel::Run() {
UP_DIV(in_tensors_[i]->shape()[3], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp);
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
} else {
MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
return RET_ERROR;
}
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
// Factory for ConcatOpenCLKernel: allocate, Init(), and on any failure
// release the kernel (or opParameter if allocation failed) and return nullptr.
kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                              const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) ConcatOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new ConcatOpenCLKernel failed ";
    free(opParameter);  // opParameter ownership was never transferred
    return nullptr;
  }
  if (kernel->Init() != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: Concat ";
    delete kernel;  // kernel owns opParameter from here on
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLConcatKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLConcatKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>);
} // namespace mindspore::kernel

View File

@ -31,8 +31,11 @@ class ConcatOpenCLKernel : public OpenCLKernel {
~ConcatOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;
private:

View File

@ -62,7 +62,11 @@ int FillOpenCLKernel::RunShape() {
return RET_OK;
}
int FillOpenCLKernel::Init() {
// Intentionally empty: Fill/Shape bind no constant kernel arguments.
void FillOpenCLKernel::SetConstArgs() {}
// Intentionally empty: no global/local ND-range is configured for Fill/Shape.
void FillOpenCLKernel::SetGlobalLocal() {}
int FillOpenCLKernel::CheckSpecs() {
auto param = this->op_parameter_;
if (out_tensors_[0]->shape().size() > 4) {
@ -76,6 +80,8 @@ int FillOpenCLKernel::Init() {
return RET_OK;
}
// Nothing to pre-build or pre-bind for Fill/Shape; always succeeds.
int FillOpenCLKernel::Prepare() { return RET_OK; }
int FillOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = this->op_parameter_;
@ -88,28 +94,9 @@ int FillOpenCLKernel::Run() {
return RET_OK;
}
// Factory for FillOpenCLKernel (serves both Fill and Shape primitives):
// allocate, Init(), and on any failure release the kernel (or opParameter
// if allocation itself failed) and return nullptr.
kernel::LiteKernel *FillOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                            const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                            const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                            const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) FillOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new FillOpenCLKernel failed ";
    free(opParameter);  // opParameter ownership was never transferred
    return nullptr;
  }
  if (kernel->Init() != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: fill ";
    delete kernel;  // kernel owns opParameter from here on
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Fill, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Shape, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Fill, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Shape, FillOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Fill, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Shape, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Fill, OpenCLKernelCreator<FillOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Shape, OpenCLKernelCreator<FillOpenCLKernel>);
} // namespace mindspore::kernel

View File

@ -32,7 +32,11 @@ class FillOpenCLKernel : public OpenCLKernel {
~FillOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int Run() override;

View File

@ -90,24 +90,64 @@ int SparseToDenseOpenCLKernel::InitWeights() {
return RET_OK;
}
int SparseToDenseOpenCLKernel::Init() {
int SparseToDenseOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->shape().size() > 4 || out_tensors_[0]->shape().size() > 4) {
MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size() << "outdim"
<< out_tensors_[0]->shape().size();
return RET_ERROR;
}
if (out_tensors_[0]->shape().size() > 2 || in_tensors_.size() < 3) {
MS_LOG(ERROR) << " only support dim <= 2 and in_tensors_.size >= 3";
return RET_ERROR;
}
if ((in_tensors_[0]->shape()[1] > 3) && (input_dim_ == 2)) {
MS_LOG(ERROR) << "in_tensors_indices shape[1] must be 1 2 or 3 && input_dim_=2 ,but your shapes is: "
<< in_tensors_[0]->shape()[1] << "your input_dim_ is: " << input_dim_;
if (input_dim_ == 2) {
if ((in_tensors_[0]->shape()[1] > 4)) {
MS_LOG(ERROR) << "in_tensors_indices shape[1] must be 1 2 or 3 && input_dim_=2 ,but your shapes is: "
<< in_tensors_[0]->shape()[1] << "your input_dim_ is: " << input_dim_;
return ERROR;
}
}
if (inshapeindex1_dim > 4) {
MS_LOG(ERROR) << "Unsupported input_indices[1] > 4: ";
return ERROR;
}
input_dim_ = in_tensors_[0]->shape().size();
weight_scalar_ = in_tensors_[2]->IsScalar();
std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "ScalarDim" : "VectorDim") +
std::to_string(in_tensors_[0]->shape()[1] == 1 ? 1 : input_dim_);
if (input_dim_ == 2 && in_tensors_[0]->shape()[1] != 1) {
kernel_name += "Shape" + std::to_string(in_tensors_[0]->shape()[1]);
auto param = reinterpret_cast<SparseToDenseParameter *>(op_parameter_);
if (param->validate_indices_) {
MS_LOG(ERROR) << "Unspported unordered for in_tensors_indices";
return RET_ERROR;
}
return RET_OK;
}
// Bind the constant kernel args for SparseToDense, starting at index 3
// (args 0..2 — indices, output buffer, weights — are bound in Run()):
// flattened 2D input shape, 4D output shape, default value, output row
// stride in elements, and the width of each index row.
// Fix: removed the unused local `out_shape_temp` (dead code).
void SparseToDenseOpenCLKernel::SetConstArgs() {
  // NOTE(review): runtime_wrapper is not referenced below (RowPitch()
  // acquires its own wrapper internally); kept in case the RAII instance is
  // load-bearing — confirm and remove.
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
  Image2DInfo img_info(out_tensors_[0]);
  size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float);
  // Row pitch in elements: bytes per aligned image row / bytes per element.
  stride_w = img_info.RowPitch() / dtype;
  cl_int2 input_shape = {n_ * h_, w_ * UP_DIV(c_, C4NUM)};
  cl_int4 out_shape = {out_n_, out_h_, out_w_, UP_DIV(out_c_, C4NUM)};
  int arg_cn = 3;
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w);
  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, inshapeindex1_dim);
}
void SparseToDenseOpenCLKernel::SetGlobalLocal() {
std::vector<size_t> local = {1, 1};
size_t OH = n_ * h_;
size_t OW = w_ * UP_DIV(c_, C4NUM);
std::vector<size_t> global = {OH, OW};
OpenCLKernel::AlignGlobalLocal(global, local);
}
int SparseToDenseOpenCLKernel::Prepare() {
input_dim_ = in_tensors_[0]->shape().size();
inshapeindex1_dim = in_tensors_[0]->shape()[1];
weight_scalar_ = in_tensors_[2]->IsScalar();
std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector");
std::set<std::string> build_options;
std::string source = sparse_to_dense_source;
std::string program_name = "SparseToDense";
@ -122,8 +162,10 @@ int SparseToDenseOpenCLKernel::Init() {
default_ = *reinterpret_cast<float *>(input_tensor3->data_c());
}
}
InitWeights();
InferShapeTo4D();
SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}
@ -131,73 +173,47 @@ int SparseToDenseOpenCLKernel::Init() {
int SparseToDenseOpenCLKernel::InferShapeTo4D() {
if (in_tensors_[0]->shape().size() <= 4) {
if (in_tensors_[0]->shape().size() == 1) {
N_ = in_tensors_[0]->shape()[0];
n_ = in_tensors_[0]->shape()[0];
} else if (in_tensors_[0]->shape().size() == 2) {
N_ = in_tensors_[0]->shape()[0];
C_ = in_tensors_[0]->shape()[1];
} else if (in_tensors_[0]->shape().size() == 3) {
N_ = in_tensors_[0]->shape()[0];
W_ = in_tensors_[0]->shape()[1];
C_ = in_tensors_[0]->shape()[2];
} else {
N_ = in_tensors_[0]->shape()[0];
H_ = in_tensors_[0]->shape()[1];
W_ = in_tensors_[0]->shape()[2];
C_ = in_tensors_[0]->shape()[3];
n_ = in_tensors_[0]->shape()[0];
c_ = in_tensors_[0]->shape()[1];
}
}
if (out_tensors_[0]->shape().size() <= 4) {
if (out_tensors_[0]->shape().size() == 1) {
out_n_ = out_tensors_[0]->shape()[0];
} else if (out_tensors_[0]->shape().size() == 2) {
out_n_ = out_tensors_[0]->shape()[0];
out_c_ = out_tensors_[0]->shape()[1];
} else if (out_tensors_[0]->shape().size() == 3) {
out_n_ = out_tensors_[0]->shape()[0];
out_w_ = out_tensors_[0]->shape()[1];
out_c_ = out_tensors_[0]->shape()[2];
} else {
out_n_ = out_tensors_[0]->shape()[0];
out_h_ = out_tensors_[0]->shape()[1];
out_w_ = out_tensors_[0]->shape()[2];
out_c_ = out_tensors_[0]->shape()[3];
}
} else {
MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size();
return RET_ERROR;
}
return RET_OK;
}
int SparseToDenseOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
InferShapeTo4D();
cl_int2 input_shape = {static_cast<cl_int>(N_ * H_), static_cast<cl_int>(W_ * UP_DIV(C_, C4NUM))};
InitOutputToDefault();
std::vector<size_t> local = {1, 1};
std::vector<size_t> global = {1, 1};
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
if (weight_scalar_) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF);
if (!weight_scalar_) {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_, lite::opencl::MemType::BUF);
} else {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_);
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}
// Factory for SparseToDenseOpenCLKernel: validate inputs, allocate, Init();
// free/delete on every failure path so opParameter never leaks.
// Fix: error messages referred to "Hswish"/"hswish" — a copy-paste from the
// Hswish creator — and now name the actual kernel.
kernel::LiteKernel *SparseToDenseOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                     const std::vector<lite::Tensor *> &outputs,
                                                     OpParameter *opParameter, const lite::InnerContext *ctx,
                                                     const kernel::KernelKey &desc,
                                                     const mindspore::lite::PrimitiveC *primitive) {
  if (inputs.empty()) {
    MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size();
    free(opParameter);
    return nullptr;
  }
  auto *kernel = new (std::nothrow) SparseToDenseOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new SparseToDenseOpenCLKernel failed ";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: SparseToDense ";
    delete kernel;  // kernel owns opParameter from here on
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SparseToDense, SparseToDenseOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SparseToDense, SparseToDenseOpenCLKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SparseToDense, OpenCLKernelCreator<SparseToDenseOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SparseToDense, OpenCLKernelCreator<SparseToDenseOpenCLKernel>);
} // namespace mindspore::kernel

View File

@ -31,9 +31,12 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
~SparseToDenseOpenCLKernel() override = default;
int Init() override;
int Prepare() override;
int Run() override;
int InitWeights() override;
void SetConstArgs() override;
void SetGlobalLocal() override;
int CheckSpecs() override;
private:
int InferShapeTo4D();
@ -47,12 +50,19 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel {
float weight_scalar_{0.f};
void *weight_vector_{nullptr};
int input_dim_{1};
int inshapeindex1_dim{1};
cl_int stride_w{1};
std::vector<int32_t> output_shape_;
size_t N_{1};
size_t H_{1};
size_t W_{1};
size_t C_{1};
cl_int n_{1};
cl_int h_{1};
cl_int w_{1};
cl_int c_{1};
cl_int out_n_{1};
cl_int out_h_{1};
cl_int out_w_{1};
cl_int out_c_{1};
};
} // namespace mindspore::kernel
#endif

View File

@ -112,7 +112,7 @@ struct Image2DInfo {
size_t RowPitch() const {
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
size_t row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size;
size_t row_pitch = UP_ROUND(width, alignment) * FLT4_size;
return row_pitch;
}

View File

@ -98,8 +98,8 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) {
}
arithmeticself_kernel->Init();
// to do allocate memory for inputs and outputs
for (auto &input_tensor : inputs) {
input_tensor->MallocData(allocator);
for (auto &input_tensor_ : inputs) {
input_tensor_->MallocData(allocator);
}
MS_LOG(INFO) << " initialize sub_graph ";
std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};
@ -186,8 +186,8 @@ TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) {
}
arithmeticself_kernel->Init();
// to do allocate memory for inputs and outputs
for (auto &input_tensor : inputs) {
input_tensor->MallocData(allocator);
for (auto &input_tensor_ : inputs) {
input_tensor_->MallocData(allocator);
}
MS_LOG(INFO) << " initialize sub_graph ";
std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};
@ -206,7 +206,6 @@ TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) {
}
sub_graph->Init();
MS_LOG(INFO) << " initialize input data ";
std::cout << sizeof(input_data1) / sizeof(float) << std::endl;
memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
std::cout << "==================output data================" << std::endl;
@ -280,8 +279,8 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfdim2Fp16) {
}
arithmeticself_kernel->Init();
// to do allocate memory for inputs and outputs
for (auto &input_tensor : inputs) {
input_tensor->MallocData(allocator);
for (auto &input_tensor_ : inputs) {
input_tensor_->MallocData(allocator);
}
MS_LOG(INFO) << " initialize sub_graph ";
std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel};

View File

@ -28,6 +28,78 @@ class TestSparseToDenseOpenCLCI : public mindspore::CommonTest {
TestSparseToDenseOpenCLCI() {}
};
// SparseToDense, fp32, 2D indices of width 3, vector values:
// scatter values 1..6 at indices (0,0,0)..(0,0,5) into a {6,1,10} output
// initialized to the default 0.0, and compare against the CPU reference.
TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Shape3Vector) {
  MS_LOG(INFO) << " begin test ";
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
  auto runtime = runtime_wrapper.GetInstance();
  runtime->Init();
  auto allocator = runtime->GetAllocator();

  MS_LOG(INFO) << " init tensors ";
  std::vector<int> input_shape1 = {6, 3};  // 6 index rows, 3 coordinates each
  std::vector<int> input_shape2 = {3};     // output-shape tensor
  std::vector<int> input_shape3 = {6};     // one value per index row
  std::vector<int> input_shape4 = {1};     // scalar default value
  // Fix: tensor {6, 3} holds 18 floats, but the original array had 21 values
  // (7 rows), so the memcpy below overflowed the tensor buffer by one row.
  float input_data1[] = {0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5};
  float input_data2[] = {6, 1, 10};
  float input_data3[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  float input_data4[] = {0.0};
  float correctOutput[] = {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  auto data_type = kNumberTypeFloat32;
  std::vector<int> output_shape = {6, 1, 10};
  auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR);
  auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR);
  auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_TENSOR);
  auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR);
  auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR);
  // allocate memory for weights
  in_tensor2.MallocData();
  in_tensor3.MallocData();
  in_tensor4.MallocData();
  std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4};
  std::vector<lite::Tensor *> outputs{&output_tensor};
  // initialize weights
  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));

  MS_LOG(INFO) << " initialize tensors ";
  auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter)));
  if (param == nullptr) {
    MS_LOG(INFO) << " new ActivationParameter failed ";
    return;
  }
  // Fix: zero-init the malloc'd parameter so fields such as
  // validate_indices_ are not read uninitialized by the kernel.
  memset(param, 0, sizeof(SparseToDenseParameter));
  auto *sparse_to_dense_kernel =
    new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  if (sparse_to_dense_kernel == nullptr) {
    MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed ";
    free(param);  // Fix: param came from malloc; delete was undefined behavior.
    return;
  }
  sparse_to_dense_kernel->Init();

  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel};
  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels);
  if (sub_graph == nullptr) {
    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
    free(param);  // Fix: param came from malloc; delete was undefined behavior.
    delete sparse_to_dense_kernel;
    return;
  }
  // to do allocate memory for inputs
  in_tensor1.MallocData(allocator);
  sub_graph->Init();
  MS_LOG(INFO) << " initialize input data ";
  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));

  std::cout << "==================output data================" << std::endl;
  sub_graph->Run();
  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c());
  ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001));
  delete sub_graph;
}
TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Scalar) {
MS_LOG(INFO) << " begin test ";
auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();