add fullconnection 4d and reshape 4dTo2d, maxpooling+ReLU

2020-10-10 16:18:13 +08:00 · 2020-10-10 16:18:13 +08:00 · 02e594681f
parent 8df757143a
commit 02e594681f
15 changed files with 715 additions and 113 deletions
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl
@ -0,0 +1,106 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define C4NUM 4
+#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+__kernel void FullConnection_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                                   __write_only image2d_t output, int4 in_shape, int2 out_shape) {
+  int gidx = get_global_id(0);  // CO4
+  int gidz = get_global_id(2);  // N
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int hwci4 = ci4 * in_shape.y * in_shape.z;
+  int co4 = UP_DIV(out_shape.y, C4NUM);
+  int n = out_shape.x;
+  bool inside = gidx < co4 && gidz < n;
+  FLT4 result = (FLT4)(0.0f);
+  for (uint i = lidy; i < hwci4 && inside; i += 4) {
+    int index_h = i / (ci4 * in_shape.z);
+    int index_wci4 = i % (ci4 * in_shape.z);
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_wci4, gidz * in_shape.y + index_h));
+    FLT16 w = weight[i * co4 + gidx];
+    result.x += dot(v, w.s0123);
+    result.y += dot(v, w.s4567);
+    result.z += dot(v, w.s89ab);
+    result.w += dot(v, w.scdef);
+  }
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
+    WRITE_IMAGE(output, (int2)(gidx, gidz), result);
+  }
+}
+
+__kernel void FullConnection_NHWC4_ReLU(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                                        __write_only image2d_t output, int4 in_shape, int2 out_shape) {
+  int gidx = get_global_id(0);  // CO4
+  int gidz = get_global_id(2);  // N
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int hwci4 = ci4 * in_shape.y * in_shape.z;
+  int co4 = UP_DIV(out_shape.y, C4NUM);
+  int n = out_shape.x;
+  bool inside = gidx < co4 && gidz < n;
+  FLT4 result = (FLT4)(0.0f);
+  for (uint i = lidy; i < hwci4 && inside; i += 4) {
+    int index_h = i / (ci4 * in_shape.z);
+    int index_wci4 = i % (ci4 * in_shape.z);
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_wci4, gidz * in_shape.y + index_h));
+    FLT16 w = weight[i * co4 + gidx];
+    result.x += dot(v, w.s0123);
+    result.y += dot(v, w.s4567);
+    result.z += dot(v, w.s89ab);
+    result.w += dot(v, w.scdef);
+  }
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
+    result = max(result, (FLT4)(0.f));
+    WRITE_IMAGE(output, (int2)(gidx, gidz), result);
+  }
+}
+
+__kernel void FullConnection_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                                    __write_only image2d_t output, int4 in_shape, int2 out_shape) {
+  int gidx = get_global_id(0);  // CO4
+  int gidz = get_global_id(2);  // N
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int hwci4 = ci4 * in_shape.y * in_shape.z;
+  int co4 = UP_DIV(out_shape.y, C4NUM);
+  int n = out_shape.x;
+  bool inside = gidx < co4 && gidz < n;
+  FLT4 result = (FLT4)(0.0f);
+  for (uint i = lidy; i < hwci4 && inside; i += 4) {
+    int index_ci4h = i / in_shape.z;
+    int index_w = i % in_shape.z;
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_w, gidz * in_shape.y * ci4 + index_ci4h));
+    FLT16 w = weight[i * co4 + gidx];
+    result.x += dot(v, w.s0123);
+    result.y += dot(v, w.s4567);
+    result.z += dot(v, w.s89ab);
+    result.w += dot(v, w.scdef);
+  }
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
+    WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result);
+  }
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
@ -2,8 +2,8 @@
 #define C4NUM 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                              __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+                              int4 in_shape, int4 out_shape) {
  int gidx = get_global_id(0);  // CO4
  int gidz = get_global_id(2);  // N
  int lidx = get_local_id(0);
@ -21,7 +21,6 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh
    result.z += dot(v, w.s89ab);
    result.w += dot(v, w.scdef);
  }
-  WRITE_IMAGE(output, (int2)(gidx, gidz), result);
  __local FLT4 temp[32][4];
  temp[lidx][lidy] = result;
  barrier(CLK_LOCAL_MEM_FENCE);
@ -29,15 +28,12 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh
    result += temp[lidx][1];
    result += temp[lidx][2];
    result += temp[lidx][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
-    }
    WRITE_IMAGE(output, (int2)(gidx, gidz), result);
  }
 }

-__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                               __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+                               int4 in_shape, int4 out_shape) {
  int gidx = get_global_id(0);  // CO4
  int gidz = get_global_id(2);  // N
  int lidx = get_local_id(0);
@ -62,15 +58,12 @@ __kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weig
    result += temp[lidx][1];
    result += temp[lidx][2];
    result += temp[lidx][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
-    }
-    WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, 0), result);
+    WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result);
  }
 }

-__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                              __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+                              int4 in_shape, int4 out_shape) {
  int gidx = get_global_id(0);  // CO4
  int gidy = get_global_id(1);  // N * H * 4
  int gidz = get_global_id(2);  // W
@ -99,15 +92,12 @@ __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weigh
    result += temp[lidx][1];
    result += temp[lidx][2];
    result += temp[lidx][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
-    }
    WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
  }
 }

-__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                               __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+                               int4 in_shape, int4 out_shape) {
  int gidx = get_global_id(0);  // CO4
  int gidy = get_global_id(1);  // N * H * 4
  int gidz = get_global_id(2);  // W
@ -138,9 +128,6 @@ __kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weig
    result += temp[lidx][1];
    result += temp[lidx][2];
    result += temp[lidx][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
-    }
    WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result);
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
@ -65,6 +65,35 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
 }

+__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
+                                          const int4 input_shape, const int4 output_shape, const int2 stride,
+                                          const int2 kernel_size, const int2 padding) {
+  // axis to dst tensor coordinate
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+
+  // boundary check
+  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
+    return;
+  }
+
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    if (y_c < 0 || y_c >= input_shape.y) continue;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      if (x_c < 0 || x_c >= input_shape.x) continue;
+      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
+      maximum = max(src, maximum);
+    }
+  }
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
+}
+
 __kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
                                      const int4 input_shape, const int4 output_shape, const int2 stride,
                                      const int2 kernel_size, const int2 padding) {
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
@ -2,27 +2,23 @@
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) {
  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) {
+  if (X >= size_out.x * size_out.y * size_out.z * size_out.w) {
    return;
  }
-  int out_index = X * size_out.y + Y;
-  int ih = out_index / size.y;
-  int iw = out_index % size.y;
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(iw * size.z + Z, ih)));
+  int in_img_x = size.z * size.w;
+  int out_img_x = size_out.z * size_out.w;
+  WRITE_IMAGE(dst_data, (int2)(X % out_img_x, X / out_img_x),
+              READ_IMAGE(src_data, smp_zero, (int2)(X % in_img_x, X / in_img_x)));
 }

 __kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size,
                             int4 size_out) {
  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) {
+  if (X >= size_out.x * size_out.y * size_out.z * size_out.w) {
    return;
  }
-  int out_index = X * size_out.y + Y;
-  int ih = out_index / size.y;
-  int iw = out_index % size.y;
-  WRITE_IMAGE(dst_data, (int2)(Y, Z * size_out.x + X), READ_IMAGE(src_data, smp_zero, (int2)(iw, Z * size.x + ih)));
+  int in_img_x = size.z;
+  int out_img_x = size_out.z;
+  WRITE_IMAGE(dst_data, (int2)(X % out_img_x, X / out_img_x),
+              READ_IMAGE(src_data, smp_zero, (int2)(X % in_img_x, X / in_img_x)));
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@ -0,0 +1,254 @@
+/**
+ * Copyright 2019 Huawei Technologies n., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <set>
+#include <string>
+#include <map>
+#include "nnacl/fp32/common_func.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/opencl/kernel/fullconnection.h"
+#ifndef PROGRAM_WITH_IL
+#include "src/runtime/kernel/opencl/cl/fullconnection.cl.inc"
+#endif
+
+using mindspore::kernel::KERNEL_ARCH::kGPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_FullConnection;
+
+namespace mindspore::kernel {
+
+int FullConnectionOpenCLKernel::Init() {
+  std::string kernel_name = "FullConnection";
+  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
+  auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  transposeA = param->a_transpose_;
+  if (transposeA) {
+    MS_LOG(ERROR) << "fullconnection only support a_transpose_=false yet.";
+    return RET_ERROR;
+  }
+  transposeB = param->b_transpose_;
+  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+  if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) ||
+      out_tensors_[0]->shape().size() != 2) {
+    MS_LOG(ERROR) << "fullconnection only support input shape size = 2 or 4";
+    return RET_ERROR;
+  }
+  if (in_tensors_[0]->shape().size() == 4) {
+    if (in_tensors_[0]->shape()[3] % C4NUM != 0) {
+      MS_LOG(ERROR) << "fullconnection only support input shape channel % 4 = 0 if input shape size = 4";
+      return RET_ERROR;
+    }
+    inShape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2],
+               in_tensors_[0]->shape()[3]};
+  } else {
+    inShape = {in_tensors_[0]->shape()[0], 1, 1, in_tensors_[0]->shape()[1]};
+  }
+
+  outShape = out_tensors_[0]->shape();
+  switch (param->act_type_) {
+    case ActType_No:
+      break;
+    case ActType_Relu:
+      kernel_name += "_ReLU";
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
+      return RET_ERROR;
+  }
+#ifdef PROGRAM_WITH_IL
+  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
+#else
+  std::set<std::string> build_options;
+  std::string source = fullconnection_source;
+  std::string program_name = "FullConnection";
+  ocl_runtime_->LoadSource(program_name, source);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+#endif
+
+  PadWeight();
+  in_ori_format_ = in_tensors_[0]->GetFormat();
+  out_ori_format_ = out_tensors_[0]->GetFormat();
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);
+  MS_LOG(DEBUG) << kernel_name << " Init Done!";
+  return RET_OK;
+}
+
+int FullConnectionOpenCLKernel::ReSize() { return RET_OK; }
+
+void FullConnectionOpenCLKernel::PadWeight() {
+  // ABMCI @ ABCICO = ABMCO
+  auto allocator = ocl_runtime_->GetAllocator();
+  int ci = inShape[1] * inShape[2] * inShape[3];
+  int ci4 = UP_DIV(ci, C4NUM);
+  int co = outShape[1];
+  int co4 = UP_DIV(co, C4NUM);
+  int a = 1;
+  int b = 1;
+
+  size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
+  padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
+  padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
+  auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
+  auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
+  memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
+  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
+
+  // pad weight
+  // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI)
+  // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI)
+  int index = 0;
+  for (int aa = 0; aa < a; aa++) {
+    for (int bb = 0; bb < b; bb++) {
+      int baseAB = (aa * b + bb) * ci * co;
+      for (int i = 0; i < ci4; ++i) {
+        for (int j = 0; j < co4; ++j) {
+          for (int k = 0; k < C4NUM; ++k) {
+            for (int l = 0; l < C4NUM; ++l) {
+              int src_ci = i * C4NUM + l;
+              int src_co = j * C4NUM + k;
+              if (src_ci < ci && src_co < co) {
+                int originId = baseAB + src_ci * co + src_co;
+                if (transposeB) {
+                  originId = baseAB + src_co * ci + src_ci;
+                }
+                if (enable_fp16_) {
+                  if (!isModelFp16) {
+                    padWeightFp16[index++] = originWeightFp32[originId];
+                  } else {
+                    padWeightFp16[index++] = originWeightFp16[originId];
+                  }
+                } else {
+                  if (!isModelFp16) {
+                    padWeightFp32[index++] = originWeightFp32[originId];
+                  } else {
+                    padWeightFp32[index++] = originWeightFp16[originId];
+                  }
+                }
+              } else {
+                index++;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  allocator->UnmapBuffer(padWeight_);
+
+  // pad FC Bias
+  size_t im_dst_x, im_dst_y;
+  im_dst_x = co4;
+  im_dst_y = 1;
+  size_t img_dtype = CL_FLOAT;
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
+  std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype};
+  bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size);
+  bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
+  memset(bias_, 0x00, co4 * C4NUM * dtype_size);
+  if (in_tensors_.size() >= 3) {
+    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+      for (int i = 0; i < co; i++) {
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+      }
+    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+      for (int i = 0; i < co; i++) {
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+      }
+    } else {
+      memcpy(bias_, in_tensors_[2]->data_c(), co * dtype_size);
+    }
+  }
+  allocator->UnmapBuffer(bias_);
+}
+
+int FullConnectionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
+  size_t im_dst_x, im_dst_y;
+  auto out_shape = out_tensors_[0]->shape();
+  int n = 1, h = 1, w = 1, c = 1;
+  if (out_tensors_[0]->shape().size() == 2) {
+    n = out_shape[0];
+    c = out_shape[1];
+  } else {
+    n = out_shape[0];
+    h = out_shape[1];
+    w = out_shape[2];
+    c = out_shape[3];
+  }
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = n * h;
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = w;
+    im_dst_y = n * UP_DIV(c, C4NUM) * h;
+  } else {
+    MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
+  }
+  size_t img_dtype = CL_FLOAT;
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
+  img_size->clear();
+  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
+  *img_size = vec;
+  return RET_OK;
+}
+
+int FullConnectionOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running!";
+  // local size should less than MAX_GROUP_SIZE
+  std::vector<size_t> local = {32, 4, 1};
+  std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[1]), C4NUM), 4, static_cast<size_t>(outShape[0])};
+  int arg_count = 0;
+  cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
+  cl_int2 out_shape = {outShape[0], outShape[1]};
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
+  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  return RET_OK;
+}
+
+kernel::LiteKernel *OpenCLFullConnectionKernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                                      const std::vector<lite::Tensor *> &outputs,
+                                                      OpParameter *opParameter, const lite::InnerContext *ctx,
+                                                      const kernel::KernelKey &desc,
+                                                      const mindspore::lite::PrimitiveC *primitive) {
+  auto *kernel =
+    new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator)
+}  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
@ -0,0 +1,52 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_
+
+#include <vector>
+
+#include "src/runtime/kernel/opencl/opencl_kernel.h"
+#include "nnacl/matmul_parameter.h"
+
+namespace mindspore::kernel {
+
+class FullConnectionOpenCLKernel : public OpenCLKernel {
+ public:
+  explicit FullConnectionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                                      const std::vector<lite::Tensor *> &outputs)
+      : OpenCLKernel(parameter, inputs, outputs) {}
+  ~FullConnectionOpenCLKernel() override{};
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  void PadWeight();
+  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
+
+ private:
+  cl::Kernel kernel_;
+  void *padWeight_;
+  void *bias_;
+  bool enable_fp16_{false};
+  bool transposeA{false};
+  bool transposeB{true};
+  std::vector<int> inShape;
+  std::vector<int> outShape;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@ -26,7 +26,6 @@

 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
-using mindspore::schema::PrimitiveType_FullConnection;
 using mindspore::schema::PrimitiveType_MatMul;

 namespace mindspore::kernel {
@ -135,33 +134,7 @@ void MatMulOpenCLKernel::PadWeight() {
      }
    }
  }
-
-  // pad FC Bias
-  size_t im_dst_x, im_dst_y;
-  im_dst_x = co4;
-  im_dst_y = 1;
-  size_t img_dtype = CL_FLOAT;
-  if (enable_fp16_) {
-    img_dtype = CL_HALF_FLOAT;
-  }
-  std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype};
-  bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size);
-  bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
-  memset(bias_, 0x00, co4 * C4NUM * dtype_size);
-  if (in_tensors_.size() >= 3) {
-    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
-      for (int i = 0; i < co; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
-      }
-    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
-      for (int i = 0; i < co; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
-      }
-    } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), co * dtype_size);
-    }
-  }
-  allocator->UnmapBuffer(bias_);
+  allocator->UnmapBuffer(padWeight_);
 }

 int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
@ -209,11 +182,9 @@ int MatMulOpenCLKernel::Run() {
  cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0);
  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
 }
@ -222,12 +193,7 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *>
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                              const mindspore::lite::PrimitiveC *primitive) {
-  bool hasBias = false;
-  if (opParameter->type_ == PrimitiveType_FullConnection) {
-    hasBias = (reinterpret_cast<MatMulParameter *>(opParameter))->has_bias_;
-  }
-  auto *kernel =
-    new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs, hasBias);
+  auto *kernel = new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
    return nullptr;
@ -241,7 +207,5 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *>
 }

 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLMatMulKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLMatMulKernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
@ -27,10 +27,8 @@ namespace mindspore::kernel {
 class MatMulOpenCLKernel : public OpenCLKernel {
 public:
  explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                              const std::vector<lite::Tensor *> &outputs, bool hasBias)
-      : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) {
-    hasBias_ = hasBias;
-  }
+                              const std::vector<lite::Tensor *> &outputs)
+      : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) {}
  ~MatMulOpenCLKernel() override{};

  int Init() override;
@ -42,8 +40,6 @@ class MatMulOpenCLKernel : public OpenCLKernel {
 private:
  cl::Kernel kernel_;
  void *padWeight_;
-  void *bias_;
-  bool hasBias_{false};
  bool enable_fp16_{false};
  bool transposeA{false};
  bool transposeB{true};
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@ -57,6 +57,16 @@ int PoolingOpenCLKernel::Init() {
    MS_LOG(ERROR) << "Init `Pooling2d` kernel failed!";
    return RET_INVALID_OP_NAME;
  }
+  switch (parameter_->act_type_) {
+    case ActType_No:
+      break;
+    case ActType_Relu:
+      kernel_name += "_ReLU";
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_;
+      return RET_ERROR;
+  }
  enable_fp16_ = ocl_runtime_->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@ -38,11 +38,24 @@ int ReshapeOpenCLKernel::Init() {
    MS_LOG(ERROR) << "Reshape output size should in 2,4";
    return RET_ERROR;
  }
-  if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
-    MS_LOG(ERROR) << "Reshape input channel " << in_tensors_[0]->shape().back() << " should equal output channel"
-                  << out_tensors_[0]->shape().back();
+  if ((in_tensors_[0]->shape().back() % 4 != 0 || out_tensors_[0]->shape().back() % 4 != 0) &&
+      in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
+    MS_LOG(ERROR) << "Reshape input channel align 4 should equal output channel, cin:" << in_tensors_[0]->shape().back()
+                  << " cout:" << out_tensors_[0]->shape().back();
    return RET_ERROR;
  }
+  if (in_tensors_[0]->shape().size() == 2) {
+    inShape = {in_tensors_[0]->shape()[0], 1, 1, in_tensors_[0]->shape()[1]};
+  } else {
+    inShape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2],
+               in_tensors_[0]->shape()[3]};
+  }
+  if (out_tensors_[0]->shape().size() == 2) {
+    outShape = {out_tensors_[0]->shape()[0], 1, 1, out_tensors_[0]->shape()[1]};
+  } else {
+    outShape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
+                out_tensors_[0]->shape()[3]};
+  }
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
@ -64,18 +77,10 @@ int ReshapeOpenCLKernel::ReSize() { return RET_OK; }

 int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t im_dst_x, im_dst_y;
-  std::vector<int> shapex = out_tensors_[0]->shape();
-  int n, h, w, c;
-  if (shapex.size() == 2) {
-    n = shapex[0];
-    h = w = 1;
-    c = shapex[1];
-  } else {
-    n = shapex[0];
-    h = shapex[1];
-    w = shapex[2];
-    c = shapex[3];
-  }
+  int n = outShape[0];
+  int h = outShape[1];
+  int w = outShape[2];
+  int c = outShape[3];
  if (op_format_ == schema::Format::Format_NHWC4) {
    im_dst_x = w * UP_DIV(c, C4NUM);
    im_dst_y = n * h;
@ -98,22 +103,12 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)

 int ReshapeOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";
-  std::vector<int> shapex = in_tensors_[0]->shape();
-  int h = shapex[1];
-  int w = shapex[2];
-  int c = shapex[3];
-  int c4 = UP_DIV(c, C4NUM);
-  int oh, ow;
-  if (out_tensors_[0]->shape().size() == 2) {
-    oh = ow = 1;
-  } else {
-    oh = out_tensors_[0]->shape()[1];
-    ow = out_tensors_[0]->shape()[2];
-  }
+
  std::vector<size_t> local = {};
-  std::vector<size_t> global = {(size_t)oh, (size_t)ow, (size_t)c4};
-  cl_int4 size = {h, w, c4, 1};
-  cl_int4 size_out = {oh, ow, c4, 1};
+  std::vector<size_t> global = {
+    static_cast<size_t>(outShape[0] * outShape[1] * outShape[2] * UP_DIV(outShape[3], C4NUM))};
+  cl_int4 size = {inShape[0], inShape[1], inShape[2], UP_DIV(inShape[3], C4NUM)};
+  cl_int4 size_out = {outShape[0], outShape[1], outShape[2], UP_DIV(outShape[3], C4NUM)};
  int arg_idx = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
@ -38,6 +38,8 @@ class ReshapeOpenCLKernel : public OpenCLKernel {
 private:
  cl::Kernel kernel_;
  bool enable_fp16_{false};
+  std::vector<int> inShape;
+  std::vector<int> outShape;
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/test/models_caffe.cfg
+++ b/mindspore/lite/test/models_caffe.cfg
@ -31,7 +31,6 @@ ml_hardware_liveness
 ml_liveness_detect_landmark
 ml_face_contour
 2012_ATLANTA_1class_20190621_v4.x_nomean
-ml_handpose
 ml_ocr_sfz_add_final_0325
 ml_hardware_pose
 ml_bank_recog
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
@ -0,0 +1,196 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <memory>
+#include "src/common/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/common/file_utils.h"
+#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h"
+#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+
+namespace mindspore {
+class TestFullConnectionOpenCL : public mindspore::CommonTest {
+ public:
+  TestFullConnectionOpenCL() {}
+};
+
+void RunTestCaseFullConnection(const std::vector<int> &shape, void *input_data, void *weight_data, void *bias_data,
+                               void *output_data, bool enable_fp16, int dims) {
+  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
+  ocl_runtime->Init();
+  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
+  ocl_runtime->SetFp16Enable(enable_fp16);
+  auto allocator = ocl_runtime->GetAllocator();
+  std::vector<int> input_shape, output_shape, weight_shape, bias_shape;
+  if (dims == 2) {
+    int ci = shape[0];
+    int co = shape[1];
+    input_shape = {1, ci};
+    output_shape = {1, co};
+    weight_shape = {co, ci};
+    bias_shape = {co};
+  } else if (dims == 4) {
+    int n = shape[0];
+    int h = shape[1];
+    int w = shape[2];
+    int ci = shape[3];
+    int co = shape[4];
+    input_shape = {n, h, w, ci};
+    output_shape = {n, co};
+    weight_shape = {co, h * w * ci};
+    bias_shape = {co};
+  }
+  auto param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
+  if (param == nullptr) {
+    MS_LOG(ERROR) << "param_ptr create error.";
+    return;
+  }
+  param->a_transpose_ = false;
+  param->b_transpose_ = true;
+  param->has_bias_ = true;
+  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                     input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC);
+  auto tensor_x = tensor_x_ptr.get();
+  if (tensor_x == nullptr) {
+    MS_LOG(ERROR) << "tensor_x create error.";
+    return;
+  }
+
+  auto tensor_w_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                     weight_shape, schema::Format_NC);
+  auto tensor_w = tensor_w_ptr.get();
+  if (tensor_w == nullptr) {
+    MS_LOG(ERROR) << "tensor_w create error.";
+    return;
+  }
+  tensor_w->SetData(weight_data);
+
+  auto tensor_bias_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                        bias_shape, schema::Format_NC);
+  auto tensor_bias = tensor_bias_ptr.get();
+  if (tensor_bias == nullptr) {
+    MS_LOG(ERROR) << "tensor_w create error.";
+    return;
+  }
+  tensor_bias->SetData(bias_data);
+
+  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                       output_shape, schema::Format_NC);
+  auto tensor_out = tensor_out_ptr.get();
+  if (tensor_out == nullptr) {
+    MS_LOG(ERROR) << "tensor_out create error.";
+    return;
+  }
+  std::vector<lite::Tensor *> inputs{tensor_x, tensor_w, tensor_bias};
+  std::vector<lite::Tensor *> outputs{tensor_out};
+  auto op_kernel_ptr =
+    std::make_unique<kernel::FullConnectionOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+  auto op_kernel = op_kernel_ptr.release();
+  if (op_kernel == nullptr) {
+    MS_LOG(ERROR) << "op_kernel create error.";
+    return;
+  }
+  op_kernel->Init();
+  inputs[0]->MallocData(allocator);
+
+  std::vector<kernel::LiteKernel *> kernels{op_kernel};
+
+  std::vector<lite::Tensor *> inputs_g{tensor_x};
+  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
+  auto pGraph = pGraph_ptr.get();
+  if (pGraph == nullptr) {
+    MS_LOG(ERROR) << "pGraph create error.";
+    return;
+  }
+  pGraph->Init();
+  memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size);
+  pGraph->Run();
+  if (enable_fp16) {
+    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
+                  2e-2);
+  } else {
+    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
+  }
+
+  for (auto t : inputs) {
+    t->SetData(nullptr);
+  }
+  for (auto t : outputs) {
+    t->SetData(nullptr);
+  }
+  MS_LOG(INFO) << "TestFullConnection passed";
+}
+
+TEST_F(TestFullConnectionOpenCL, FullConnection2DFp32) {
+  int ci = 5;
+  int co = 3;
+  std::vector<int> shape = {ci, co};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
+  std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  std::vector<float> bias_data = {1.0f, 1.0f, 1.0f};
+  std::vector<float> output_data = {11.f, 11.f, 11.f};
+  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false,
+                            2);
+}
+
+TEST_F(TestFullConnectionOpenCL, FullConnection2DFp16) {
+  int ci = 5;
+  int co = 3;
+  std::vector<int> shape = {ci, co};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
+  std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  std::vector<float16_t> bias_data = {1.0f, 1.0f, 1.0f};
+  std::vector<float16_t> output_data = {11.f, 11.f, 11.f};
+  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true,
+                            2);
+}
+
+TEST_F(TestFullConnectionOpenCL, FullConnection4DFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 1;
+  int c = 4;
+  int co = 2;
+  std::vector<int> shape = {n, h, w, c, co};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
+  std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  std::vector<float> bias_data = {1.0f, 1.0f};
+  std::vector<float> output_data = {29.f, 29.f};
+  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false,
+                            4);
+}
+
+TEST_F(TestFullConnectionOpenCL, FullConnection4DFp16) {
+  int n = 1;
+  int h = 2;
+  int w = 1;
+  int c = 4;
+  int co = 2;
+  std::vector<int> shape = {n, h, w, c, co};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
+  std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  std::vector<float16_t> bias_data = {1.0f, 1.0f};
+  std::vector<float16_t> output_data = {29.f, 29.f};
+  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true,
+                            4);
+}
+}  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
@ -88,7 +88,7 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
  std::vector<lite::Tensor *> inputs{tensor_x, tensor_w};
  std::vector<lite::Tensor *> outputs{tensor_out};
  auto op_kernel_ptr =
-    std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs, false);
+    std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  auto op_kernel = op_kernel_ptr.release();
  if (op_kernel == nullptr) {
    MS_LOG(ERROR) << "op_kernel create error.";
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
@ -52,7 +52,7 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
  }
  std::vector<int> out_shape = {n, oh, ow, c};
  if (is_output_2d) {
-    out_shape = {n, c};
+    out_shape = {n, h * w * c};
  }
  auto tensor_out_ptr =
    std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape,
@ -156,4 +156,20 @@ TEST_F(TestReshapeOpenCL, Reshape4DFp16) {

  RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, false);
 }
+
+TEST_F(TestReshapeOpenCL, Reshape4D2DFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 4;
+  int oh = 2;
+  int ow = 2;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+  std::vector<float> output_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+                                    8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+
+  RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, true);
+}
 }  // namespace mindspore