diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl
new file mode 100644
index 00000000000..a5cfaadf9fb
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl
@@ -0,0 +1,109 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define C4NUM 4
+#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+__kernel void FullConnection_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                                   __write_only image2d_t output, int4 in_shape, int2 out_shape) {
+  int gidx = get_global_id(0);  // CO4
+  int gidz = get_global_id(2);  // N
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int hwci4 = ci4 * in_shape.y * in_shape.z;
+  int co4 = UP_DIV(out_shape.y, C4NUM);
+  int n = out_shape.x;
+  bool inside = gidx < co4 && gidz < n;
+  FLT4 result = (FLT4)(0.0f);
+  for (uint i = lidy; i < hwci4 && inside; i += 4) {
+    int index_h = i / (ci4 * in_shape.z);
+    int index_wci4 = i % (ci4 * in_shape.z);
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_wci4, gidz * in_shape.y + index_h));
+    FLT16 w = weight[i * co4 + gidx];
+    result.x += dot(v, w.s0123);
+    result.y += dot(v, w.s4567);
+    result.z += dot(v, w.s89ab);
+    result.w += dot(v, w.scdef);
+  }
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
+    WRITE_IMAGE(output, (int2)(gidx, gidz), result);
+  }
+}
+
+__kernel void FullConnection_NHWC4_ReLU(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                                        __write_only image2d_t output, int4 in_shape, int2 out_shape) {
+  int gidx = get_global_id(0);  // CO4
+  int gidz = get_global_id(2);  // N
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int hwci4 = ci4 * in_shape.y * in_shape.z;
+  int co4 = UP_DIV(out_shape.y, C4NUM);
+  int n = out_shape.x;
+  bool inside = gidx < co4 && gidz < n;
+  FLT4 result = (FLT4)(0.0f);
+  for (uint i = lidy; i < hwci4 && inside; i += 4) {
+    int index_h = i / (ci4 * in_shape.z);
+    int index_wci4 = i % (ci4 * in_shape.z);
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_wci4, gidz * in_shape.y + index_h));
+    FLT16 w = weight[i * co4 + gidx];
+    result.x += dot(v, w.s0123);
+    result.y += dot(v, w.s4567);
+    result.z += dot(v, w.s89ab);
+    result.w += dot(v, w.scdef);
+  }
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
+    result = max(result, (FLT4)(0.f));
+    WRITE_IMAGE(output, (int2)(gidx, gidz), result);
+  }
+}
+
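+// NC4HW4 variant: the input image is laid out as (N * CI4 * H) rows by W
+// columns, so the flattened loop index i decomposes into (ci4*h, w) here,
+// rather than (h, w*ci4) as in the NHWC4 kernels above.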
+__kernel void FullConnection_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
+                                    __write_only image2d_t output, int4 in_shape, int2 out_shape) {
+  int gidx = get_global_id(0);  // CO4
+  int gidz = get_global_id(2);  // N
+  int lidx = get_local_id(0);
+  int lidy = get_local_id(1);
+  int ci4 = UP_DIV(in_shape.w, C4NUM);
+  int hwci4 = ci4 * in_shape.y * in_shape.z;
+  int co4 = UP_DIV(out_shape.y, C4NUM);
+  int n = out_shape.x;
+  bool inside = gidx < co4 && gidz < n;
+  FLT4 result = (FLT4)(0.0f);
+  for (uint i = lidy; i < hwci4 && inside; i += 4) {
+    int index_ci4h = i / in_shape.z;
+    int index_w = i % in_shape.z;
+    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_w, gidz * in_shape.y * ci4 + index_ci4h));
+    FLT16 w = weight[i * co4 + gidx];
+    result.x += dot(v, w.s0123);
+    result.y += dot(v, w.s4567);
+    result.z += dot(v, w.s89ab);
+    result.w += dot(v, w.scdef);
+  }
+  __local FLT4 temp[32][4];
+  temp[lidx][lidy] = result;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (lidy == 0 && inside) {
+    result += temp[lidx][1];
+    result += temp[lidx][2];
+    result += temp[lidx][3];
+    result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
+    WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result);
+  }
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
index 914fffe5ce8..3b9035221d3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
@@ -2,8 +2,8 @@
 #define C4NUM 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                              __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+                              int4 in_shape, int4 out_shape) {
   int gidx = get_global_id(0);  // CO4
   int gidz = get_global_id(2);  // N
   int lidx = get_local_id(0);
@@ -21,7 +21,6 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh
     result.z += dot(v, w.s89ab);
     result.w += dot(v, w.scdef);
   }
-  WRITE_IMAGE(output, (int2)(gidx, gidz), result);
   __local FLT4 temp[32][4];
   temp[lidx][lidy] = result;
   barrier(CLK_LOCAL_MEM_FENCE);
@@ -29,15 +28,12 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh
     result += temp[lidx][1];
     result += temp[lidx][2];
     result += temp[lidx][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
-    }
     WRITE_IMAGE(output, (int2)(gidx, gidz), result);
   }
 }
 
-__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                               __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+                               int4 in_shape, int4 out_shape) {
   int gidx = get_global_id(0);  // CO4
   int gidz = get_global_id(2);  // N
   int lidx = get_local_id(0);
@@ -62,15 +58,12 @@ __kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weig
     result += temp[lidx][1];
     result += temp[lidx][2];
     result += temp[lidx][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
-    }
-    WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, 0), result);
+    WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result);
   }
 }
 
-__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                              __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+                              int4 in_shape, int4 out_shape) {
   int gidx = get_global_id(0);  // CO4
   int gidy = get_global_id(1);  // N * H * 4
   int gidz = get_global_id(2);  // W
@@ -99,15 +92,12 @@ __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weigh
     result += temp[lidx][1];
     result += temp[lidx][2];
     result += temp[lidx][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
-    }
     WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
   }
 }
 
-__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                               __write_only image2d_t output, int4 in_shape, int4 out_shape, int has_bias) {
+__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+                               int4 in_shape, int4 out_shape) {
   int gidx = get_global_id(0);  // CO4
   int gidy = get_global_id(1);  // N * H * 4
   int gidz = get_global_id(2);  // W
@@ -138,9 +128,6 @@ __kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weig
     result += temp[lidx][1];
     result += temp[lidx][2];
     result += temp[lidx][3];
-    if (has_bias != 0) {
-      result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
-    }
     WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result);
   }
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
index 23121428fb2..596f6f4367a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
@@ -65,6 +65,35 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
   WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
 }
 
+__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
+                                          const int4 input_shape, const int4 output_shape, const int2 stride,
+                                          const int2 kernel_size, const int2 padding) {
+  // axis to dst tensor coordinate
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+
+  // boundary check
+  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
+    return;
+  }
+
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    if (y_c < 0 || y_c >= input_shape.y) continue;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      if (x_c < 0 || x_c >= input_shape.x) continue;
+      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
+      maximum = max(src, maximum);
+    }
+  }
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
+}
+
 __kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output,
                                       const int4 input_shape, const int4 output_shape, const int2 stride,
                                       const int2 kernel_size, const int2 padding) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
index 1b35e9aca43..a163fb9175a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
@@ -2,27 +2,23 @@
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) {
   int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) {
+  if (X >= size_out.x * size_out.y * size_out.z * size_out.w) {
     return;
   }
-  int out_index = X * size_out.y + Y;
-  int ih = out_index / size.y;
-  int iw = out_index % size.y;
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(iw * size.z + Z, ih)));
+  int in_img_x = size.z * size.w;
+  int out_img_x = size_out.z * size_out.w;
+  WRITE_IMAGE(dst_data, (int2)(X % out_img_x, X / out_img_x),
+              READ_IMAGE(src_data, smp_zero, (int2)(X % in_img_x, X / in_img_x)));
 }
 
 __kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) {
   int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) {
+  if (X >= size_out.x * size_out.y * size_out.z * size_out.w) {
     return;
   }
-  int out_index = X * size_out.y + Y;
-  int ih = out_index / size.y;
-  int iw = out_index % size.y;
-  WRITE_IMAGE(dst_data, (int2)(Y, Z * size_out.x + X), READ_IMAGE(src_data, smp_zero, (int2)(iw, Z * size.x + ih)));
+  int in_img_x = size.z;
+  int out_img_x = size_out.z;
+  WRITE_IMAGE(dst_data, (int2)(X % out_img_x, X / out_img_x),
+              READ_IMAGE(src_data, smp_zero, (int2)(X % in_img_x, X / in_img_x)));
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
new file mode 100644
index 00000000000..9b1013fe6b9
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -0,0 +1,256 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <set>
+#include <string>
+#include <vector>
+#include "nnacl/fp32/common_func.h"
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/opencl/kernel/fullconnection.h"
+#ifndef PROGRAM_WITH_IL
+#include "src/runtime/kernel/opencl/cl/fullconnection.cl.inc"
+#endif
+
+using mindspore::kernel::KERNEL_ARCH::kGPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_FullConnection;
+
+namespace mindspore::kernel {
+
+int FullConnectionOpenCLKernel::Init() {
+  std::string kernel_name = "FullConnection";
+  kernel_name += "_" + std::string(EnumNameFormat(op_format_));
+  auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  transposeA = param->a_transpose_;
+  if (transposeA) {
+    MS_LOG(ERROR) << "fullconnection only supports a_transpose_=false for now.";
+    return RET_ERROR;
+  }
+  transposeB = param->b_transpose_;
+  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+  if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) ||
+      out_tensors_[0]->shape().size() != 2) {
+    MS_LOG(ERROR) << "fullconnection only supports input shape size = 2 or 4";
+    return RET_ERROR;
+  }
+  if (in_tensors_[0]->shape().size() == 4) {
+    if (in_tensors_[0]->shape()[3] % C4NUM != 0) {
+      MS_LOG(ERROR) << "fullconnection only supports input channel % 4 == 0 when input shape size = 4";
+      return RET_ERROR;
+    }
+    inShape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2],
+               in_tensors_[0]->shape()[3]};
+  } else {
+    inShape = {in_tensors_[0]->shape()[0], 1, 1, in_tensors_[0]->shape()[1]};
+  }
+
+  outShape = out_tensors_[0]->shape();
+  switch (param->act_type_) {
+    case ActType_No:
+      break;
+    case ActType_Relu:
+      kernel_name += "_ReLU";
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
+      return RET_ERROR;
+  }
+#ifdef PROGRAM_WITH_IL
+  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
+#else
+  std::set<std::string> build_options;
+  std::string source = fullconnection_source;
+  std::string program_name = "FullConnection";
+  ocl_runtime_->LoadSource(program_name, source);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+#endif
+
+  PadWeight();
+  in_ori_format_ = in_tensors_[0]->GetFormat();
+  out_ori_format_ = out_tensors_[0]->GetFormat();
+  in_tensors_[0]->SetFormat(op_format_);
+  out_tensors_[0]->SetFormat(op_format_);
+  MS_LOG(DEBUG) << kernel_name << " Init Done!";
+  return RET_OK;
+}
+
+int FullConnectionOpenCLKernel::ReSize() { return RET_OK; }
+
+void FullConnectionOpenCLKernel::PadWeight() {
+  // ABMCI @ ABCICO = ABMCO
+  auto allocator = ocl_runtime_->GetAllocator();
+  int ci = inShape[1] * inShape[2] * inShape[3];
+  int ci4 = UP_DIV(ci, C4NUM);
+  int co = outShape[1];
+  int co4 = UP_DIV(co, C4NUM);
+  int a = 1;
+  int b = 1;
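+  // e.g. with ci = 5, co = 3: ci4 = 2, co4 = 1, so the padded buffer holds
+  // a * b * ci4 * co4 * 16 = 32 elements; slots beyond ci/co remain zero.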
+  size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
+  padWeight_ = allocator->Malloc(a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
+  padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
+  auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
+  auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
+  memset(padWeight_, 0x00, a * b * ci4 * co4 * C4NUM * C4NUM * dtype_size);
+  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
+  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
+  bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
+
+  // pad weight
+  // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI)
+  // if transposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI)
+  int index = 0;
+  for (int aa = 0; aa < a; aa++) {
+    for (int bb = 0; bb < b; bb++) {
+      int baseAB = (aa * b + bb) * ci * co;
+      for (int i = 0; i < ci4; ++i) {
+        for (int j = 0; j < co4; ++j) {
+          for (int k = 0; k < C4NUM; ++k) {
+            for (int l = 0; l < C4NUM; ++l) {
+              int src_ci = i * C4NUM + l;
+              int src_co = j * C4NUM + k;
+              if (src_ci < ci && src_co < co) {
+                int originId = baseAB + src_ci * co + src_co;
+                if (transposeB) {
+                  originId = baseAB + src_co * ci + src_ci;
+                }
+                if (enable_fp16_) {
+                  if (!isModelFp16) {
+                    padWeightFp16[index++] = originWeightFp32[originId];
+                  } else {
+                    padWeightFp16[index++] = originWeightFp16[originId];
+                  }
+                } else {
+                  if (!isModelFp16) {
+                    padWeightFp32[index++] = originWeightFp32[originId];
+                  } else {
+                    padWeightFp32[index++] = originWeightFp16[originId];
+                  }
+                }
+              } else {
+                index++;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  allocator->UnmapBuffer(padWeight_);
+
+  // pad FC Bias
+  size_t im_dst_x, im_dst_y;
+  im_dst_x = co4;
+  im_dst_y = 1;
+  size_t img_dtype = CL_FLOAT;
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
+  std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype};
+  bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size);
+  bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
+  memset(bias_, 0x00, co4 * C4NUM * dtype_size);
+  if (in_tensors_.size() >= 3) {
+    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
+      for (int i = 0; i < co; i++) {
+        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
+      }
+    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
+      for (int i = 0; i < co; i++) {
+        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
+      }
+    } else {
+      memcpy(bias_, in_tensors_[2]->data_c(), co * dtype_size);
+    }
+  }
+  allocator->UnmapBuffer(bias_);
+}
+
+int FullConnectionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
+  size_t im_dst_x, im_dst_y;
+  auto out_shape = out_tensors_[0]->shape();
+  int n = 1, h = 1, w = 1, c = 1;
+  if (out_tensors_[0]->shape().size() == 2) {
+    n = out_shape[0];
+    c = out_shape[1];
+  } else {
+    n = out_shape[0];
+    h = out_shape[1];
+    w = out_shape[2];
+    c = out_shape[3];
+  }
+  if (op_format_ == schema::Format_NHWC4) {
+    im_dst_x = w * UP_DIV(c, C4NUM);
+    im_dst_y = n * h;
+  } else if (op_format_ == schema::Format_NC4HW4) {
+    im_dst_x = w;
+    im_dst_y = n * UP_DIV(c, C4NUM) * h;
+  } else {
+    MS_LOG(ERROR) << "unsupported op format:" << EnumNameFormat(op_format_);
+    return RET_ERROR;
+  }
+  size_t img_dtype = CL_FLOAT;
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
+  img_size->clear();
+  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
+  *img_size = vec;
+  return RET_OK;
+}
+
+int FullConnectionOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running!";
+  // local size should be less than MAX_GROUP_SIZE
+  std::vector<size_t> local = {32, 4, 1};
+  std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[1]), C4NUM), 4, static_cast<size_t>(outShape[0])};
+  int arg_count = 0;
+  cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
+  cl_int2 out_shape = {outShape[0], outShape[1]};
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
+  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  return RET_OK;
+}
+
+kernel::LiteKernel *OpenCLFullConnectionKernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                                      const std::vector<lite::Tensor *> &outputs,
+                                                      OpParameter *opParameter, const lite::InnerContext *ctx,
+                                                      const kernel::KernelKey &desc,
+                                                      const mindspore::lite::PrimitiveC *primitive) {
+  auto *kernel =
+    new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << "kernel " << opParameter->name_ << " is nullptr.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator)
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
new file mode 100644
index 00000000000..98de476ab53
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_
+
+#include <vector>
+
+#include "src/runtime/kernel/opencl/opencl_kernel.h"
+#include "nnacl/matmul_parameter.h"
+
+namespace mindspore::kernel {
+
+class FullConnectionOpenCLKernel : public OpenCLKernel {
+ public:
+  explicit FullConnectionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                                      const std::vector<lite::Tensor *> &outputs)
+      : OpenCLKernel(parameter, inputs, outputs) {}
+  ~FullConnectionOpenCLKernel() override{};
+
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+  void PadWeight();
+  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
+
+ private:
+  cl::Kernel kernel_;
+  void *padWeight_;
+  void *bias_;
+  bool enable_fp16_{false};
+  bool transposeA{false};
+  bool transposeB{true};
+  std::vector<int> inShape;
+  std::vector<int> outShape;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FULLCONNECTION_H_
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index a3120f84de0..b94da9367eb 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -26,7 +26,6 @@
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
-using mindspore::schema::PrimitiveType_FullConnection;
 using mindspore::schema::PrimitiveType_MatMul;
 
 namespace mindspore::kernel {
@@ -135,33 +134,7 @@ void MatMulOpenCLKernel::PadWeight() {
       }
     }
   }
-
-  // pad FC Bias
-  size_t im_dst_x, im_dst_y;
-  im_dst_x = co4;
-  im_dst_y = 1;
-  size_t img_dtype = CL_FLOAT;
-  if (enable_fp16_) {
-    img_dtype = CL_HALF_FLOAT;
-  }
-  std::vector<size_t> img_size{im_dst_x, im_dst_y, img_dtype};
-  bias_ = allocator->Malloc(im_dst_x * im_dst_y * C4NUM * dtype_size, img_size);
-  bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
-  memset(bias_, 0x00, co4 * C4NUM * dtype_size);
-  if (in_tensors_.size() >= 3) {
-    if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
-      for (int i = 0; i < co; i++) {
-        reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
-      }
-    } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) {
-      for (int i = 0; i < co; i++) {
-        reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i];
-      }
-    } else {
-      memcpy(bias_, in_tensors_[2]->data_c(), co * dtype_size);
-    }
-  }
-  allocator->UnmapBuffer(bias_);
+  allocator->UnmapBuffer(padWeight_);
 }
 
 int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
@@ -209,11 +182,9 @@ int MatMulOpenCLKernel::Run() {
   cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, hasBias_ ? 1 : 0);
   ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
   return RET_OK;
 }
@@ -222,12 +193,7 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                               const mindspore::lite::PrimitiveC *primitive) {
-  bool hasBias = false;
-  if (opParameter->type_ == PrimitiveType_FullConnection) {
-    hasBias = (reinterpret_cast<MatMulParameter *>(opParameter))->has_bias_;
-  }
-  auto *kernel =
-    new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs, hasBias);
+  auto *kernel = new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
   if (kernel == nullptr) {
     MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
     return nullptr;
@@ -241,7 +207,5 @@ kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *>
 }
 
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLMatMulKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLMatMulKernelCreator)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
index 141b16941ac..43c0b339aaf 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
@@ -27,10 +27,8 @@ namespace mindspore::kernel {
 class MatMulOpenCLKernel : public OpenCLKernel {
  public:
   explicit MatMulOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                              const std::vector<lite::Tensor *> &outputs, bool hasBias)
-      : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) {
-    hasBias_ = hasBias;
-  }
+                              const std::vector<lite::Tensor *> &outputs)
+      : OpenCLKernel(parameter, inputs, outputs), inShape(MAX_DIMS, 1), outShape(MAX_DIMS, 1) {}
   ~MatMulOpenCLKernel() override{};
 
   int Init() override;
@@ -42,8 +40,6 @@ class MatMulOpenCLKernel : public OpenCLKernel {
  private:
   cl::Kernel kernel_;
   void *padWeight_;
-  void *bias_;
-  bool hasBias_{false};
   bool enable_fp16_{false};
   bool transposeA{false};
   bool transposeB{true};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
index a41601fdb2f..1fc968ab622 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -57,6 +57,16 @@ int PoolingOpenCLKernel::Init() {
     MS_LOG(ERROR) << "Init `Pooling2d` kernel failed!";
     return RET_INVALID_OP_NAME;
   }
+  switch (parameter_->act_type_) {
+    case ActType_No:
+      break;
+    case ActType_Relu:
+      kernel_name += "_ReLU";
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_;
+      return RET_ERROR;
+  }
   enable_fp16_ = ocl_runtime_->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index a20f7dca42f..eaead966ac8 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -38,11 +38,24 @@ int ReshapeOpenCLKernel::Init() {
     MS_LOG(ERROR) << "Reshape output size should in 2,4";
     return RET_ERROR;
   }
-  if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
-    MS_LOG(ERROR) << "Reshape input channel " << in_tensors_[0]->shape().back() << " should equal output channel"
-                  << out_tensors_[0]->shape().back();
+  if ((in_tensors_[0]->shape().back() % 4 != 0 || out_tensors_[0]->shape().back() % 4 != 0) &&
+      in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) {
+    MS_LOG(ERROR) << "Reshape input channel must equal output channel when not 4-aligned, cin:"
+                  << in_tensors_[0]->shape().back() << " cout:" << out_tensors_[0]->shape().back();
     return RET_ERROR;
   }
+  if (in_tensors_[0]->shape().size() == 2) {
+    inShape = {in_tensors_[0]->shape()[0], 1, 1, in_tensors_[0]->shape()[1]};
+  } else {
+    inShape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2],
+               in_tensors_[0]->shape()[3]};
+  }
+  if (out_tensors_[0]->shape().size() == 2) {
+    outShape = {out_tensors_[0]->shape()[0], 1, 1, out_tensors_[0]->shape()[1]};
+  } else {
+    outShape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
+                out_tensors_[0]->shape()[3]};
+  }
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
@@ -64,18 +77,10 @@ int ReshapeOpenCLKernel::ReSize() { return RET_OK; }
 
 int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
-  std::vector<int> shapex = out_tensors_[0]->shape();
-  int n, h, w, c;
-  if (shapex.size() == 2) {
-    n = shapex[0];
-    h = w = 1;
-    c = shapex[1];
-  } else {
-    n = shapex[0];
-    h = shapex[1];
-    w = shapex[2];
-    c = shapex[3];
-  }
+  int n = outShape[0];
+  int h = outShape[1];
+  int w = outShape[2];
+  int c = outShape[3];
   if (op_format_ == schema::Format::Format_NHWC4) {
     im_dst_x = w * UP_DIV(c, C4NUM);
     im_dst_y = n * h;
@@ -98,22 +103,12 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
 
 int ReshapeOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  std::vector<int> shapex = in_tensors_[0]->shape();
-  int h = shapex[1];
-  int w = shapex[2];
-  int c = shapex[3];
-  int c4 = UP_DIV(c, C4NUM);
-  int oh, ow;
-  if (out_tensors_[0]->shape().size() == 2) {
-    oh = ow = 1;
-  } else {
-    oh = out_tensors_[0]->shape()[1];
-    ow = out_tensors_[0]->shape()[2];
-  }
+
   std::vector<size_t> local = {};
-  std::vector<size_t> global = {(size_t)oh, (size_t)ow, (size_t)c4};
-  cl_int4 size = {h, w, c4, 1};
-  cl_int4 size_out = {oh, ow, c4, 1};
+  std::vector<size_t> global = {
+    static_cast<size_t>(outShape[0] * outShape[1] * outShape[2] * UP_DIV(outShape[3], C4NUM))};
+  cl_int4 size = {inShape[0], inShape[1], inShape[2], UP_DIV(inShape[3], C4NUM)};
+  cl_int4 size_out = {outShape[0], outShape[1], outShape[2], UP_DIV(outShape[3], C4NUM)};
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
index 7a570967781..4ee7d789d31 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
@@ -38,6 +38,8 @@ class ReshapeOpenCLKernel : public OpenCLKernel {
  private:
   cl::Kernel kernel_;
   bool enable_fp16_{false};
+  std::vector<int> inShape;
+  std::vector<int> outShape;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/test/models_caffe.cfg b/mindspore/lite/test/models_caffe.cfg
index 39da5005f6d..0d29afbb626 100644
--- a/mindspore/lite/test/models_caffe.cfg
+++ b/mindspore/lite/test/models_caffe.cfg
@@ -31,7 +31,6 @@ ml_hardware_liveness
 ml_liveness_detect_landmark
 ml_face_contour
 2012_ATLANTA_1class_20190621_v4.x_nomean
-ml_handpose
 ml_ocr_sfz_add_final_0325
 ml_hardware_pose
 ml_bank_recog
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
new file mode 100644
index 00000000000..ccaae1b1e04
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
@@ -0,0 +1,199 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <memory>
+#include "src/common/log_adapter.h"
+#include "common/common_test.h"
+#include "mindspore/lite/src/common/file_utils.h"
+#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h"
+#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
+
+namespace mindspore {
+class TestFullConnectionOpenCL : public mindspore::CommonTest {
+ public:
+  TestFullConnectionOpenCL() {}
+};
+
+void RunTestCaseFullConnection(const std::vector<int> &shape, void *input_data, void *weight_data, void *bias_data,
+                               void *output_data, bool enable_fp16, int dims) {
+  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
+  ocl_runtime->Init();
+  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
+  ocl_runtime->SetFp16Enable(enable_fp16);
+  auto allocator = ocl_runtime->GetAllocator();
+  std::vector<int> input_shape, output_shape, weight_shape, bias_shape;
+  if (dims == 2) {
+    int ci = shape[0];
+    int co = shape[1];
+    input_shape = {1, ci};
+    output_shape = {1, co};
+    weight_shape = {co, ci};
+    bias_shape = {co};
+  } else if (dims == 4) {
+    int n = shape[0];
+    int h = shape[1];
+    int w = shape[2];
+    int ci = shape[3];
+    int co = shape[4];
+    input_shape = {n, h, w, ci};
+    output_shape = {n, co};
+    weight_shape = {co, h * w * ci};
+    bias_shape = {co};
+  }
+  auto param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
+  if (param == nullptr) {
+    MS_LOG(ERROR) << "param_ptr create error.";
+    return;
+  }
+  param->a_transpose_ = false;
+  param->b_transpose_ = true;
+  param->has_bias_ = true;
+  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                     input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC);
+  auto tensor_x = tensor_x_ptr.get();
+  if (tensor_x == nullptr) {
+    MS_LOG(ERROR) << "tensor_x create error.";
+    return;
+  }
+
+  auto tensor_w_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                     weight_shape, schema::Format_NC);
+  auto tensor_w = tensor_w_ptr.get();
+  if (tensor_w == nullptr) {
+    MS_LOG(ERROR) << "tensor_w create error.";
+    return;
+  }
+  tensor_w->SetData(weight_data);
+
+  auto tensor_bias_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                        bias_shape, schema::Format_NC);
+  auto tensor_bias = tensor_bias_ptr.get();
+  if (tensor_bias == nullptr) {
+    MS_LOG(ERROR) << "tensor_bias create error.";
+    return;
+  }
+  tensor_bias->SetData(bias_data);
+
+  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
+                                                       output_shape, schema::Format_NC);
+  auto tensor_out = tensor_out_ptr.get();
+  if (tensor_out == nullptr) {
+    MS_LOG(ERROR) << "tensor_out create error.";
+    return;
+  }
+  std::vector<lite::Tensor *> inputs{tensor_x, tensor_w, tensor_bias};
+  std::vector<lite::Tensor *> outputs{tensor_out};
+  auto op_kernel_ptr =
+    std::make_unique<kernel::FullConnectionOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+  auto op_kernel = op_kernel_ptr.release();
+  if (op_kernel == nullptr) {
+    MS_LOG(ERROR) << "op_kernel create error.";
+    return;
+  }
+  op_kernel->Init();
+  inputs[0]->MallocData(allocator);
+
+  std::vector<kernel::LiteKernel *> kernels{op_kernel};
+
+  std::vector<lite::Tensor *> inputs_g{tensor_x};
+  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels);
+  auto pGraph = pGraph_ptr.get();
+  if (pGraph == nullptr) {
+    MS_LOG(ERROR) << "pGraph create error.";
+    return;
+  }
+  pGraph->Init();
+  memcpy(inputs[0]->MutableData(), input_data, tensor_x->ElementsNum() * dtype_size);
+  pGraph->Run();
+  if (enable_fp16) {
+    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3),
+                  2e-2);
+  } else {
+    CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5));
+  }
+
+  for (auto t : inputs) {
+    t->SetData(nullptr);
+  }
+  for (auto t : outputs) {
+    t->SetData(nullptr);
+  }
+  MS_LOG(INFO) << "TestFullConnection passed";
+}
+
+TEST_F(TestFullConnectionOpenCL, FullConnection2DFp32) {
+  int ci = 5;
+  int co = 3;
+  std::vector<int> shape = {ci, co};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
+  std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  std::vector<float> bias_data = {1.0f, 1.0f, 1.0f};
+  std::vector<float> output_data = {11.f, 11.f, 11.f};
+  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false,
+                            2);
+}
+
+TEST_F(TestFullConnectionOpenCL, FullConnection2DFp16) {
+  int ci = 5;
+  int co = 3;
+  std::vector<int> shape = {ci, co};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
+  std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  std::vector<float16_t> bias_data = {1.0f, 1.0f, 1.0f};
+  std::vector<float16_t> output_data = {11.f, 11.f, 11.f};
+  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true,
+                            2);
+}
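+
+// The 4D cases below fold the input's H * W * CI into the weight's CI
+// dimension (weight shape {co, h * w * ci}); with all-ones weights each
+// output element is sum(input) + bias.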
+TEST_F(TestFullConnectionOpenCL, FullConnection4DFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 1;
+  int c = 4;
+  int co = 2;
+  std::vector<int> shape = {n, h, w, c, co};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
+  std::vector<float> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  std::vector<float> bias_data = {1.0f, 1.0f};
+  std::vector<float> output_data = {29.f, 29.f};
+  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false,
+                            4);
+}
+
+TEST_F(TestFullConnectionOpenCL, FullConnection4DFp16) {
+  int n = 1;
+  int h = 2;
+  int w = 1;
+  int c = 4;
+  int co = 2;
+  std::vector<int> shape = {n, h, w, c, co};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
+  std::vector<float16_t> weight_data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
+  std::vector<float16_t> bias_data = {1.0f, 1.0f};
+  std::vector<float16_t> output_data = {29.f, 29.f};
+  RunTestCaseFullConnection(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), true,
+                            4);
+}
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
index 151e20e39df..97ebefcb65b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
@@ -88,7 +88,7 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
   std::vector<lite::Tensor *> inputs{tensor_x, tensor_w};
   std::vector<lite::Tensor *> outputs{tensor_out};
   auto op_kernel_ptr =
-    std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs, false);
+    std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
   auto op_kernel = op_kernel_ptr.release();
   if (op_kernel == nullptr) {
     MS_LOG(ERROR) << "op_kernel create error.";
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
index 9204ec22fe1..91b5b733cab 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
@@ -52,7 +52,7 @@ void RunTestCaseReshape(const std::vector<int> &shape, void *input_data, void *o
   }
   std::vector<int> out_shape = {n, oh, ow, c};
   if (is_output_2d) {
-    out_shape = {n, c};
+    out_shape = {n, h * w * c};
   }
   auto tensor_out_ptr =
     std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape,
@@ -156,4 +156,20 @@ TEST_F(TestReshapeOpenCL, Reshape4DFp16) {
 
   RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, false);
 }
+
+TEST_F(TestReshapeOpenCL, Reshape4D2DFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 4;
+  int oh = 2;
+  int ow = 2;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+  std::vector<float> output_data = {0.0f, 1.0f, 2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,
+                                    8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+
+  RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, true);
+}
 }  // namespace mindspore