From 1f5c6efb5d2f91511c9677230665c19c619eb64c Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Thu, 17 Sep 2020 18:01:35 +0800 Subject: [PATCH] transpose support NCHW2NHWC --- .../src/runtime/kernel/opencl/cl/transpose.cl | 297 +++++++++++------- .../runtime/kernel/opencl/kernel/transpose.cc | 87 +++-- .../runtime/kernel/opencl/kernel/transpose.h | 4 +- mindspore/lite/test/run_test.sh | 9 + .../kernel/opencl/avg_pooling_tests.cc | 24 +- .../kernel/opencl/conv2d_transpose_tests.cc | 91 +----- .../src/runtime/kernel/opencl/matmul_tests.cc | 78 +---- .../kernel/opencl/max_pooling_tests.cc | 23 +- .../src/runtime/kernel/opencl/reduce_tests.cc | 23 +- .../runtime/kernel/opencl/reshape_tests.cc | 22 +- .../runtime/kernel/opencl/softmax_tests.cc | 20 +- .../runtime/kernel/opencl/transpose_tests.cc | 20 +- .../src/runtime/kernel/opencl/utils_tests.h | 23 +- 13 files changed, 329 insertions(+), 392 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl index 91a9958cee8..847ebb98b7a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl @@ -1,138 +1,191 @@ #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= HW.y || Y >= C.y) { +__kernel void transpose_0312_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 shape) { + int X = get_global_id(0); // H4, C4 for src + int Y = get_global_id(1); // W, H for src + int Z = get_global_id(2); // C4, W4 for src + if (4 * X >= shape.y || Y >= shape.z || 4 * Z >= shape.w) { return; } - FLT4 result[4]; - result[0] = (FLT4)(0.0f); - result[1] = (FLT4)(0.0f); - result[2] = (FLT4)(0.0f); - result[3] = (FLT4)(0.0f); - FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X)); - FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1)); - FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2)); - FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3)); - result[0].x = x0.x; - result[0].y = x1.x; - result[0].z = x2.x; - result[0].w = x3.x; - - result[1].x = x0.y; - result[1].y = x1.y; - result[1].z = x2.y; - result[1].w = x3.y; - - result[2].x = x0.z; - result[2].y = x1.z; - result[2].z = x2.z; - result[2].w = x3.z; - - result[3].x = x0.w; - result[3].y = x1.w; - result[3].z = x2.w; - result[3].w = x3.w; - - WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]); - WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]); - WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]); - WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]); + int H4 = UP_DIV(shape.y, 4); + int C4 = UP_DIV(shape.w, 4); + FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z * H4 + X, Y)); + FLT4 src1 = (FLT4)0.f; + if (4 * Z + 1 < shape.w) { + src1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * Z + 1) * H4 + X, Y)); + } + FLT4 src2 = (FLT4)0.f; + if (4 * Z + 2 < shape.w) { + src2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * Z + 2) * H4 + X, Y)); + } + FLT4 src3 = (FLT4)0.f; + if (4 * Z + 3 < shape.w) { + src3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * Z + 3) * H4 + X, Y)); + } + FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x); + FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y); + FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z); + FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w); + WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X), dst0); + if (4 * X + 1 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 1), dst1); + } + if (4 * X + 2 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 2), dst2); + } + if (4 * X + 3 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 3), dst3); + } } -__kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global float4 *dst_data, int2 HW, int2 C, int W, - int H) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= HW.y || Y >= C.y) { +__kernel void transpose_0312_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 shape) { + int X = get_global_id(0); // H4, C4 for src + int Y = get_global_id(1); // W, H for src + int Z = get_global_id(2); // C4, W4 for src + if (4 * X >= shape.y || Y >= shape.z || 4 * Z >= shape.w) { return; } - FLT4 result[4]; - result[0] = (FLT4)(0.0f); - result[1] = (FLT4)(0.0f); - result[2] = (FLT4)(0.0f); - result[3] = (FLT4)(0.0f); - bool over_size = W * C.y > 65535; - FLT4 x0, x1, x2, x3; - if (over_size) { - x0 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X)); - x1 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 1)); - x2 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 2)); - x3 = READ_IMAGE(src_data, smp_zero, (int2)(Y, 4 * X + 3)); - } else { - x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W * C.y + Y, (4 * X) / W)); - x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W * C.y + Y, (4 * X + 1) / W)); - x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W * C.y + Y, (4 * X + 2) / W)); - x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W * C.y + Y, (4 * X + 3) / W)); + FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z, X * shape.z + Y)); + FLT4 src1 = (FLT4)0.f; + if (4 * Z + 1 < shape.w) { + src1 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z + 1, X * shape.z + Y)); + } + FLT4 src2 = (FLT4)0.f; + if (4 * Z + 2 < shape.w) { + src2 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z + 2, X * shape.z + Y)); + } + FLT4 src3 = (FLT4)0.f; + if (4 * Z + 3 < shape.w) { + src3 = READ_IMAGE(src_data, smp_zero, (int2)(4 * Z + 3, X * shape.z + Y)); + } + FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x); + FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y); + FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z); + FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w); + WRITE_IMAGE(dst_data, (int2)(Y, Z * shape.y + 4 * X), dst0); + if (4 * X + 1 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y, Z * shape.y + 4 * X + 1), dst1); + } + if (4 * X + 2 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y, Z * shape.y + 4 * X + 2), dst2); + } + if (4 * X + 3 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y, Z * shape.y + 4 * X + 3), dst3); } - - result[0].x = x0.x; - result[0].y = x1.x; - result[0].z = x2.x; - result[0].w = x3.x; - - result[1].x = x0.y; - result[1].y = x1.y; - result[1].z = x2.y; - result[1].w = x3.y; - - result[2].x = x0.z; - result[2].y = x1.z; - result[2].z = x2.z; - result[2].w = x3.z; - - result[3].x = x0.w; - result[3].y = x1.w; - result[3].z = x2.w; - result[3].w = x3.w; - - if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = convert_float4(result[0]); - if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = convert_float4(result[1]); - if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = convert_float4(result[2]); - if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = convert_float4(result[3]); } -__kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global float4 *dst_data, int2 HW, int2 C, int W, - int H) { - int X = get_global_id(0); - int Y = get_global_id(1); - if (X >= HW.y || Y >= C.y) { +__kernel void transpose_0312_oversize_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, + int4 shape) { + int X = get_global_id(0); // H4, C4 for src + int Y = get_global_id(1); // W, H for src + int Z = get_global_id(2); // C4, W4 for src + if (4 * X >= shape.y || Y >= shape.z || 4 * Z >= shape.w) { return; } - FLT4 result[4]; - result[0] = (FLT4)(0.0f); - result[1] = (FLT4)(0.0f); - result[2] = (FLT4)(0.0f); - result[3] = (FLT4)(0.0f); - FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W, Y * H + (4 * X) / W)); - FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W, Y * H + (4 * X + 1) / W)); - FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W, Y * H + (4 * X + 2) / W)); - FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W, Y * H + (4 * X + 3) / W)); - result[0].x = x0.x; - result[0].y = x1.x; - result[0].z = x2.x; - result[0].w = x3.x; - - result[1].x = x0.y; - result[1].y = x1.y; - result[1].z = x2.y; - result[1].w = x3.y; - - result[2].x = x0.z; - result[2].y = x1.z; - result[2].z = x2.z; - result[2].w = x3.z; - - result[3].x = x0.w; - result[3].y = x1.w; - result[3].z = x2.w; - result[3].w = x3.w; - - if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = convert_float4(result[0]); - if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = convert_float4(result[1]); - if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = convert_float4(result[2]); - if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = convert_float4(result[3]); + int H4 = UP_DIV(shape.y, 4); + int C4 = UP_DIV(shape.w, 4); + FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(Y * H4 + X, 4 * Z)); + FLT4 src1 = (FLT4)0.f; + if (4 * Z + 1 < shape.w) { + src1 = READ_IMAGE(src_data, smp_zero, (int2)(Y * H4 + X, 4 * Z + 1)); + } + FLT4 src2 = (FLT4)0.f; + if (4 * Z + 2 < shape.w) { + src2 = READ_IMAGE(src_data, smp_zero, (int2)(Y * H4 + X, 4 * Z + 2)); + } + FLT4 src3 = (FLT4)0.f; + if (4 * Z + 3 < shape.w) { + src3 = READ_IMAGE(src_data, smp_zero, (int2)(Y * H4 + X, 4 * Z + 3)); + } + FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x); + FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y); + FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z); + FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w); + WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X), dst0); + if (4 * X + 1 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 1), dst1); + } + if (4 * X + 2 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 2), dst2); + } + if (4 * X + 3 < shape.y) { + WRITE_IMAGE(dst_data, (int2)(Y * C4 + Z, 4 * X + 3), dst3); + } +} + +__kernel void transpose_0231_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 shape) { + int X = get_global_id(0); // H, W for src + int Y = get_global_id(1); // W4, C4 for src + int Z = get_global_id(2); // C4, H4 for src + if (X >= shape.y || 4 * Y >= shape.z || 4 * Z >= shape.w) { + return; + } + int W4 = UP_DIV(shape.y, 4); + int C4 = UP_DIV(shape.w, 4); + FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(X * W4 + Y, 4 * Z)); + FLT4 src1 = (FLT4)0.f; + if (4 * Z + 1 < shape.w) { + src1 = READ_IMAGE(src_data, smp_zero, (int2)(X * W4 + Y, 4 * Z + 1)); + } + FLT4 src2 = (FLT4)0.f; + if (4 * Z + 2 < shape.w) { + src2 = READ_IMAGE(src_data, smp_zero, (int2)(X * W4 + Y, 4 * Z + 2)); + } + FLT4 src3 = (FLT4)0.f; + if (4 * Z + 3 < shape.w) { + src3 = READ_IMAGE(src_data, smp_zero, (int2)(X * W4 + Y, 4 * Z + 3)); + } + FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x); + FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y); + FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z); + FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w); + WRITE_IMAGE(dst_data, (int2)(4 * Y * C4 + Z, X), dst0); + if (4 * Y + 1 < shape.z) { + WRITE_IMAGE(dst_data, (int2)((4 * Y + 1) * C4 + Z, X), dst1); + } + if (4 * Y + 2 < shape.z) { + WRITE_IMAGE(dst_data, (int2)((4 * Y + 2) * C4 + Z, X), dst2); + } + if (4 * Y + 3 < shape.z) { + WRITE_IMAGE(dst_data, (int2)((4 * Y + 3) * C4 + Z, X), dst3); + } +} + +__kernel void transpose_0231_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 shape) { + int X = get_global_id(0); // H, W for src + int Y = get_global_id(1); // W4, C4 for src + int Z = get_global_id(2); // C4, H4 for src + if (X >= shape.y || 4 * Y >= shape.z || 4 * Z >= shape.w) { + return; + } + FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(X, Y * shape.w + 4 * Z)); + FLT4 src1 = (FLT4)0.f; + if (4 * Z + 1 < shape.w) { + src1 = READ_IMAGE(src_data, smp_zero, (int2)(X, Y * shape.w + 4 * Z + 1)); + } + FLT4 src2 = (FLT4)0.f; + if (4 * Z + 2 < shape.w) { + src2 = READ_IMAGE(src_data, smp_zero, (int2)(X, Y * shape.w + 4 * Z + 2)); + } + FLT4 src3 = (FLT4)0.f; + if (4 * Z + 3 < shape.w) { + src3 = READ_IMAGE(src_data, smp_zero, (int2)(X, Y * shape.w + 4 * Z + 3)); + } + FLT4 dst0 = (FLT4)(src0.x, src1.x, src2.x, src3.x); + FLT4 dst1 = (FLT4)(src0.y, src1.y, src2.y, src3.y); + FLT4 dst2 = (FLT4)(src0.z, src1.z, src2.z, src3.z); + FLT4 dst3 = (FLT4)(src0.w, src1.w, src2.w, src3.w); + WRITE_IMAGE(dst_data, (int2)(4 * Y, Z * shape.y + X), dst0); + if (4 * Y + 1 < shape.z) { + WRITE_IMAGE(dst_data, (int2)(4 * Y + 1, Z * shape.y + X), dst1); + } + if (4 * Y + 2 < shape.z) { + WRITE_IMAGE(dst_data, (int2)(4 * Y + 2, Z * shape.y + X), dst2); + } + if (4 * Y + 3 < shape.z) { + WRITE_IMAGE(dst_data, (int2)(4 * Y + 3, Z * shape.y + X), dst3); + } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index c05fffbe355..9904a60d16d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -35,20 +35,27 @@ int TransposeOpenCLKernel::Init() { std::string kernel_name = "transpose"; enable_fp16_ = ocl_runtime_->GetFp16Enable(); auto param = reinterpret_cast(op_parameter_); + if (in_tensors_[0]->shape().size() != 4 || in_tensors_[0]->shape()[0] > 1) { + MS_LOG(ERROR) << "Transpose only support 4d tensor and n = 1 yet."; + return RET_ERROR; + } if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && param->perm_[3] == 2) { - type = TransposeType::NHWC2NCHW; + kernel_name += "_0312"; + type = TransposeType::AXIS0312; + } else if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && + param->perm_[3] == 1) { + kernel_name += "_0231"; + type = TransposeType::AXIS0231; } else { MS_LOG(ERROR) << "unsupported transpose axes."; return RET_ERROR; } - out_mem_type_ = OpenCLMemType::BUF; - kernel_name += "_" + std::string(EnumNameFormat(op_format_)); - if (out_mem_type_ == OpenCLMemType::BUF) { - kernel_name += "_BUF"; - } else { - kernel_name += "_IMG"; + if (in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) { + // just for input + kernel_name += "_oversize"; } + kernel_name += "_" + std::string(EnumNameFormat(op_format_)); #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else @@ -58,18 +65,10 @@ int TransposeOpenCLKernel::Init() { ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif - if ((in_tensors_[0]->shape()[1] * in_tensors_[0]->shape()[2]) % 4 != 0) { - MS_LOG(ERROR) << "input H * W % 4 != 0 not support!"; - return RET_ERROR; - } in_ori_format_ = in_tensors_[0]->GetFormat(); out_ori_format_ = out_tensors_[0]->GetFormat(); in_tensors_[0]->SetFormat(op_format_); out_tensors_[0]->SetFormat(op_format_); - if (out_mem_type_ == OpenCLMemType::BUF) { - out_ori_format_ = schema::Format::Format_NCHW; - out_tensors_[0]->SetFormat(schema::Format::Format_NCHW); - } MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; @@ -78,20 +77,14 @@ int TransposeOpenCLKernel::Init() { int TransposeOpenCLKernel::ReSize() { return RET_OK; } int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { - size_t im_dst_x, im_dst_y; - int n = out_tensors_[0]->shape()[0]; - int h = out_tensors_[0]->shape()[1]; - int w = out_tensors_[0]->shape()[2]; - int c = out_tensors_[0]->shape()[3]; - if (op_format_ == schema::Format::Format_NHWC4) { - im_dst_x = w * UP_DIV(c, C4NUM); - im_dst_y = n * h; - } else if (op_format_ == schema::Format::Format_NC4HW4) { - im_dst_x = w; - im_dst_y = n * UP_DIV(c, C4NUM) * h; - } else { - MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); - return RET_ERROR; + size_t im_dst_x = 1, im_dst_y = 1; + auto out_shape = out_tensors_[0]->shape(); + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = out_shape[2] * UP_DIV(out_shape[3], C4NUM); // W * C4 + im_dst_y = out_shape[0] * out_shape[1]; // N * H + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = out_shape[2]; // W + im_dst_y = out_shape[0] * UP_DIV(out_shape[3], C4NUM) * out_shape[1]; // N * C4 * H } size_t img_dtype = CL_FLOAT; if (enable_fp16_) { @@ -104,30 +97,26 @@ int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_siz } int TransposeOpenCLKernel::Run() { - // notice: input image2d size = {c/4, h * w} MS_LOG(DEBUG) << this->name() << " Running!"; - std::vector shapex = in_tensors_[0]->shape(); - int h = shapex[1]; - int w = shapex[2]; - int c = shapex[3]; - int c4 = UP_DIV(c, 4); - int hw4 = UP_DIV(h * w, 4); - std::vector local = {16, 16}; - std::vector global = {UP_ROUND(hw4, local[0]), UP_ROUND(c4, local[1])}; + std::vector shapex = out_tensors_[0]->shape(); + size_t n = shapex[0]; // n=1 + size_t h = shapex[1]; + size_t w = shapex[2]; + size_t c = shapex[3]; + size_t c4 = UP_DIV(c, 4); + std::vector local = {}; + std::vector global; + if (type == TransposeType::AXIS0312) { + global = {UP_DIV(h, C4NUM), w, c4}; + } else if (type == TransposeType::AXIS0231) { + global = {h, UP_DIV(w, C4NUM), c4}; + } - cl_int2 HW = {h * w, hw4}; - cl_int2 C = {c, c4}; + cl_int4 shape = {static_cast(n), static_cast(h), static_cast(w), static_cast(c)}; int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); - if (out_mem_type_ == OpenCLMemType::BUF) { - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); - } else { - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - } - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, HW); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, C); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, w); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, h); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape); ocl_runtime_->RunKernel(kernel_, global, local, nullptr); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h index aee866153a5..e5cc2c4238f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h @@ -25,7 +25,7 @@ namespace mindspore::kernel { -enum class TransposeType { NHWC2NCHW, NCHW2NHWC }; +enum class TransposeType { AXIS0312, AXIS0231 }; class TransposeOpenCLKernel : public OpenCLKernel { public: @@ -42,7 +42,7 @@ class TransposeOpenCLKernel : public OpenCLKernel { private: cl::Kernel kernel_; bool enable_fp16_{false}; - TransposeType type{TransposeType::NHWC2NCHW}; + TransposeType type{TransposeType::AXIS0312}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/test/run_test.sh b/mindspore/lite/test/run_test.sh index ea99abf4fc2..33072381954 100755 --- a/mindspore/lite/test/run_test.sh +++ b/mindspore/lite/test/run_test.sh @@ -37,3 +37,12 @@ cp -fr $TEST_DATA_DIR/testPK ./data ./lite-test --gtest_filter="TestSliceOpenCLfp32.Slicefp32CI*" ./lite-test --gtest_filter="TestBatchnormOpenCLCI.Batchnormfp32CI*" +./lite-test --gtest_filter="TestAvgPoolingOpenCL*" +./lite-test --gtest_filter="TestConv2dTransposeOpenCL*" +./lite-test --gtest_filter="TestMatMulOpenCL.MatMul2D*" +./lite-test --gtest_filter="TestMatMulOpenCL.MatMul4D*" +./lite-test --gtest_filter="TestMaxPoolingOpenCL*" +./lite-test --gtest_filter="TestReduceOpenCL*" +./lite-test --gtest_filter="TestReshapeOpenCL*" +./lite-test --gtest_filter="TestSoftmaxOpenCL*" +./lite-test --gtest_filter="TestTransposeOpenCL*" diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc index 2f09536f2c1..a5aab09f34e 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc @@ -55,11 +55,8 @@ void InitAvgPoolingParam(PoolingParameter *param) { void RunTestCaseAvgPooling(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - size_t dtype_size = sizeof(float); - if (enable_fp16) { - ocl_runtime->SetFp16Enable(true); - dtype_size = sizeof(float16_t); - } + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); int n = shape[0]; int h = shape[1]; @@ -67,8 +64,7 @@ void RunTestCaseAvgPooling(const std::vector &shape, void *input_data, void int c = shape[3]; int oh = shape[4]; int ow = shape[5]; - auto param_ptr = std::make_unique(); - auto param = param_ptr.get(); + auto param = static_cast(malloc(sizeof(PoolingParameter))); if (param == nullptr) { MS_LOG(ERROR) << "param create error."; return; @@ -94,7 +90,7 @@ void RunTestCaseAvgPooling(const std::vector &shape, void *input_data, void std::vector outputs{tensor_out}; auto arith_kernel_ptr = std::make_unique(reinterpret_cast(param), inputs, outputs); - auto arith_kernel = arith_kernel_ptr.get(); + auto arith_kernel = arith_kernel_ptr.release(); if (arith_kernel == nullptr) { MS_LOG(ERROR) << "arith_kernel create error."; return; @@ -115,13 +111,17 @@ void RunTestCaseAvgPooling(const std::vector &shape, void *input_data, void pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), - 2e-2); + CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), 2e-2); } else { CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); } - inputs[0]->SetData(nullptr); - outputs[0]->SetData(nullptr); + + for (auto t : inputs) { + t->SetData(nullptr); + } + for (auto t : outputs) { + t->SetData(nullptr); + } MS_LOG(INFO) << "Test AvgPool2d passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc index f7ab7ce0b38..a5225063327 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc @@ -34,11 +34,8 @@ void RunTestCaseConv2dTranspose(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - size_t dtype_size = sizeof(float); - if (enable_fp16) { - ocl_runtime->SetFp16Enable(true); - dtype_size = sizeof(float16_t); - } + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); int pad = shape[0]; int n = shape[1]; @@ -89,8 +86,7 @@ void RunTestCaseConv2dTranspose(const std::vector &shape, void *input_data, } std::vector inputs{tensor_x, tensor_w, tensor_bias}; std::vector outputs{tensor_out}; - auto opParameter_ptr = std::make_unique(); - auto opParameter = opParameter_ptr.get(); + auto opParameter = static_cast(malloc(sizeof(ConvParameter))); if (opParameter == nullptr) { MS_LOG(ERROR) << "opParameter create error."; return; @@ -105,7 +101,7 @@ void RunTestCaseConv2dTranspose(const std::vector &shape, void *input_data, opParameter->output_channel_ = co; auto op_kernel_ptr = std::make_unique( reinterpret_cast(opParameter), inputs, outputs); - auto op_kernel = op_kernel_ptr.get(); + auto op_kernel = op_kernel_ptr.release(); if (op_kernel == nullptr) { MS_LOG(ERROR) << "op_kernel create error."; return; @@ -132,81 +128,16 @@ void RunTestCaseConv2dTranspose(const std::vector &shape, void *input_data, CompareOutput(outputs[0]->data_c(), output_data, n * oh * ow * co, static_cast(1e-5)); } - inputs[0]->SetData(nullptr); - outputs[0]->SetData(nullptr); + for (auto t : inputs) { + t->SetData(nullptr); + } + for (auto t : outputs) { + t->SetData(nullptr); + } lite::opencl::OpenCLRuntime::DeleteInstance(); } -void RunTestCaseConv2dTranspose(const std::vector shape, const std::vector file_path, - bool enable_fp16) { - size_t input_size; - std::string input_path = file_path[0]; - auto input_data = mindspore::lite::ReadFile(input_path.c_str(), &input_size); - if (input_data == nullptr) { - MS_LOG(ERROR) << "input_data load error."; - return; - } - - size_t weight_size; - std::string weight_path = file_path[1]; - auto weight_data = mindspore::lite::ReadFile(weight_path.c_str(), &weight_size); - if (weight_data == nullptr) { - MS_LOG(ERROR) << "weight_data load error."; - return; - } - - size_t bias_size; - std::string bias_path = file_path[2]; - auto bias_data = mindspore::lite::ReadFile(bias_path.c_str(), &bias_size); - if (bias_data == nullptr) { - MS_LOG(ERROR) << "bias_data load error."; - return; - } - size_t output_size; - std::string output_path = file_path[3]; - auto output_data = mindspore::lite::ReadFile(output_path.c_str(), &output_size); - if (output_data == nullptr) { - MS_LOG(ERROR) << "output_data load error."; - return; - } - RunTestCaseConv2dTranspose(shape, input_data, weight_data, bias_data, output_data, enable_fp16); -} - TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32) { - int pad = 0; - int n = 1; - int h = 240; - int w = 240; - int kh = 2; - int kw = 2; - int ci = 128; - int co = 128; - std::vector shape = {pad, n, h, w, kh, kw, ci, co}; - std::vector file_path = {"./test_data/conv2d_transpose/conv2d_transpose_fp32_input.bin", - "./test_data/conv2d_transpose/conv2d_transpose_fp32_weight.bin", - "./test_data/conv2d_transpose/conv2d_transpose_fp32_bias.bin", - "./test_data/conv2d_transpose/conv2d_transpose_fp32_output.bin"}; - RunTestCaseConv2dTranspose(shape, file_path, false); -} - -TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16) { - int pad = 0; - int n = 1; - int h = 240; - int w = 240; - int kh = 2; - int kw = 2; - int ci = 128; - int co = 128; - std::vector shape = {pad, n, h, w, kh, kw, ci, co}; - std::vector file_path = {"./test_data/conv2d_transpose/conv2d_transpose_fp16_input.bin", - "./test_data/conv2d_transpose/conv2d_transpose_fp16_weight.bin", - "./test_data/conv2d_transpose/conv2d_transpose_fp16_bias.bin", - "./test_data/conv2d_transpose/conv2d_transpose_fp16_output.bin"}; - RunTestCaseConv2dTranspose(shape, file_path, true); -} - -TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32_2) { int pad = 0; int n = 1; int h = 2; @@ -224,7 +155,7 @@ TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp32_2) { RunTestCaseConv2dTranspose(shape, input_data.data(), weight_data.data(), bias_data.data(), output_data.data(), false); } -TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16_2) { +TEST_F(TestConv2dTransposeOpenCL, Conv2dTransposeFp16) { int pad = 0; int n = 1; int h = 2; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc index b7353499dc3..def820ab90f 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc @@ -33,11 +33,8 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we bool enable_fp16, int dims) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - size_t dtype_size = sizeof(float); - if (enable_fp16) { - ocl_runtime->SetFp16Enable(true); - dtype_size = sizeof(int16_t); - } + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); std::vector input_shape, output_shape, weight_shape; if (dims == 2) { @@ -56,8 +53,7 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we output_shape = {a, b, m, co}; weight_shape = {a, b, co, ci}; } - auto param_ptr = std::make_unique(); - auto param = param_ptr.get(); + auto param = static_cast(malloc(sizeof(MatMulParameter))); if (param == nullptr) { MS_LOG(ERROR) << "param_ptr create error."; return; @@ -93,7 +89,7 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we std::vector outputs{tensor_out}; auto op_kernel_ptr = std::make_unique(reinterpret_cast(param), inputs, outputs, false); - auto op_kernel = op_kernel_ptr.get(); + auto op_kernel = op_kernel_ptr.release(); if (op_kernel == nullptr) { MS_LOG(ERROR) << "op_kernel create error."; return; @@ -114,64 +110,22 @@ void RunTestCaseMatMul(const std::vector &shape, void *input_data, void *we memcpy(inputs[0]->data_c(), input_data, tensor_x->ElementsNum() * dtype_size); pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->data_c(), output_data, tensor_out->ElementsNum(), static_cast(1e-3), - 2e-2); + CompareOutput(outputs[0]->data_c(), output_data, tensor_out->ElementsNum(), static_cast(1e-3), 2e-2); } else { CompareOutput(outputs[0]->data_c(), output_data, tensor_out->ElementsNum(), static_cast(1e-5)); } - tensor_x->SetData(nullptr); - tensor_out->SetData(nullptr); - MS_LOG(INFO) << "TestMatMulFp32 passed"; + for (auto t : inputs) { + t->SetData(nullptr); + } + for (auto t : outputs) { + t->SetData(nullptr); + } + MS_LOG(INFO) << "TestMatMul passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); } -void RunTestCaseMatMul(const std::vector shape, const std::vector file_path, bool enable_fp16) { - size_t input_size; - std::string input_path = file_path[0]; - auto input_data = mindspore::lite::ReadFile(input_path.c_str(), &input_size); - if (input_data == nullptr) { - MS_LOG(ERROR) << "input_data load error."; - return; - } - size_t weight_size; - std::string weight_path = file_path[1]; - auto weight_data = mindspore::lite::ReadFile(weight_path.c_str(), &weight_size); - if (weight_data == nullptr) { - MS_LOG(ERROR) << "weight_data load error."; - return; - } - size_t output_size; - std::string output_path = file_path[2]; - auto output_data = mindspore::lite::ReadFile(output_path.c_str(), &output_size); - if (output_data == nullptr) { - MS_LOG(ERROR) << "output_data load error."; - return; - } - RunTestCaseMatMul(shape, input_data, weight_data, output_data, enable_fp16, 2); -} - -TEST_F(TestMatMulOpenCL, MatMulFp32) { - int ci = 1280; - int co = 1001; - std::vector shape = {ci, co}; - std::vector file_path = {"./test_data/matmul/matmul_fp32_input.bin", - "./test_data/matmul/matmul_fp32_weight.bin", - "./test_data/matmul/matmul_fp32_output.bin"}; - RunTestCaseMatMul(shape, file_path, false); -} - -TEST_F(TestMatMulOpenCL, MatMulFp16) { - int ci = 1280; - int co = 1001; - std::vector shape = {ci, co}; - std::vector file_path = {"./test_data/matmul/matmul_fp16_input.bin", - "./test_data/matmul/matmul_fp16_weight.bin", - "./test_data/matmul/matmul_fp16_output.bin"}; - RunTestCaseMatMul(shape, file_path, true); -} - -TEST_F(TestMatMulOpenCL, MatMulFp32_2) { +TEST_F(TestMatMulOpenCL, MatMul2DFp32) { int ci = 5; int co = 3; std::vector shape = {ci, co}; @@ -182,7 +136,7 @@ TEST_F(TestMatMulOpenCL, MatMulFp32_2) { RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 2); } -TEST_F(TestMatMulOpenCL, MatMulFp16_2) { +TEST_F(TestMatMulOpenCL, MatMul2DFp16) { int ci = 5; int co = 3; std::vector shape = {ci, co}; @@ -193,7 +147,7 @@ TEST_F(TestMatMulOpenCL, MatMulFp16_2) { RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), true, 2); } -TEST_F(TestMatMulOpenCL, MatMulFp32_4D) { +TEST_F(TestMatMulOpenCL, MatMul4DFp32) { int a = 1; int b = 2; int c = 2; @@ -210,7 +164,7 @@ TEST_F(TestMatMulOpenCL, MatMulFp32_4D) { RunTestCaseMatMul(shape, input_data.data(), weight_data.data(), output_data.data(), false, 4); } -TEST_F(TestMatMulOpenCL, MatMulFp16_4D) { +TEST_F(TestMatMulOpenCL, MatMul4DFp16) { int a = 1; int b = 2; int c = 2; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc index cf0d4d4f831..0ca9ce0c9a6 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc @@ -55,11 +55,8 @@ void InitMaxPoolingParam(PoolingParameter *param) { void RunTestCaseMaxPooling(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - size_t dtype_size = sizeof(float); - if (enable_fp16) { - ocl_runtime->SetFp16Enable(true); - dtype_size = sizeof(float16_t); - } + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); int n = shape[0]; int h = shape[1]; @@ -67,8 +64,7 @@ void RunTestCaseMaxPooling(const std::vector &shape, void *input_data, void int c = shape[3]; int oh = shape[4]; int ow = shape[5]; - auto param_ptr = std::make_unique(); - auto param = param_ptr.get(); + auto param = static_cast(malloc(sizeof(PoolingParameter))); if (param == nullptr) { MS_LOG(ERROR) << "param create error."; return; @@ -94,7 +90,7 @@ void RunTestCaseMaxPooling(const std::vector &shape, void *input_data, void std::vector outputs{tensor_out}; auto arith_kernel_ptr = std::make_unique(reinterpret_cast(param), inputs, outputs); - auto arith_kernel = arith_kernel_ptr.get(); + auto arith_kernel = arith_kernel_ptr.release(); if (arith_kernel == nullptr) { MS_LOG(ERROR) << "arith_kernel create error."; return; @@ -115,13 +111,16 @@ void RunTestCaseMaxPooling(const std::vector &shape, void *input_data, void pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), - 2e-2); + CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), 2e-2); } else { CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); } - inputs[0]->SetData(nullptr); - outputs[0]->SetData(nullptr); + for (auto t : inputs) { + t->SetData(nullptr); + } + for (auto t : outputs) { + t->SetData(nullptr); + } MS_LOG(INFO) << "Test MaxPool2d passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc index 0f038a52599..3447689831f 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc @@ -33,14 +33,10 @@ void RunTestCaseReduce(const std::vector &shape, void *input_data, void *ou int reduce_mode) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - size_t dtype_size = sizeof(float); - if (enable_fp16) { - ocl_runtime->SetFp16Enable(true); - dtype_size = sizeof(float16_t); - } + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); - auto param_ptr = std::make_unique(); - auto param = param_ptr.get(); + auto param = static_cast(malloc(sizeof(ReduceParameter))); if (param == nullptr) { MS_LOG(ERROR) << "param_ptr create error."; return; @@ -73,7 +69,7 @@ void RunTestCaseReduce(const std::vector &shape, void *input_data, void *ou std::vector outputs{tensor_out}; auto arith_kernel_ptr = std::make_unique(reinterpret_cast(param), inputs, outputs); - auto arith_kernel = arith_kernel_ptr.get(); + auto arith_kernel = arith_kernel_ptr.release(); if (arith_kernel == nullptr) { MS_LOG(ERROR) << "arith_kernel create error."; return; @@ -94,13 +90,16 @@ void RunTestCaseReduce(const std::vector &shape, void *input_data, void *ou pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), - 2e-2); + CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), 2e-2); } else { CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); } - inputs[0]->SetData(nullptr); - outputs[0]->SetData(nullptr); + for (auto t : inputs) { + t->SetData(nullptr); + } + for (auto t : outputs) { + t->SetData(nullptr); + } MS_LOG(INFO) << "Test Reduce passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc index ce858d3bb00..878474b8e0c 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc @@ -33,11 +33,8 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o bool is_output_2d) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - size_t dtype_size = sizeof(float); - if (enable_fp16) { - ocl_runtime->SetFp16Enable(true); - dtype_size = sizeof(float16_t); - } + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); int n = shape[0]; int h = shape[1]; @@ -55,7 +52,7 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o } std::vector out_shape = {n, oh, ow, c}; if (is_output_2d) { - std::vector out_shape = {n, c}; + out_shape = {n, c}; } auto tensor_out_ptr = std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, @@ -68,7 +65,7 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o std::vector inputs{tensor_x}; std::vector outputs{tensor_out}; auto arith_kernel_ptr = std::make_unique(nullptr, inputs, outputs); - auto arith_kernel = arith_kernel_ptr.get(); + auto arith_kernel = arith_kernel_ptr.release(); if (arith_kernel == nullptr) { MS_LOG(ERROR) << "arith_kernel create error."; return; @@ -89,13 +86,16 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), - 2e-2); + CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), 2e-2); } else { CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); } - inputs[0]->SetData(nullptr); - outputs[0]->SetData(nullptr); + for (auto t : inputs) { + t->SetData(nullptr); + } + for (auto t : outputs) { + t->SetData(nullptr); + } MS_LOG(INFO) << "Test Reshape passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc index 630575908c3..a953c674133 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc @@ -32,11 +32,8 @@ class TestSoftmaxOpenCL : public mindspore::CommonTest { void RunTestCaseSoftmax(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - size_t dtype_size = sizeof(float); - if (enable_fp16) { - ocl_runtime->SetFp16Enable(true); - dtype_size = sizeof(float16_t); - } + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); int n, h, w, c; bool is_2d = false; @@ -72,7 +69,7 @@ void RunTestCaseSoftmax(const std::vector &shape, void *input_data, void *o std::vector inputs{tensor_x}; std::vector outputs{tensor_out}; auto arith_kernel_ptr = std::make_unique(nullptr, inputs, outputs); - auto arith_kernel = arith_kernel_ptr.get(); + auto arith_kernel = arith_kernel_ptr.release(); if (arith_kernel == nullptr) { MS_LOG(ERROR) << "arith_kernel create error."; return; @@ -93,13 +90,16 @@ void RunTestCaseSoftmax(const std::vector &shape, void *input_data, void *o pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), - 2e-2); + CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), 2e-2); } else { CompareOutput(outputs[0]->data_c(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); } - inputs[0]->SetData(nullptr); - outputs[0]->SetData(nullptr); + for (auto t : inputs) { + t->SetData(nullptr); + } + for (auto t : outputs) { + t->SetData(nullptr); + } MS_LOG(INFO) << "Test Softmax passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc index f81dbc530b9..97424d51c89 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc @@ -32,13 +32,9 @@ class TestTransposeOpenCL : public mindspore::CommonTest { void RunTestTranspose(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - size_t dtype_size = sizeof(float); - if (enable_fp16) { - ocl_runtime->SetFp16Enable(true); - dtype_size = sizeof(float16_t); - } - auto param_ptr = std::make_unique(); - auto param = param_ptr.get(); + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); + auto param = static_cast(malloc(sizeof(TransposeParameter))); if (param == nullptr) { MS_LOG(ERROR) << "param_ptr create error."; return; @@ -73,7 +69,7 @@ void RunTestTranspose(const std::vector &shape, void *input_data, void *out std::vector outputs{tensor_out}; auto arith_kernel_ptr = std::make_unique(reinterpret_cast(param), inputs, outputs); - auto arith_kernel = arith_kernel_ptr.get(); + auto arith_kernel = arith_kernel_ptr.release(); if (arith_kernel == nullptr) { MS_LOG(ERROR) << "arith_kernel create error."; return; @@ -99,8 +95,12 @@ void RunTestTranspose(const std::vector &shape, void *input_data, void *out CompareOutput(outputs[0]->data_c(), output_data, h * w * c, static_cast(1e-5)); } - inputs[0]->SetData(nullptr); - outputs[0]->SetData(nullptr); + for (auto t : inputs) { + t->SetData(nullptr); + } + for (auto t : outputs) { + t->SetData(nullptr); + } MS_LOG(INFO) << "Test TransposeFp32 passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h index 82d0bc11fb5..047b22d593d 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h @@ -34,23 +34,26 @@ void CompareOutput(void *output, void *expect, size_t elem_num, T atol, float rt T *output_data = reinterpret_cast(output); T *expect_data = reinterpret_cast(expect); - printf("output[0:12]:"); + std::cout << std::setprecision(5) << std::setiosflags(std::ios::fixed) << std::setw(7); + std::cout << "output[0:12]:"; for (int i = 0; i < 12 && i < elem_num; i++) { - printf("[%d]:%.3f ", i, output_data[i]); + std::cout << output_data[i] << " "; } - printf("\n"); - printf("expect[0:12]:"); + std::cout << std::endl; + std::cout << "expect[0:12]:"; for (int i = 0; i < 12 && i < elem_num; i++) { - printf("[%d]:%.3f ", i, expect_data[i]); + std::cout << expect_data[i] << " "; } - printf("\n"); + std::cout << std::endl; for (int i = 0; i < elem_num; ++i) { - if (std::fabs(output_data[i] - expect_data[i]) > atol + rtol * std::fabs(expect_data[i])) { - printf("error at idx[%d] expect=%.3f output=%.3f \n", i, expect_data[i], output_data[i]); - return; + auto left = static_cast(std::fabs(output_data[i] - expect_data[i])); + auto right = static_cast(atol + rtol * std::fabs(expect_data[i])); + if (left > right) { + std::cout << "error at idx[" << i << "] expect=" << expect_data[i] << " output=" << output_data[i] << std::endl; } + ASSERT_LE(left, right); } - printf("compare success!\n"); + std::cout << "compare success!" << std::endl; } template