From 748a595c17288be65ebc9aff729693c5107b3b2b Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Thu, 3 Sep 2020 16:33:48 +0800 Subject: [PATCH] op support NC4HW4 --- .../runtime/kernel/opencl/cl/avg_pool2d.cl | 42 ++++- .../kernel/opencl/cl/conv2d_transpose2x2.cl | 62 ++++++- .../src/runtime/kernel/opencl/cl/matmul.cl | 32 +++- .../runtime/kernel/opencl/cl/max_pool2d.cl | 34 +++- .../src/runtime/kernel/opencl/cl/reshape.cl | 16 +- .../src/runtime/kernel/opencl/cl/softmax.cl | 103 ++++++++++- .../src/runtime/kernel/opencl/cl/transpose.cl | 45 ++++- .../kernel/opencl/kernel/conv2d_transpose.cc | 40 +++-- .../runtime/kernel/opencl/kernel/matmul.cc | 26 ++- .../runtime/kernel/opencl/kernel/pooling2d.cc | 35 ++-- .../runtime/kernel/opencl/kernel/reshape.cc | 34 ++-- .../runtime/kernel/opencl/kernel/softmax.cc | 20 +-- .../runtime/kernel/opencl/kernel/to_format.cc | 10 +- .../runtime/kernel/opencl/kernel/transpose.cc | 35 +++- .../runtime/kernel/opencl/kernel/transpose.h | 1 - .../kernel/opencl/avg_pooling_tests.cc | 151 ++++++++-------- .../kernel/opencl/max_pooling_tests.cc | 161 +++++++++++------- .../runtime/kernel/opencl/reshape_tests.cc | 74 ++++++-- .../runtime/kernel/opencl/transpose_tests.cc | 4 +- 19 files changed, 678 insertions(+), 247 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl index 7728a102109..0df90fe08d8 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl @@ -1,6 +1,7 @@ #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif +#define divide_no_check(a, b) (a / b) __kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { // axis to dst tensor coordinate @@ -34,8 +35,9 @@ __kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, cons __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, - const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { +__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, + const int4 output_shape, const int2 stride, const int2 kernel_size, + const int2 padding) { // axis to dst tensor coordinate int X = get_global_id(0); int Y = get_global_id(1); @@ -57,10 +59,42 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d for (int kx = 0; kx < kernel_size.x; ++kx) { int x_c = xs + kx; bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; - r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f); + r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f); window_size += !outside ? 1.0f : 0.0f; } } - FLT4 result = TO_FLT4(r / window_size); + FLT4 result = TO_FLT4(divide_no_check(r, window_size)); WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); } + +__kernel void AvgPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, + const int4 input_shape, const int4 output_shape, const int2 stride, + const int2 kernel_size, const int2 padding) { + // axis to dst tensor coordinate + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + + // boundary check + if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { + return; + } + + FLT4 r = (FLT4)(0.0f); + FLT window_size = 0.0f; + int xs = X * stride.x - padding.x; + int ys = Y * stride.y - padding.y; + + for (int ky = 0; ky < kernel_size.y; ++ky) { + int y_c = ys + ky; + bool outside_y = y_c < 0 || y_c >= input_shape.y; + for (int kx = 0; kx < kernel_size.x; ++kx) { + int x_c = xs + kx; + bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; + r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c, Z * input_shape.x + x_c)) : (FLT4)(0.0f); + window_size += !outside ? 1.0f : 0.0f; + } + } + FLT4 result = TO_FLT4(divide_no_check(r, window_size)); + WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), result); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl index eea2139ee2f..ac76b8742f6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl @@ -1,8 +1,8 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases, - __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding, - int4 src_size, int4 dst_size) { +__kernel void conv2d_transpose2x2_NHWC4(__read_only image2d_t src_data, __global FLT16 *weight, + __read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size, + int2 stride, int2 padding, int4 src_size, int4 dst_size) { int h = get_global_id(0); int kh = h % 2; int src_h = h / 2; @@ -55,3 +55,59 @@ __kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2); WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3); } + +__kernel void conv2d_transpose2x2_NC4HW4(__read_only image2d_t src_data, __global FLT16 *weight, + __read_only image2d_t biases, __write_only image2d_t dst_data, + int2 kernel_size, int2 stride, int2 padding, int4 src_size, int4 dst_size) { + int h = get_global_id(0); + int kh = h % 2; + int src_h = h / 2; + src_h = src_h * 2; + int w = get_global_id(1); + int kw = w % 2; + int src_w = w / 2; + src_w = src_w * 2; + int co = get_global_id(2); + if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return; + FLT4 r0 = (FLT4)(0.f); + FLT4 r1 = (FLT4)(0.f); + FLT4 r2 = (FLT4)(0.f); + FLT4 r3 = (FLT4)(0.f); + int base_w = (co * 4 + kh * 2 + kw) * src_size.z; + for (int ci = 0; ci < src_size.z; ++ci) { + FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h)); + FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h + 1)); + FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h)); + FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h + 1)); + FLT16 weight_cache = weight[base_w++]; + r0 += x0.x * weight_cache.s0123; + r0 += x0.y * weight_cache.s4567; + r0 += x0.z * weight_cache.s89ab; + r0 += x0.w * weight_cache.scdef; + + r1 += x1.x * weight_cache.s0123; + r1 += x1.y * weight_cache.s4567; + r1 += x1.z * weight_cache.s89ab; + r1 += x1.w * weight_cache.scdef; + + r2 += x2.x * weight_cache.s0123; + r2 += x2.y * weight_cache.s4567; + r2 += x2.z * weight_cache.s89ab; + r2 += x2.w * weight_cache.scdef; + + r3 += x3.x * weight_cache.s0123; + r3 += x3.y * weight_cache.s4567; + r3 += x3.z * weight_cache.s89ab; + r3 += x3.w * weight_cache.scdef; + } + FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0)); + r0 += bias_val; + r1 += bias_val; + r2 += bias_val; + r3 += bias_val; + + WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh), r0); + WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh + 2), r1); + WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh), r2); + WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh + 2), r3); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl index e67c73358e9..300584c2f9a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl @@ -1,7 +1,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, - __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { +__kernel void MatMul_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { int2 gid = (int2)(get_global_id(0), get_global_id(1)); int2 lid = (int2)(get_local_id(0), get_local_id(1)); FLT4 result = (FLT4)(0.0f); @@ -27,3 +27,31 @@ __kernel void MatMul(__read_only image2d_t input, __global FLT16 *weight, __read WRITE_IMAGE(output, (int2)(gid.x, 0), result); } } + +__kernel void MatMul_NC4HW4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, + __write_only image2d_t output, int2 offset_ci, int2 offset_co, int has_bias) { + int2 gid = (int2)(get_global_id(0), get_global_id(1)); + int2 lid = (int2)(get_local_id(0), get_local_id(1)); + FLT4 result = (FLT4)(0.0f); + bool inside = gid.x < offset_co.y; + for (uint i = lid.y; i < offset_ci.y && inside; i += 4) { + FLT4 v = READ_IMAGE(input, smp_zero, (int2)(0, i)); + FLT16 w = weight[gid.x + i * offset_co.y]; + result.x += dot(v, w.s0123); + result.y += dot(v, w.s4567); + result.z += dot(v, w.s89ab); + result.w += dot(v, w.scdef); + } + __local FLT4 temp[64][4]; + temp[lid.x][lid.y] = result; + barrier(CLK_LOCAL_MEM_FENCE); + if (lid.y == 0 && inside) { + result += temp[lid.x][1]; + result += temp[lid.x][2]; + result += temp[lid.x][3]; + if (has_bias != 0) { + result += READ_IMAGE(bias, smp_zero, (int2)(gid.x, 0)); + } + WRITE_IMAGE(output, (int2)(0, gid.x), result); + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl index f7b25423608..23121428fb2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl @@ -36,8 +36,9 @@ __kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, cons __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; -__kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, - const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { +__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, + const int4 output_shape, const int2 stride, const int2 kernel_size, + const int2 padding) { // axis to dst tensor coordinate int X = get_global_id(0); int Y = get_global_id(1); @@ -63,3 +64,32 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d } WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum); } + +__kernel void MaxPooling2d_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, + const int4 input_shape, const int4 output_shape, const int2 stride, + const int2 kernel_size, const int2 padding) { + // axis to dst tensor coordinate + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + + // boundary check + if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { + return; + } + + FLT4 maximum = (FLT4)(-10000.0f); + int xs = X * stride.x - padding.x; + int ys = Y * stride.y - padding.y; + for (int ky = 0; ky < kernel_size.y; ++ky) { + int y_c = ys + ky; + if (y_c < 0 || y_c >= input_shape.y) continue; + for (int kx = 0; kx < kernel_size.x; ++kx) { + int x_c = xs + kx; + if (x_c < 0 || x_c >= input_shape.x) continue; + FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c, Z * input_shape.x + x_c)); + maximum = max(src, maximum); + } + } + WRITE_IMAGE(output, (int2)(Y, Z * output_shape.x + X), maximum); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl index c292a5dee86..1b35e9aca43 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl @@ -1,6 +1,6 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) { +__kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, int4 size_out) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -12,3 +12,17 @@ __kernel void reshape(__read_only image2d_t src_data, __write_only image2d_t dst int iw = out_index % size.y; WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(iw * size.z + Z, ih))); } + +__kernel void reshape_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, + int4 size_out) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size_out.x || Y >= size_out.y || Z >= size_out.z) { + return; + } + int out_index = X * size_out.y + Y; + int ih = out_index / size.y; + int iw = out_index % size.y; + WRITE_IMAGE(dst_data, (int2)(Y, Z * size_out.x + X), READ_IMAGE(src_data, smp_zero, (int2)(iw, Z * size.x + ih))); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl index 08dcc1f2fcf..96b8a035f67 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl @@ -1,9 +1,10 @@ #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif +#define divide_no_check(a, b) (a / b) __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void SoftMax_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) { +__kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) { int X = get_global_id(0); // H int Y = get_global_id(1); // W int H = input_shape.x; @@ -24,7 +25,38 @@ __kernel void SoftMax_BUF(__read_only image2d_t input, __global FLT4 *output, co for (int d = 0; d < S; ++d) { FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); - t = exp(t) / sum; + t = divide_no_check(exp(t), sum); + __global FLT *output_flt = (__global FLT *)output; + output_flt += (X * W + Y) * C + 4 * d; + output_flt[0] = t.x; + if (d * 4 + 1 < C) output_flt[1] += t.y; + if (d * 4 + 2 < C) output_flt[2] += t.z; + if (d * 4 + 3 < C) output_flt[3] += t.w; + } +} + +__kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const int4 input_shape) { + int X = get_global_id(0); // H + int Y = get_global_id(1); // W + int H = input_shape.x; + int W = input_shape.y; + int C = input_shape.z; + int S = input_shape.w; + + if (X >= H || Y >= W) return; + + FLT sum = 0.0f; + for (int d = 0; d < S; ++d) { + FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); + sum += exp(t.x); + if (d * 4 + 1 < C) sum += exp(t.y); + if (d * 4 + 2 < C) sum += exp(t.z); + if (d * 4 + 3 < C) sum += exp(t.w); + } + + for (int d = 0; d < S; ++d) { + FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); + t = divide_no_check(exp(t), sum); __global FLT *output_flt = (__global FLT *)output; output_flt += (X * W + Y) * C + 4 * d; output_flt[0] = t.x; @@ -39,7 +71,7 @@ __kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t ou int Y = get_global_id(1); if (X >= input_shape.x || Y >= input_shape.y) return; - float sum = 0.0f; + FLT sum = 0.0f; for (int d = 0; d < input_shape.w; ++d) { FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X)); sum += exp(t.x); @@ -50,7 +82,7 @@ __kernel void SoftMax_IMG(__read_only image2d_t input, __write_only image2d_t ou for (int d = 0; d < input_shape.w; ++d) { FLT4 t = READ_IMAGE(input, smp_none, (int2)(Y * input_shape.w + d, X)); - t = exp(t) / sum; + t = divide_no_check(exp(t), sum); FLT4 result = TO_FLT4(t); WRITE_IMAGE(output, (int2)(Y * input_shape.w + d, X), result); } @@ -86,7 +118,7 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t sum += dot((FLT4)(1.0f), tmp[5]); sum += dot((FLT4)(1.0f), tmp[6]); sum += dot((FLT4)(1.0f), tmp[7]); - tmpx1[0] = 1.0f / sum; + tmpx1[0] = divide_no_check(1.0f, sum); } barrier(CLK_LOCAL_MEM_FENCE); sum = tmpx1[0]; @@ -104,8 +136,8 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t } while (slices_count < slices_x32); } -__kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, const int slices, - const int slices_x32) { +__kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, + const int slices, const int slices_x32) { int tid = get_local_id(0); FLT sum = 0.0f; for (size_t i = tid; i < slices - 1; i += 32) { @@ -131,7 +163,7 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global FLT4 *output, sum += dot((FLT4)(1.0f), tmp[5]); sum += dot((FLT4)(1.0f), tmp[6]); sum += dot((FLT4)(1.0f), tmp[7]); - tmpx1[0] = 1.0f / sum; + tmpx1[0] = divide_no_check(1.0f, sum); } barrier(CLK_LOCAL_MEM_FENCE); sum = tmpx1[0]; @@ -158,3 +190,58 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global FLT4 *output, } } } + +__kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, + const int slices, const int slices_x32) { + int tid = get_local_id(0); + FLT sum = 0.0f; + for (size_t i = tid; i < slices - 1; i += 32) { + FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i)); + sum += dot((FLT4)(1.0f), exp(src)); + } + if ((slices - 1) % 32 == tid) { + FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); + + sum += dot(TO_FLT4(mask), exp(src)); + } + + __local FLT4 tmp[8]; + __local FLT *tmpx1 = (__local FLT *)tmp; + tmpx1[tid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (tid == 0) { + sum = dot((FLT4)(1.0f), tmp[0]); + sum += dot((FLT4)(1.0f), tmp[1]); + sum += dot((FLT4)(1.0f), tmp[2]); + sum += dot((FLT4)(1.0f), tmp[3]); + sum += dot((FLT4)(1.0f), tmp[4]); + sum += dot((FLT4)(1.0f), tmp[5]); + sum += dot((FLT4)(1.0f), tmp[6]); + sum += dot((FLT4)(1.0f), tmp[7]); + tmpx1[0] = divide_no_check(1.0f, sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + sum = tmpx1[0]; + for (size_t i = tid; i < slices - 1; i += 32) { + FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i)); + result = exp(result) * sum; + output[i] = result; + } + if ((slices - 1) % 32 == tid) { + FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); + result = exp(result) * sum; + __global FLT4 *remain_ptr4 = output; + remain_ptr4 += slices - 1; + __global FLT *remain_ptr = (__global FLT *)remain_ptr4; + remain_ptr[0] = result.x; + if (mask.y > 0.f) { + remain_ptr[1] = result.y; + } + if (mask.z > 0.f) { + remain_ptr[2] = result.z; + } + if (mask.w > 0.f) { + remain_ptr[3] = result.w; + } + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl index ac11eaa1e30..7be8e94b748 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl @@ -43,7 +43,8 @@ __kernel void transpose_IMG(__read_only image2d_t src_data, __write_only image2d WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]); } -__kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W) { +__kernel void transpose_NHWC4_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W, + int H) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= HW.y || Y >= C.y) { @@ -83,3 +84,45 @@ __kernel void transpose_BUF(__read_only image2d_t src_data, global FLT4 *dst_dat if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2]; if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3]; } + +__kernel void transpose_NC4HW4_BUF(__read_only image2d_t src_data, global FLT4 *dst_data, int2 HW, int2 C, int W, + int H) { + int X = get_global_id(0); + int Y = get_global_id(1); + if (X >= HW.y || Y >= C.y) { + return; + } + FLT4 result[4]; + result[0] = (FLT4)(0.0f); + result[1] = (FLT4)(0.0f); + result[2] = (FLT4)(0.0f); + result[3] = (FLT4)(0.0f); + FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X) % W, Y * H + (4 * X) / W)); + FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 1) % W, Y * H + (4 * X + 1) / W)); + FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 2) % W, Y * H + (4 * X + 2) / W)); + FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((4 * X + 3) % W, Y * H + (4 * X + 3) / W)); + result[0].x = x0.x; + result[0].y = x1.x; + result[0].z = x2.x; + result[0].w = x3.x; + + result[1].x = x0.y; + result[1].y = x1.y; + result[1].z = x2.y; + result[1].w = x3.y; + + result[2].x = x0.z; + result[2].y = x1.z; + result[2].z = x2.z; + result[2].w = x3.z; + + result[3].x = x0.w; + result[3].y = x1.w; + result[3].z = x2.w; + result[3].w = x3.w; + + if (4 * Y < C.x) dst_data[4 * Y * HW.y + X] = result[0]; + if (4 * Y + 1 < C.x) dst_data[(4 * Y + 1) * HW.y + X] = result[1]; + if (4 * Y + 2 < C.x) dst_data[(4 * Y + 2) * HW.y + X] = result[2]; + if (4 * Y + 3 < C.x) dst_data[(4 * Y + 3) * HW.y + X] = result[3]; +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index e79349bf5f5..27106da1fd2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc @@ -40,7 +40,7 @@ int Conv2dTransposeOpenCLKernel::Init() { MS_LOG(ERROR) << "only support pad =0."; return RET_ERROR; } - std::string kernel_name = "conv2d_transpose2x2"; + std::string kernel_name = "conv2d_transpose2x2_" + std::string(EnumNameFormat(op_format_)); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); enable_fp16_ = ocl_runtime->GetFp16Enable(); #ifdef PROGRAM_WITH_IL @@ -54,9 +54,9 @@ int Conv2dTransposeOpenCLKernel::Init() { #endif PadWeight(); in_ori_format_ = in_tensors_[0]->GetFormat(); - in_tensors_[0]->SetFormat(schema::Format_NHWC4); + in_tensors_[0]->SetFormat(op_format_); out_ori_format_ = out_tensors_[0]->GetFormat(); - out_tensors_[0]->SetFormat(schema::Format_NHWC4); + out_tensors_[0]->SetFormat(op_format_); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } @@ -142,8 +142,20 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { int Conv2dTransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; - im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); - im_dst_y = out_tensors_[0]->Height(); + int n = out_tensors_[0]->shape()[0]; + int h = out_tensors_[0]->shape()[1]; + int w = out_tensors_[0]->shape()[2]; + int c = out_tensors_[0]->shape()[3]; + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = w * UP_DIV(c, C4NUM); + im_dst_y = n * h; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = w; + im_dst_y = n * UP_DIV(c, C4NUM) * h; + } else { + MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); + return RET_ERROR; + } size_t img_dtype = CL_FLOAT; if (enable_fp16_) { img_dtype = CL_HALF_FLOAT; @@ -156,23 +168,17 @@ int Conv2dTransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *i int Conv2dTransposeOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; - std::vector shapex = in_tensors_[0]->shape(); - int n = shapex[0]; - if (n > 1) { - MS_LOG(ERROR) << " n > 1 not supported!"; - return RET_ERROR; - } ConvParameter *param = reinterpret_cast(op_parameter_); - int ci = in_tensors_[0]->Channel(); - int co = out_tensors_[0]->Channel(); + int ci = in_tensors_[0]->shape()[3]; + int co = out_tensors_[0]->shape()[3]; int co4 = UP_DIV(co, C4NUM); int kh = param->kernel_h_; int kw = param->kernel_w_; int pad = param->pad_u_; - int oh = out_tensors_[0]->Height(); - int ow = out_tensors_[0]->Width(); - int h = in_tensors_[0]->Height(); - int w = in_tensors_[0]->Width(); + int oh = out_tensors_[0]->shape()[1]; + int ow = out_tensors_[0]->shape()[2]; + int h = in_tensors_[0]->shape()[1]; + int w = in_tensors_[0]->shape()[2]; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); // local size should less than MAX_GROUP_SIZE std::vector local = {16, 1, 16}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index a020dd857df..9431e555d5a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -34,6 +34,7 @@ namespace mindspore::kernel { int MatMulOpenCLKernel::Init() { std::string kernel_name = "MatMul"; + kernel_name += "_" + std::string(EnumNameFormat(op_format_)); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); enable_fp16_ = ocl_runtime->GetFp16Enable(); #ifdef PROGRAM_WITH_IL @@ -46,6 +47,10 @@ int MatMulOpenCLKernel::Init() { ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif int ci, co; + if (in_tensors_[1]->shape().size() != 2) { + MS_LOG(ERROR) << "matmul do not support input shape size=" << in_tensors_[1]->shape().size(); + return RET_ERROR; + } if (in_tensors_[1]->shape().size() == 2) { ci = in_tensors_[1]->shape()[1]; co = in_tensors_[1]->shape()[0]; @@ -59,13 +64,8 @@ int MatMulOpenCLKernel::Init() { PadWeight(); in_ori_format_ = in_tensors_[0]->GetFormat(); out_ori_format_ = out_tensors_[0]->GetFormat(); - if (out_tensors_[0]->shape().size() == 2) { - out_tensors_[0]->SetFormat(schema::Format_NC4); - in_tensors_[0]->SetFormat(schema::Format_NC4); - } else { - in_tensors_[0]->SetFormat(schema::Format_NHWC4); - out_tensors_[0]->SetFormat(schema::Format_NHWC4); - } + in_tensors_[0]->SetFormat(op_format_); + out_tensors_[0]->SetFormat(op_format_); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } @@ -142,8 +142,16 @@ void MatMulOpenCLKernel::PadWeight() { int MatMulOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; - im_dst_x = sizeCO.s[1]; - im_dst_y = 1; + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = sizeCO.s[1]; + im_dst_y = 1; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = 1; + im_dst_y = sizeCO.s[1]; + } else { + MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); + return RET_ERROR; + } size_t img_dtype = CL_FLOAT; if (enable_fp16_) { img_dtype = CL_HALF_FLOAT; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 3d87827769d..27e4f214ff3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -64,6 +64,7 @@ int PoolingOpenCLKernel::Init() { #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else + kernel_name += "_" + std::string(EnumNameFormat(op_format_)); if (out_mem_type_ == OpenCLMemType::BUF) { MS_LOG(ERROR) << "buffer output not support yet."; return RET_ERROR; @@ -75,27 +76,38 @@ int PoolingOpenCLKernel::Init() { ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif in_ori_format_ = in_tensors_[0]->GetFormat(); - in_tensors_[0]->SetFormat(schema::Format_NHWC4); out_ori_format_ = out_tensors_[0]->GetFormat(); - out_tensors_[0]->SetFormat(schema::Format_NHWC4); + in_tensors_[0]->SetFormat(op_format_); + out_tensors_[0]->SetFormat(op_format_); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } std::vector PoolingOpenCLKernel::InitGlobalSize() const { - const size_t global_x = out_tensors_[0]->Height(); - const size_t global_y = out_tensors_[0]->Width(); - const size_t global_z = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + const size_t global_x = out_tensors_[0]->shape()[1]; + const size_t global_y = out_tensors_[0]->shape()[2]; + const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); std::vector global = {global_x, global_y, global_z}; return global; } int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { - size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); size_t im_dst_x, im_dst_y; - im_dst_x = out_tensors_[0]->Width() * CO4; - im_dst_y = out_tensors_[0]->Height(); + int n = out_tensors_[0]->shape()[0]; + int h = out_tensors_[0]->shape()[1]; + int w = out_tensors_[0]->shape()[2]; + int c = out_tensors_[0]->shape()[3]; + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = w * UP_DIV(c, C4NUM); + im_dst_y = n * h; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = w; + im_dst_y = n * UP_DIV(c, C4NUM) * h; + } else { + MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); + return RET_ERROR; + } size_t img_dtype = CL_FLOAT; if (enable_fp16_) { img_dtype = CL_HALF_FLOAT; @@ -114,9 +126,10 @@ int PoolingOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - int slices = UP_DIV(out_tensors_[0]->Channel(), C4NUM); - cl_int4 input_shape = {in_tensors_[0]->Height(), in_tensors_[0]->Width(), in_tensors_[0]->Channel(), slices}; - cl_int4 output_shape = {out_tensors_[0]->Height(), out_tensors_[0]->Width(), out_tensors_[0]->Channel(), slices}; + int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); + cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices}; + cl_int4 output_shape = {out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], out_tensors_[0]->shape()[3], + slices}; cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_}; cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_}; cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index a504d6a84ef..d3d21a8199c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -32,14 +32,10 @@ namespace mindspore::kernel { int ReshapeOpenCLKernel::Init() { std::string kernel_name = "reshape"; + kernel_name += "_" + std::string(EnumNameFormat(op_format_)); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); enable_fp16_ = ocl_runtime->GetFp16Enable(); - in_ori_format_ = in_tensors_[0]->GetFormat(); - out_ori_format_ = out_tensors_[0]->GetFormat(); - if (in_ori_format_ != schema::Format_NHWC4 && in_ori_format_ != schema::Format_NHWC) { - MS_LOG(ERROR) << "Reshape input format:" << in_ori_format_ << " not support yet."; - return RET_ERROR; - } + if (in_tensors_[0]->shape().back() != out_tensors_[0]->shape().back()) { MS_LOG(ERROR) << "Reshape input channel " << in_tensors_[0]->shape().back() << " should equal output channel" << out_tensors_[0]->shape().back(); @@ -54,12 +50,10 @@ int ReshapeOpenCLKernel::Init() { ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif - in_tensors_[0]->SetFormat(schema::Format_NHWC4); - out_tensors_[0]->SetFormat(schema::Format_NHWC4); - if (out_tensors_[0]->shape().size() == 2) { - out_ori_format_ = schema::Format_NC; - out_tensors_[0]->SetFormat(schema::Format_NC4); - } + in_ori_format_ = in_tensors_[0]->GetFormat(); + out_ori_format_ = out_tensors_[0]->GetFormat(); + in_tensors_[0]->SetFormat(op_format_); + out_tensors_[0]->SetFormat(op_format_); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } @@ -69,17 +63,27 @@ int ReshapeOpenCLKernel::ReSize() { return RET_OK; } int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; std::vector shapex = out_tensors_[0]->shape(); - int h, w, c; + int n, h, w, c; if (shapex.size() == 2) { + n = shapex[0]; h = w = 1; c = shapex[1]; } else { + n = shapex[0]; h = shapex[1]; w = shapex[2]; c = shapex[3]; } - im_dst_x = w * UP_DIV(c, C4NUM); - im_dst_y = h; + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = w * UP_DIV(c, C4NUM); + im_dst_y = n * h; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = w; + im_dst_y = n * UP_DIV(c, C4NUM) * h; + } else { + MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); + return RET_ERROR; + } size_t img_dtype = CL_FLOAT; if (enable_fp16_) { img_dtype = CL_HALF_FLOAT; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index 39a869bf5a1..9c6a1774c4b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -86,6 +86,7 @@ int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) int SoftmaxOpenCLKernel::Init() { std::string kernel_name = "SoftMax"; std::string program_name = "SoftMax"; + std::string source = softmax_source; runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); enable_fp16_ = runtime_->GetFp16Enable(); @@ -102,6 +103,7 @@ int SoftmaxOpenCLKernel::Init() { MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported shape size: " << in_tensors_[0]->shape().size(); return RET_ERROR; } + kernel_name += "_" + std::string(EnumNameFormat(op_format_)); #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else @@ -124,21 +126,9 @@ int SoftmaxOpenCLKernel::Init() { #endif in_ori_format_ = in_tensors_[0]->GetFormat(); out_ori_format_ = out_tensors_[0]->GetFormat(); - if (in_tensors_[0]->shape().size() == 2) { - in_tensors_[0]->SetFormat(schema::Format_NC4); - } else { - in_tensors_[0]->SetFormat(schema::Format_NHWC4); - } - - if (is_image_out_) { - if (out_tensors_[0]->shape().size() == 2) { - out_ori_format_ = schema::Format_NC; - out_tensors_[0]->SetFormat(schema::Format_NC4); - } else { - out_ori_format_ = schema::Format_NHWC; - out_tensors_[0]->SetFormat(schema::Format_NHWC4); - } - } else { + in_tensors_[0]->SetFormat(op_format_); + out_tensors_[0]->SetFormat(op_format_); + if (!is_image_out_) { out_tensors_[0]->SetFormat(out_ori_format_); } MS_LOG(DEBUG) << kernel_name << " Init Done!"; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index 94dd6dc66e3..8d0e042cf95 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -65,6 +65,14 @@ int ToFormatOpenCLKernel::Init() { int ToFormatOpenCLKernel::InitNHWCShape() { std::vector shapex = out_tensors_[0]->shape(); size_t n, h, w, c; + if (shapex.size() == 2) { + n = shapex[0]; + h = 1; + w = 1; + c = shapex[1]; + nhwc_shape_ = {n, h, w, c}; + return RET_OK; + } if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) { n = shapex[0]; @@ -118,7 +126,7 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size im_dst_x = w * UP_DIV(c, C4NUM); im_dst_y = h; } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { - int c = nhwc_shape_[1]; + int c = nhwc_shape_[3]; im_dst_x = UP_DIV(c, C4NUM); im_dst_y = 1; } else { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index 0d783303576..53cd41f07fd 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -36,7 +36,9 @@ int TransposeOpenCLKernel::Init() { std::string kernel_name = "transpose"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); enable_fp16_ = ocl_runtime->GetFp16Enable(); - if (!is_image_out_) { + out_mem_type_ = OpenCLMemType::BUF; + kernel_name += "_" + std::string(EnumNameFormat(op_format_)); + if (out_mem_type_ == OpenCLMemType::BUF) { kernel_name += "_BUF"; } else { kernel_name += "_IMG"; @@ -50,17 +52,19 @@ int TransposeOpenCLKernel::Init() { ocl_runtime->LoadSource(program_name, source); ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif - if ((in_tensors_[0]->Height() * in_tensors_[0]->Width()) % 4 != 0) { + if ((in_tensors_[0]->shape()[1] * in_tensors_[0]->shape()[2]) % 4 != 0) { MS_LOG(ERROR) << "input H * W % 4 != 0 not support!"; return RET_ERROR; } in_ori_format_ = in_tensors_[0]->GetFormat(); - in_tensors_[0]->SetFormat(schema::Format_NHWC4); - out_ori_format_ = schema::Format_NCHW; - out_tensors_[0]->SetFormat(schema::Format_NCHW); - if (!is_image_out_) { - out_mem_type_ = OpenCLMemType::BUF; + out_ori_format_ = out_tensors_[0]->GetFormat(); + in_tensors_[0]->SetFormat(op_format_); + out_tensors_[0]->SetFormat(op_format_); + if (out_mem_type_ == OpenCLMemType::BUF) { + out_ori_format_ = schema::Format_NCHW; + out_tensors_[0]->SetFormat(schema::Format_NCHW); } + MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } @@ -69,8 +73,20 @@ int TransposeOpenCLKernel::ReSize() { return RET_OK; } int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { size_t im_dst_x, im_dst_y; - im_dst_x = out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Width(), C4NUM); - im_dst_y = out_tensors_[0]->Channel(); + int n = out_tensors_[0]->shape()[0]; + int h = out_tensors_[0]->shape()[1]; + int w = out_tensors_[0]->shape()[2]; + int c = out_tensors_[0]->shape()[3]; + if (op_format_ == schema::Format_NHWC4) { + im_dst_x = w * UP_DIV(c, C4NUM); + im_dst_y = n * h; + } else if (op_format_ == schema::Format_NC4HW4) { + im_dst_x = w; + im_dst_y = n * UP_DIV(c, C4NUM) * h; + } else { + MS_LOG(ERROR) << "not support op format:" << EnumNameFormat(op_format_); + return RET_ERROR; + } size_t img_dtype = CL_FLOAT; if (enable_fp16_) { img_dtype = CL_HALF_FLOAT; @@ -102,6 +118,7 @@ int TransposeOpenCLKernel::Run() { ocl_runtime->SetKernelArg(kernel_, arg_idx++, HW); ocl_runtime->SetKernelArg(kernel_, arg_idx++, C); ocl_runtime->SetKernelArg(kernel_, arg_idx++, w); + ocl_runtime->SetKernelArg(kernel_, arg_idx++, h); ocl_runtime->RunKernel(kernel_, global, local, nullptr); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h index 698ef9e7355..4576bd066c6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h @@ -38,7 +38,6 @@ class TransposeOpenCLKernel : public OpenCLKernel { private: cl::Kernel kernel_; - bool is_image_out_{false}; bool enable_fp16_{false}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc index f74cd7cfdab..31fa220ca32 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc @@ -21,6 +21,7 @@ #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" namespace mindspore { @@ -51,97 +52,107 @@ void InitAvgPoolingParam(PoolingParameter *param) { param->pool_mode_ = PoolMode_AvgPool; } -TEST_F(TestAvgPoolingOpenCL, AvgPoolFp32) { - MS_LOG(INFO) << "start TEST_F TestPoolingOpenCL"; +void RunTestCaseAvgPooling(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); - - MS_LOG(INFO) << "create PoolingParameter"; - auto param = new (std::nothrow) PoolingParameter(); + size_t dtype_size = sizeof(float); + if (enable_fp16) { + ocl_runtime->SetFp16Enable(true); + dtype_size = sizeof(float16_t); + } + auto allocator = ocl_runtime->GetAllocator(); + int n = shape[0]; + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + int oh = shape[4]; + int ow = shape[5]; + auto param_ptr = std::make_unique(); + auto param = param_ptr.get(); + if (param == nullptr) { + MS_LOG(ERROR) << "param create error."; + return; + } InitAvgPoolingParam(param); - - MS_LOG(INFO) << "create Tensors"; - std::vector shape_in = { - param->input_batch_, - param->input_h_, - param->input_w_, - param->input_channel_, - }; - std::vector shape_out = { - param->output_batch_, - param->output_h_, - param->output_w_, - param->output_channel_, - }; - auto data_type = kNumberTypeFloat32; - auto tensorType = schema::NodeType_ValueNode; - lite::tensor::Tensor *tensor_in = - new (std::nothrow) lite::tensor::Tensor(data_type, shape_in, schema::Format_NHWC, tensorType); - lite::tensor::Tensor *tensor_out = - new (std::nothrow) lite::tensor::Tensor(data_type, shape_out, schema::Format_NHWC, tensorType); - if (tensor_in == nullptr) { - MS_LOG(ERROR) << "tensor_in null"; + std::vector input_shape = {n, h, w, c}; + auto tensor_x_ptr = std::make_unique( + TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NHWC); + auto tensor_x = tensor_x_ptr.get(); + if (tensor_x == nullptr) { + MS_LOG(ERROR) << "tensor_x create error."; return; } + std::vector out_shape = {n, oh, ow, c}; + auto tensor_out_ptr = std::make_unique( + TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NHWC); + auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { - MS_LOG(ERROR) << "tensor_out null"; + MS_LOG(ERROR) << "tensor_out create error."; return; } - std::vector inputs{tensor_in}; + std::vector inputs{tensor_x}; std::vector outputs{tensor_out}; - - MS_LOG(INFO) << "create OpenCL Kernel"; - auto *pooling_kernel = - new (std::nothrow) kernel::PoolingOpenCLKernel(reinterpret_cast(param), inputs, outputs); - if (pooling_kernel == nullptr) { - MS_LOG(ERROR) << "pooling_kernel null"; + auto arith_kernel_ptr = + std::make_unique(reinterpret_cast(param), inputs, outputs); + auto arith_kernel = arith_kernel_ptr.get(); + if (arith_kernel == nullptr) { + MS_LOG(ERROR) << "arith_kernel create error."; return; } - pooling_kernel->Init(); - std::vector kernels{pooling_kernel}; + arith_kernel->Init(); - MS_LOG(INFO) << "create SubGraphOpenCLKernel"; - auto *pGraph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + inputs[0]->MallocData(allocator); + + std::vector kernels{arith_kernel}; + auto pGraph_ptr = std::make_unique(inputs, outputs, kernels, kernels, kernels); + auto pGraph = pGraph_ptr.get(); if (pGraph == nullptr) { - MS_LOG(ERROR) << "pGraph null"; + MS_LOG(ERROR) << "pGraph create error."; return; } pGraph->Init(); - - MS_LOG(INFO) << "initialize data"; - std::vector tensor_map = {tensor_in}; - for (auto &tensor_file : tensor_map) { - auto tensor = tensor_file; - size_t size = tensor->Size(); - const float data[16] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, - 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; - memcpy(tensor->Data(), data, size); - } - - MS_LOG(INFO) << "pGraph->Run()"; + memcpy(inputs[0]->Data(), input_data, inputs[0]->ElementsNum() * dtype_size); pGraph->Run(); - MS_LOG(INFO) << "==================output data================="; - float *output_data = reinterpret_cast(tensor_out->Data()); - printf("output:"); - for (int i = 0; i < 4; i++) { - printf("%.3f ", output_data[i]); + if (enable_fp16) { + CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), 2e-2); + } else { + CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); } - printf("\n"); - float expect[4] = {2.0f, 3.0f, 4.0f, 5.0f}; + inputs[0]->SetData(nullptr); + outputs[0]->SetData(nullptr); - for (int i = 0; i < tensor_out->ElementsNum(); ++i) - if (std::fabs(output_data[i] - expect[i]) > 1e-5) { - printf("idx[%d] except=%.3f output=%.3f, ", i, expect[i], output_data[i]); - } - printf("test all close OK!\n"); - lite::CompareOutputData(output_data, expect, 4); - delete tensor_in; - delete tensor_out; - delete pooling_kernel; - delete pGraph; - delete param; + MS_LOG(INFO) << "Test AvgPool2d passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); } +TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 4; + int oh = 1; + int ow = 1; + std::vector shape = {n, h, w, c, oh, ow}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + std::vector output_data = {6.0f, 7.0f, 8.0f, 9.0f}; + + RunTestCaseAvgPooling(shape, input_data.data(), output_data.data(), false); +} + +TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 4; + int oh = 1; + int ow = 1; + std::vector shape = {n, h, w, c, oh, ow}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + std::vector output_data = {6.0f, 7.0f, 8.0f, 9.0f}; + + RunTestCaseAvgPooling(shape, input_data.data(), output_data.data(), true); +} } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc index 321b1ccd061..fe453a0ca1f 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc @@ -13,10 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - +#include #include #include "utils/log_adapter.h" #include "common/common_test.h" +#include "mindspore/lite/src/common/file_utils.h" #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h" @@ -26,98 +27,132 @@ namespace mindspore { class TestMaxPoolingOpenCL : public mindspore::CommonTest {}; -void InitParameter(PoolingParameter *param) { +void InitMaxPoolingParam(PoolingParameter *param) { + param->input_batch_ = 1; + param->input_h_ = 2; + param->input_w_ = 2; + param->input_channel_ = 4; + + param->output_batch_ = 1; + param->output_h_ = 1; + param->output_w_ = 1; + param->output_channel_ = 4; + param->window_h_ = 2; param->window_w_ = 2; + param->stride_h_ = 2; param->stride_w_ = 2; + param->pad_u_ = 0; param->pad_d_ = 0; param->pad_l_ = 0; param->pad_r_ = 0; + param->pool_mode_ = PoolMode_MaxPool; } -TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) { - MS_LOG(INFO) << "ocl runtime"; +void RunTestCaseMaxPooling(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); + size_t dtype_size = sizeof(float); + if (enable_fp16) { + ocl_runtime->SetFp16Enable(true); + dtype_size = sizeof(float16_t); + } auto allocator = ocl_runtime->GetAllocator(); - - MS_LOG(INFO) << "PoolingParameter"; - auto param = new (std::nothrow) PoolingParameter; - InitParameter(param); - - // define tensor - MS_LOG(INFO) << "define tensor1"; - std::vector input_shape = {1, 16, 256, 192}; - std::vector output_shape = {1, 8, 128, 192}; - auto data_type = kNumberTypeFloat32; - auto tensorType = schema::NodeType_ValueNode; - MS_LOG(INFO) << "define tensor2"; - auto input_tensor = new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC4, tensorType); - auto output_tensor = - new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensorType); - if (input_tensor == nullptr) { - MS_LOG(ERROR) << "input_tensor null"; + int n = shape[0]; + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + int oh = shape[4]; + int ow = shape[5]; + auto param_ptr = std::make_unique(); + auto param = param_ptr.get(); + if (param == nullptr) { + MS_LOG(ERROR) << "param create error."; return; } - if (output_tensor == nullptr) { - MS_LOG(ERROR) << "output_tensor null"; + InitMaxPoolingParam(param); + std::vector input_shape = {n, h, w, c}; + auto tensor_x_ptr = std::make_unique( + TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NHWC); + auto tensor_x = tensor_x_ptr.get(); + if (tensor_x == nullptr) { + MS_LOG(ERROR) << "tensor_x create error."; return; } - MS_LOG(INFO) << "define input"; - std::vector inputs{input_tensor}; - std::vector outputs{output_tensor}; - - // run - MS_LOG(INFO) << "pooling_kernel"; - auto *pooling_kernel = - new (std::nothrow) kernel::PoolingOpenCLKernel(reinterpret_cast(param), inputs, outputs); - if (pooling_kernel == nullptr) { - MS_LOG(ERROR) << "pooling_kernel null"; + std::vector out_shape = {n, oh, ow, c}; + auto tensor_out_ptr = std::make_unique( + TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NHWC); + auto tensor_out = tensor_out_ptr.get(); + if (tensor_out == nullptr) { + MS_LOG(ERROR) << "tensor_out create error."; return; } - MS_LOG(INFO) << "pooling_kernel init"; - pooling_kernel->Init(); + std::vector inputs{tensor_x}; + std::vector outputs{tensor_out}; + auto arith_kernel_ptr = + std::make_unique(reinterpret_cast(param), inputs, outputs); + auto arith_kernel = arith_kernel_ptr.get(); + if (arith_kernel == nullptr) { + MS_LOG(ERROR) << "arith_kernel create error."; + return; + } + arith_kernel->Init(); - std::vector kernels{pooling_kernel}; inputs[0]->MallocData(allocator); - auto *pGraph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + + std::vector kernels{arith_kernel}; + auto pGraph_ptr = std::make_unique(inputs, outputs, kernels, kernels, kernels); + auto pGraph = pGraph_ptr.get(); if (pGraph == nullptr) { - MS_LOG(ERROR) << "pGraph null"; + MS_LOG(ERROR) << "pGraph create error."; return; } - MS_LOG(INFO) << "pGraph init"; pGraph->Init(); - - // load data - MS_LOG(INFO) << "load data1"; - std::string input_file = "maxpool_in.bin"; - std::string expect_file = "maxpool_out.bin"; - MS_LOG(INFO) << "load data2"; - LoadTestData(input_tensor->Data(), input_tensor->Size(), input_file); - auto *input_data = reinterpret_cast(input_tensor->Data()); - printf("input[0:10]:"); - for (int i = 0; i < 10; i++) { - printf("[%d]:%.3f ", i, input_data[i]); - } - printf("\n"); - + memcpy(inputs[0]->Data(), input_data, inputs[0]->ElementsNum() * dtype_size); pGraph->Run(); - MS_LOG(INFO) << "compare result"; - std::cout << "compare result" << std::endl; - CompareOutput(output_tensor, expect_file, static_cast(1e-5)); - for (auto tensor : inputs) { - delete tensor; + if (enable_fp16) { + CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), 2e-2); + } else { + CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); } - for (auto tensor : outputs) { - delete tensor; - } - delete pooling_kernel; - delete pGraph; + inputs[0]->SetData(nullptr); + outputs[0]->SetData(nullptr); + + MS_LOG(INFO) << "Test MaxPool2d passed"; lite::opencl::OpenCLRuntime::DeleteInstance(); } +TEST_F(TestMaxPoolingOpenCL, MaxPoolingFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 4; + int oh = 1; + int ow = 1; + std::vector shape = {n, h, w, c, oh, ow}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + std::vector output_data = {12.0f, 13.0f, 14.0f, 15.0f}; + + RunTestCaseMaxPooling(shape, input_data.data(), output_data.data(), false); +} + +TEST_F(TestMaxPoolingOpenCL, MaxPoolingFp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 4; + int oh = 1; + int ow = 1; + std::vector shape = {n, h, w, c, oh, ow}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + std::vector output_data = {12.0f, 13.0f, 14.0f, 15.0f}; + + RunTestCaseMaxPooling(shape, input_data.data(), output_data.data(), true); +} } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc index 6c21bc61c81..72efebd1a33 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc @@ -29,7 +29,8 @@ class TestReshapeOpenCL : public mindspore::CommonTest { TestReshapeOpenCL() {} }; -void RunTestCaseReshape(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16) { +void RunTestCaseReshape(const std::vector &shape, void *input_data, void *output_data, bool enable_fp16, + bool is_output_2d) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); size_t dtype_size = sizeof(float); @@ -38,8 +39,13 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o dtype_size = sizeof(float16_t); } auto allocator = ocl_runtime->GetAllocator(); - int c = shape[0]; - std::vector input_shape = {1, 1, 1, c}; + int n = shape[0]; + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + int oh = shape[4]; + int ow = shape[5]; + std::vector input_shape = {n, h, w, c}; auto tensor_x_ptr = std::make_unique( TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape, schema::Format_NHWC); auto tensor_x = tensor_x_ptr.get(); @@ -47,9 +53,13 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o MS_LOG(ERROR) << "tensor_x create error."; return; } - std::vector out_shape = {1, c}; - auto tensor_out_ptr = std::make_unique( - TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, schema::Format_NC); + std::vector out_shape = {n, oh, ow, c}; + if (is_output_2d) { + std::vector out_shape = {n, c}; + } + auto tensor_out_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape, + is_output_2d ? schema::Format_NC : schema::Format_NHWC); auto tensor_out = tensor_out_ptr.get(); if (tensor_out == nullptr) { MS_LOG(ERROR) << "tensor_out create error."; @@ -75,13 +85,13 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o return; } pGraph->Init(); - memcpy(inputs[0]->Data(), input_data, c * dtype_size); + memcpy(inputs[0]->Data(), input_data, inputs[0]->ElementsNum() * dtype_size); pGraph->Run(); if (enable_fp16) { - CompareOutput(outputs[0]->Data(), output_data, c, static_cast(1e-3), 2e-2); + CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast(1e-3), 2e-2); } else { - CompareOutput(outputs[0]->Data(), output_data, c, static_cast(1e-5)); + CompareOutput(outputs[0]->Data(), output_data, outputs[0]->ElementsNum(), static_cast(1e-5)); } inputs[0]->SetData(nullptr); outputs[0]->SetData(nullptr); @@ -91,20 +101,58 @@ void RunTestCaseReshape(const std::vector &shape, void *input_data, void *o } TEST_F(TestReshapeOpenCL, ReshapeFp32) { + int n = 1; + int h = 1; + int w = 1; int c = 7; - std::vector shape = {c}; + int oh = 1; + int ow = 1; + std::vector shape = {n, h, w, c, oh, ow}; std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - RunTestCaseReshape(shape, input_data.data(), output_data.data(), false); + RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, true); } TEST_F(TestReshapeOpenCL, ReshapeFp16) { + int n = 1; + int h = 1; + int w = 1; int c = 7; - std::vector shape = {c}; + int oh = 1; + int ow = 1; + std::vector shape = {n, h, w, c, oh, ow}; std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; - RunTestCaseReshape(shape, input_data.data(), output_data.data(), true); + RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, true); +} + +TEST_F(TestReshapeOpenCL, Reshape4DFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + int oh = 1; + int ow = 4; + std::vector shape = {n, h, w, c, oh, ow}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + + RunTestCaseReshape(shape, input_data.data(), output_data.data(), false, false); +} + +TEST_F(TestReshapeOpenCL, Reshape4DFp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + int oh = 1; + int ow = 4; + std::vector shape = {n, h, w, c, oh, ow}; + std::vector input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + + RunTestCaseReshape(shape, input_data.data(), output_data.data(), true, false); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc index c0dbcf3b225..dcadb543125 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc @@ -94,8 +94,8 @@ void RunTestTranspose(const std::vector &shape, void *input_data, void *out } TEST_F(TestTransposeOpenCL, TransposeFp32) { - int h = 64; - int w = 1; + int h = 1; + int w = 64; int c = 7360; std::vector shape = {h, w, c}; size_t input_size;