diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
deleted file mode 100644
index 30bde691805..00000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif
-#define divide_no_check(a, b) (a / b)
-__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
-                                     const int4 output_shape, const int2 stride, const int2 kernel_size,
-                                     const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(2);
-  int Y = get_global_id(1);
-  int Z = get_global_id(0);
-
-  // boundary check
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 r = (FLT4)(0.0f);
-  FLT window_size = 0.0f;
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = ys + ky;
-    bool outside_y = y_c < 0 || y_c >= input_shape.y;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = xs + kx;
-      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
-      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
-      window_size += !outside ? 1.0f : 0.0f;
-    }
-  }
-  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
-  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
-}
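The deleted average-pooling kernel reappears unchanged in the merged pooling2d.cl later in this patch. Its one subtlety is that window_size counts only the taps that land inside the input, so the zero padding contributed by the clamping sampler never dilutes the average. A scalar C++ sketch of the same logic (function and variable names are illustrative, not part of this patch):

#include <vector>

// Reference average pooling at one output position (oy, ox): accumulate only
// in-bounds taps and divide by the number of valid taps, exactly like the
// AvgPooling2d_NHWC4_IMG kernel above.
float AvgPoolAt(const std::vector<std::vector<float>> &in, int oy, int ox, int stride, int pad, int ksize) {
  float sum = 0.0f;
  float window_size = 0.0f;  // valid-tap counter, mirrors the kernel's FLT window_size
  int ys = oy * stride - pad;
  int xs = ox * stride - pad;
  for (int ky = 0; ky < ksize; ++ky) {
    for (int kx = 0; kx < ksize; ++kx) {
      int y = ys + ky, x = xs + kx;
      bool outside = y < 0 || y >= static_cast<int>(in.size()) || x < 0 || x >= static_cast<int>(in[0].size());
      if (!outside) {
        sum += in[y][x];
        window_size += 1.0f;
      }
    }
  }
  return sum / window_size;  // divide by valid taps, not ksize * ksize
}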
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl
index fd5423c298b..591243c7197 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl
@@ -1,119 +1,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void conv2d_transpose2x2_NHWC4(__read_only image2d_t src_data, __global FLT16 *weight,
-                                        __read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size,
-                                        int2 stride, int2 padding, int4 src_size, int4 dst_size) {
-  int h = get_global_id(0);
-  int kh = h % 2;
-  int src_h = h / 2;
-  src_h = src_h * 2;
-  int w = get_global_id(1);
-  int kw = w % 2;
-  int src_w = w / 2;
-  src_w = src_w * 2;
-  int co = get_global_id(2);
-  if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
-  FLT4 r0 = (FLT4)(0.f);
-  FLT4 r1 = (FLT4)(0.f);
-  FLT4 r2 = (FLT4)(0.f);
-  FLT4 r3 = (FLT4)(0.f);
-  int base_w = (co * 4 + kh * 2 + kw) * src_size.z;
-  for (int ci = 0; ci < src_size.z; ++ci) {
-    FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
-    FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
-    FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
-    FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
-    FLT16 weight_cache = weight[base_w++];
-    r0 += x0.x * weight_cache.s0123;
-    r0 += x0.y * weight_cache.s4567;
-    r0 += x0.z * weight_cache.s89ab;
-    r0 += x0.w * weight_cache.scdef;
-
-    r1 += x1.x * weight_cache.s0123;
-    r1 += x1.y * weight_cache.s4567;
-    r1 += x1.z * weight_cache.s89ab;
-    r1 += x1.w * weight_cache.scdef;
-
-    r2 += x2.x * weight_cache.s0123;
-    r2 += x2.y * weight_cache.s4567;
-    r2 += x2.z * weight_cache.s89ab;
-    r2 += x2.w * weight_cache.scdef;
-
-    r3 += x3.x * weight_cache.s0123;
-    r3 += x3.y * weight_cache.s4567;
-    r3 += x3.z * weight_cache.s89ab;
-    r3 += x3.w * weight_cache.scdef;
-  }
-  FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
-  r0 += bias_val;
-  r1 += bias_val;
-  r2 += bias_val;
-  r3 += bias_val;
-
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
-  WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
-}
-
-__kernel void conv2d_transpose2x2_NC4HW4(__read_only image2d_t src_data, __global FLT16 *weight,
-                                         __read_only image2d_t biases, __write_only image2d_t dst_data,
-                                         int2 kernel_size, int2 stride, int2 padding, int4 src_size, int4 dst_size) {
-  int h = get_global_id(0);
-  int kh = h % 2;
-  int src_h = h / 2;
-  src_h = src_h * 2;
-  int w = get_global_id(1);
-  int kw = w % 2;
-  int src_w = w / 2;
-  src_w = src_w * 2;
-  int co = get_global_id(2);
-  if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return;
-  FLT4 r0 = (FLT4)(0.f);
-  FLT4 r1 = (FLT4)(0.f);
-  FLT4 r2 = (FLT4)(0.f);
-  FLT4 r3 = (FLT4)(0.f);
-  int base_w = (co * 4 + kh * 2 + kw) * src_size.z;
-  for (int ci = 0; ci < src_size.z; ++ci) {
-    FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h));
-    FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h + 1));
-    FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h));
-    FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h + 1));
-    FLT16 weight_cache = weight[base_w++];
-    r0 += x0.x * weight_cache.s0123;
-    r0 += x0.y * weight_cache.s4567;
-    r0 += x0.z * weight_cache.s89ab;
-    r0 += x0.w * weight_cache.scdef;
-
-    r1 += x1.x * weight_cache.s0123;
-    r1 += x1.y * weight_cache.s4567;
-    r1 += x1.z * weight_cache.s89ab;
-    r1 += x1.w * weight_cache.scdef;
-
-    r2 += x2.x * weight_cache.s0123;
-    r2 += x2.y * weight_cache.s4567;
-    r2 += x2.z * weight_cache.s89ab;
-    r2 += x2.w * weight_cache.scdef;
-
-    r3 += x3.x * weight_cache.s0123;
-    r3 += x3.y * weight_cache.s4567;
-    r3 += x3.z * weight_cache.s89ab;
-    r3 += x3.w * weight_cache.scdef;
-  }
-  FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0));
-  r0 += bias_val;
-  r1 += bias_val;
-  r2 += bias_val;
-  r3 += bias_val;
-
-  WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh), r0);
-  WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh + 2), r1);
-  WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh), r2);
-  WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh + 2), r3);
-}
-
-__kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __global FLT16 *weight,
-                                     __read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size,
+__kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data,
+                                     __global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size,
                                      int2 stride, int2 padding, int4 src_size, int4 dst_size) {
   int dst_h = get_global_id(0);
   int rem_h = dst_h % stride.x;
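The signature change above is the pattern repeated throughout this patch: the input and output images move to parameter slots 0 and 1, with weights and shape scalars after them. That lets the host bind the constant arguments once at prepare time and rebind only the two I/O slots per run. A compilable sketch of the convention (Kernel, ArgHandle, and SetArg are hypothetical stand-ins for the cl::Kernel / SetKernelArg machinery):

struct Kernel { /* stand-in for cl::Kernel */ };
using ArgHandle = const void *;

void SetArg(Kernel *k, int index, ArgHandle value) { /* clSetKernelArg(...) in the real runtime */ }

// Prepare-time: everything from slot 2 upward is constant for the kernel's lifetime.
void BindConstArgsOnce(Kernel *k, ArgHandle weight, ArgHandle bias, ArgHandle shapes) {
  int arg_idx = 2;  // slots 0 and 1 are reserved for the I/O images
  SetArg(k, arg_idx++, weight);
  SetArg(k, arg_idx++, bias);
  SetArg(k, arg_idx++, shapes);
}

// Run-time: only the tensors that change between invocations are rebound.
void BindIoEachRun(Kernel *k, ArgHandle in, ArgHandle out) {
  SetArg(k, 0, in);
  SetArg(k, 1, out);
}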
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl
index 0714674fb6c..8b0cb160953 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl
@@ -2,8 +2,8 @@
 #define C4NUM 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void FullConnection_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias,
-                                   __write_only image2d_t output, int4 in_shape, int2 out_shape, float act_min,
+__kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
+                                   __read_only image2d_t bias, int4 in_shape, int2 out_shape, float act_min,
                                    float act_max) {
   int gidx = get_global_id(0);  // CO4
   int gidz = get_global_id(2);  // N
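The fullconnection and matmul kernels share one data layout: each FLT16 read from the weight buffer is a 4x4 tile covering four input channels by four output channels, consumed as four dot products. An illustrative C++ analogue of that tile math (names are mine, not from the sources):

#include <array>

using Vec4 = std::array<float, 4>;
using Tile4x4 = std::array<Vec4, 4>;  // rows correspond to s0123, s4567, s89ab, scdef

float Dot(const Vec4 &a, const Vec4 &b) {
  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

// Mirrors "result.x += dot(v, w.s0123); result.y += dot(v, w.s4567); ..." in
// the .cl code: v holds 4 input channels, each tile row produces one of the
// 4 output channels in this CO4 slice.
Vec4 MacTile(Vec4 acc, const Vec4 &v, const Tile4x4 &w) {
  for (int c = 0; c < 4; ++c) acc[c] += Dot(v, w[c]);
  return acc;
}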
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
index 3b9035221d3..04119e2cb03 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
@@ -2,7 +2,7 @@
 #define C4NUM 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+__kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
                               int4 in_shape, int4 out_shape) {
   int gidx = get_global_id(0);  // CO4
   int gidz = get_global_id(2);  // N
@@ -32,37 +32,7 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh
   }
 }
 
-__kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
-                               int4 in_shape, int4 out_shape) {
-  int gidx = get_global_id(0);  // CO4
-  int gidz = get_global_id(2);  // N
-  int lidx = get_local_id(0);
-  int lidy = get_local_id(1);
-  int ci4 = UP_DIV(in_shape.w, C4NUM);
-  int co4 = UP_DIV(out_shape.w, C4NUM);
-  int n = out_shape.z;
-  bool inside = gidx < co4 && gidz < n;
-  FLT4 result = (FLT4)(0.0f);
-  for (uint i = lidy; i < ci4 && inside; i += 4) {
-    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, 0));
-    FLT16 w = weight[i * co4 + gidx];
-    result.x += dot(v, w.s0123);
-    result.y += dot(v, w.s4567);
-    result.z += dot(v, w.s89ab);
-    result.w += dot(v, w.scdef);
-  }
-  __local FLT4 temp[32][4];
-  temp[lidx][lidy] = result;
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if (lidy == 0 && inside) {
-    result += temp[lidx][1];
-    result += temp[lidx][2];
-    result += temp[lidx][3];
-    WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result);
-  }
-}
-
-__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
+__kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
                               int4 in_shape, int4 out_shape) {
   int gidx = get_global_id(0);  // CO4
   int gidy = get_global_id(1);  // N * H * 4
@@ -95,39 +65,3 @@ __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weigh
     WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result);
   }
 }
-
-__kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output,
-                               int4 in_shape, int4 out_shape) {
-  int gidx = get_global_id(0);  // CO4
-  int gidy = get_global_id(1);  // N * H * 4
-  int gidz = get_global_id(2);  // W
-  int lidx = get_local_id(0);
-  int lidy = get_local_id(1);
-  int ci4 = UP_DIV(in_shape.w, C4NUM);
-  int co4 = UP_DIV(out_shape.w, C4NUM);
-  int n = out_shape.x;
-  int h = out_shape.y;
-  int w = out_shape.z;
-  int nh_index = gidy / 4;
-  bool inside = gidx < co4 && gidz < w && nh_index < n * h;
-  int n_index = nh_index / h;
-  int h_index = nh_index % h;
-  FLT4 result = (FLT4)(0.0f);
-  for (uint i = lidy; i < ci4 && inside; i += 4) {
-    FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz, n_index * ci4 * h + i * h + h_index));
-    FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx];
-    result.x += dot(v, weight_value.s0123);
-    result.y += dot(v, weight_value.s4567);
-    result.z += dot(v, weight_value.s89ab);
-    result.w += dot(v, weight_value.scdef);
-  }
-  __local FLT4 temp[32][4];
-  temp[lidx][lidy] = result;
-  barrier(CLK_LOCAL_MEM_FENCE);
-  if (lidy == 0 && inside) {
-    result += temp[lidx][1];
-    result += temp[lidx][2];
-    result += temp[lidx][3];
-    WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result);
-  }
-}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
deleted file mode 100644
index e5f7b54fa23..00000000000
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif
-__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
-                                     const int4 output_shape, const int2 stride, const int2 kernel_size,
-                                     const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(2);
-  int Y = get_global_id(1);
-  int Z = get_global_id(0);
-
-  // boundary check
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 maximum = (FLT4)(-10000.0f);
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = ys + ky;
-    if (y_c < 0 || y_c >= input_shape.y) continue;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = xs + kx;
-      if (x_c < 0 || x_c >= input_shape.x) continue;
-      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
-      maximum = max(src, maximum);
-    }
-  }
-  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
-}
-
-__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
-                                          const int4 input_shape, const int4 output_shape, const int2 stride,
-                                          const int2 kernel_size, const int2 padding) {
-  // axis to dst tensor coordinate
-  int X = get_global_id(2);
-  int Y = get_global_id(1);
-  int Z = get_global_id(0);
-
-  // boundary check
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
-    return;
-  }
-
-  FLT4 maximum = (FLT4)(-10000.0f);
-  int xs = X * stride.x - padding.x;
-  int ys = Y * stride.y - padding.y;
-  for (int ky = 0; ky < kernel_size.y; ++ky) {
-    int y_c = ys + ky;
-    if (y_c < 0 || y_c >= input_shape.y) continue;
-    for (int kx = 0; kx < kernel_size.x; ++kx) {
-      int x_c = xs + kx;
-      if (x_c < 0 || x_c >= input_shape.x) continue;
-      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
-      maximum = max(src, maximum);
-    }
-  }
-  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
-}
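The removed NC4HW4 matmul kernels (and the surviving NHWC4 ones) split the CI4 reduction across four work-items: each lidy lane accumulates every fourth slice, the partials go through local memory and a barrier, and lane 0 folds them. A sequential C++ analogue of that four-way strided reduction (illustrative only):

#include <array>
#include <cstddef>

// partial_products[i] stands for the dot-product contribution of CI4 slice i.
// Lane l (l = lidy) sums indices i = l, l + 4, l + 8, ...; lane 0 then adds
// temp[lidx][1..3] after the barrier -- here folded into one return.
float ReduceStrided(const float *partial_products, std::size_t ci4) {
  std::array<float, 4> partial = {0.0f, 0.0f, 0.0f, 0.0f};
  for (std::size_t i = 0; i < ci4; ++i) {
    partial[i % 4] += partial_products[i];  // "for (uint i = lidy; i < ci4; i += 4)"
  }
  return partial[0] + partial[1] + partial[2] + partial[3];
}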
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/pooling2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/pooling2d.cl
new file mode 100644
index 00000000000..d345617c0c2
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/pooling2d.cl
@@ -0,0 +1,126 @@
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+#define divide_no_check(a, b) (a / b)
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+__kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
+                                     const int4 output_shape, const int2 stride, const int2 kernel_size,
+                                     const int2 padding) {
+  // axis to dst tensor coordinate
+  int X = get_global_id(2);
+  int Y = get_global_id(1);
+  int Z = get_global_id(0);
+
+  // boundary check
+  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
+    return;
+  }
+
+  FLT4 r = (FLT4)(0.0f);
+  FLT window_size = 0.0f;
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
+
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    bool outside_y = y_c < 0 || y_c >= input_shape.y;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
+      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
+      window_size += !outside ? 1.0f : 0.0f;
+    }
+  }
+  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
+}
+
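The new pooling2d.cl merges the former avg_pool2d.cl and max_pool2d.cl into one program containing four kernels: {Avg, Max} x {plain, _ReLU}. The host then only has to compose a kernel name instead of choosing a source file. A sketch of that selection (the suffix order is inferred from the kernel names in this file; the real logic lives in PoolingOpenCLKernel::Prepare()):

#include <string>

std::string PoolingKernelName(bool is_max_pool, bool fuse_relu) {
  std::string name = is_max_pool ? "MaxPooling2d" : "AvgPooling2d";
  if (fuse_relu) name += "_ReLU";  // fused-activation variant defined in pooling2d.cl
  return name + "_NHWC4_IMG";     // data layout + memory-type suffixes appended by the host
}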
+__kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
+                                          const int4 input_shape, const int4 output_shape, const int2 stride,
+                                          const int2 kernel_size, const int2 padding) {
+  // axis to dst tensor coordinate
+  int X = get_global_id(2);
+  int Y = get_global_id(1);
+  int Z = get_global_id(0);
+
+  // boundary check
+  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
+    return;
+  }
+
+  FLT4 r = (FLT4)(0.0f);
+  FLT window_size = 0.0f;
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
+
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    bool outside_y = y_c < 0 || y_c >= input_shape.y;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
+      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
+      window_size += !outside ? 1.0f : 0.0f;
+    }
+  }
+  FLT4 result = TO_FLT4(divide_no_check(r, window_size));
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(result, (FLT4)(0.f)));
+}
+
+__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
+                                     const int4 output_shape, const int2 stride, const int2 kernel_size,
+                                     const int2 padding) {
+  // axis to dst tensor coordinate
+  int X = get_global_id(2);
+  int Y = get_global_id(1);
+  int Z = get_global_id(0);
+
+  // boundary check
+  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
+    return;
+  }
+
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    if (y_c < 0 || y_c >= input_shape.y) continue;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      if (x_c < 0 || x_c >= input_shape.x) continue;
+      FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
+      maximum = max(src, maximum);
+    }
+  }
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
+}
+
+__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
+                                          const int4 input_shape, const int4 output_shape, const int2 stride,
+                                          const int2 kernel_size, const int2 padding) {
+  // axis to dst tensor coordinate
+  int X = get_global_id(2);
+  int Y = get_global_id(1);
+  int Z = get_global_id(0);
+
+  // boundary check
+  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
+    return;
+  }
+
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    if (y_c < 0 || y_c >= input_shape.y) continue;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      if (x_c < 0 || x_c >= input_shape.x) continue;
+      FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
+      maximum = max(src, maximum);
+    }
+  }
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
+}
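The _ReLU variants differ from their plain counterparts only in the final write: max(result, 0). Other kernels in this patch generalize the same epilogue by passing act_min / act_max scalars instead. Both reduce to a clamp, sketched here in C++ (illustrative; the real work happens inside the .cl kernels):

#include <algorithm>

// ReLU is the interval [0, FLT_MAX]; ReLU6 is [0, 6]; "no activation" is
// [-FLT_MAX, FLT_MAX], which leaves x unchanged.
inline float ApplyActivation(float x, float act_min, float act_max) {
  return std::min(std::max(x, act_min), act_max);
}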
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
index 5701ab1da56..8f281297b1c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
@@ -42,30 +42,41 @@ using mindspore::schema::PrimitiveType_Activation;
 
 namespace mindspore::kernel {
 
-int ActivationOpenClKernel::Init() {
-  std::map<int, std::string> kernel_names{
+std::string ActivationOpenCLKernel::GetActTypeString(int act_type) {
+  static std::map<int, std::string> supported_act_type = {
     {ActivationType_LEAKY_RELU, "LeakyRelu"}, {ActivationType_RELU, "Relu"}, {ActivationType_SIGMOID, "Sigmoid"},
     {ActivationType_RELU6, "Relu6"},          {ActivationType_TANH, "Tanh"}, {ActivationType_SWISH, "Swish"},
     {ActivationType_HSWISH, "HSwish"}};
-  if (kernel_names.count(type_) == 0) {
-    MS_LOG(ERROR) << "schema::ActivationType:" << type_ << "not found";
-    return mindspore::lite::RET_ERROR;
+  auto result_iter = supported_act_type.find(act_type);
+  if (result_iter != supported_act_type.end()) {
+    return result_iter->second;
   }
+  return "";
+}
+
+int ActivationOpenCLKernel::CheckSpecs() {
+  if (GetActTypeString(type_).empty()) {
+    MS_LOG(ERROR) << "schema::ActivationType: " << type_ << " not found";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ActivationOpenCLKernel::Prepare() {
   outShape = Image2DInfo(out_tensors_[0]);
-  local_size_ = {};
-  global_size_ = {outShape.width, outShape.height};
   std::string source = activation_source;
   std::set<std::string> build_options;
   std::string program_name = "Activation";
   ocl_runtime_->LoadSource(program_name, source);
-  std::string kernel_name = kernel_names[type_];
+  std::string kernel_name = GetActTypeString(type_);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
-  SetArgs();
+  SetConstArgs();
+  SetGlobalLocal();
   MS_LOG(DEBUG) << kernel_name << " init Done!";
   return mindspore::lite::RET_OK;
 }
 
-int ActivationOpenClKernel::SetArgs() {
+void ActivationOpenCLKernel::SetConstArgs() {
   int arg_idx = 2;
   cl_int2 image_size = {static_cast<int>(outShape.width), static_cast<int>(outShape.height)};
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size);
@@ -78,50 +89,26 @@ int ActivationOpenClKernel::SetArgs() {
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, c4);
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, last_c4);
   }
-  return RET_OK;
 }
 
-int ActivationOpenClKernel::Run() {
-  MS_LOG(DEBUG) << this->name() << " begin running!";
+void ActivationOpenCLKernel::SetGlobalLocal() {
+  local_range_ = cl::NullRange;
+  global_range_ = {outShape.width, outShape.height};
+}
+
+int ActivationOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running!";
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  auto ret = ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
-  if (ret != mindspore::lite::RET_OK) {
+  auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
+  if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail.";
-    return mindspore::lite::RET_ERROR;
+    return RET_ERROR;
   }
-  return mindspore::lite::RET_OK;
+  return RET_OK;
 }
 
-kernel::LiteKernel *OpenClActivationKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                                  const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                                  const mindspore::lite::PrimitiveC *primitive) {
-  if (inputs.empty()) {
-    MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size();
-    return nullptr;
-  }
-  if (inputs[0]->shape().size() > 2 && inputs[0]->shape()[0] > 1) {
-    MS_LOG(ERROR) << "Activation kernel:" << opParameter->name_ << " failed: Unsupported multi-batch.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto *kernel =
-    new (std::nothrow) ActivationOpenClKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "New kernel:" << opParameter->name_ << "is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != mindspore::lite::RET_OK) {
-    MS_LOG(ERROR) << "Init activation kernel:" << opParameter->name_ << " failed!";
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Activation, OpenClActivationKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Activation, OpenClActivationKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Activation, OpenCLKernelCreator<ActivationOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Activation, OpenCLKernelCreator<ActivationOpenCLKernel>)
 }  // namespace mindspore::kernel
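The activation kernel shows the refactor applied to every operator in this patch: the old monolithic Init() is split into a cheap CheckSpecs() validation plus a Prepare() that builds the program and runs the one-time setup hooks. A compilable skeleton of the assumed lifecycle (the base-class driver is not part of this diff, so the call order is inferred from the Prepare() bodies; everything except the five method names is illustrative):

struct SketchOpenCLKernel {
  int CheckSpecs() { return 0; }   // validate shapes/attributes before any GPU work
  int InitWeights() { return 0; }  // pack and upload constant tensors once
  void SetConstArgs() {}           // bind kernel args from slot 2 upward, once
  void SetGlobalLocal() {}         // fill global_range_ / local_range_
  int Prepare() {                  // LoadSource + BuildKernel, then one-time setup
    InitWeights();
    SetConstArgs();
    SetGlobalLocal();
    return 0;
  }
  int Run() { return 0; }          // bind slots 0/1 (I/O) and enqueue the ND-range
};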
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
index d979bb538c2..6bd2ace379c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h
@@ -18,26 +18,30 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ACTIVATION_H_
 
 #include <vector>
+#include <string>
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 #include "nnacl/fp32/activation.h"
 
 namespace mindspore::kernel {
 
-class ActivationOpenClKernel : public OpenCLKernel {
+class ActivationOpenCLKernel : public OpenCLKernel {
  public:
-  ActivationOpenClKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+  ActivationOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                          const std::vector<lite::Tensor *> &outputs)
       : OpenCLKernel(parameter, inputs, outputs),
         type_(reinterpret_cast<ActivationParameter *>(parameter)->type_),
        alpha_(reinterpret_cast<ActivationParameter *>(parameter)->alpha_) {}
-  ~ActivationOpenClKernel() override = default;
+  ~ActivationOpenCLKernel() override = default;
 
-  int Init() override;
   int Run() override;
+  int Prepare() override;
+  int CheckSpecs() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
 
  private:
-  int SetArgs();
+  static std::string GetActTypeString(int act_type);
   cl::Kernel kernel_;
   int type_;
   float alpha_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index 6f84611095c..ce4a8659b96 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -35,41 +35,111 @@ using mindspore::schema::PrimitiveType_Eltwise;
 
 namespace mindspore::kernel {
 
-std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
-  auto out_shape = out_tensors_[0]->shape();
-  if (out_shape.size() == 2) {
-    const size_t global_x = 1;
-    const size_t global_y = 1;
-    const size_t global_z = UP_ROUND_DIV(out_shape[1], C4NUM);
-    std::vector<size_t> global = {global_x, global_y, global_z};
-    return global;
+int ArithmeticOpenCLKernel::CheckSpecs() {
+  auto *arithmetic_parameter = reinterpret_cast<ArithmeticParameter *>(op_parameter_);
+  if (arithmetic_parameter->broadcasting_) {
+    element_flag_ = false;
+    kernel_name_ = "BroadcastNHWC4";
+    if (out_tensors_[0]->shape()[0] > 1) {
+      MS_LOG(ERROR) << "Broadcasting doesn't support N > 1";
+      return RET_ERROR;
+    }
   } else {
-    const size_t global_x = out_shape[2];
-    const size_t global_y = out_shape[1];
-    const size_t global_z = UP_ROUND_DIV(out_shape[3], C4NUM);
-    std::vector<size_t> global = {global_x, global_y, global_z};
-    return global;
+    kernel_name_ = "Element";
   }
+
+  switch (op_parameter_->type_) {
+    case PrimitiveType_Mul:
+      kernel_name_ += "Mul";
+      break;
+    case PrimitiveType_Add:
+      kernel_name_ += "Add";
+      break;
+    case PrimitiveType_Sub:
+      kernel_name_ += "Sub";
+      break;
+    case PrimitiveType_Div:
+      kernel_name_ += "Div";
+      break;
+    case PrimitiveType_LogicalAnd:
+      kernel_name_ += "And";
+      break;
+    case PrimitiveType_LogicalOr:
+      kernel_name_ += "Or";
+      break;
+    case PrimitiveType_Maximum:
+      kernel_name_ += "Max";
+      break;
+    case PrimitiveType_Minimum:
+      kernel_name_ += "Min";
+      break;
+    case PrimitiveType_FloorDiv:
+      kernel_name_ += "FloorDiv";
+      break;
+    case PrimitiveType_FloorMod:
+      kernel_name_ += "FloorMod";
+      break;
+    case PrimitiveType_SquaredDifference:
+      kernel_name_ += "SquaredDifference";
+      break;
+    case PrimitiveType_Equal:
+      kernel_name_ += "Equal";
+      break;
+    case PrimitiveType_NotEqual:
+      kernel_name_ += "NotEqual";
+      break;
+    case PrimitiveType_Less:
+      kernel_name_ += "Less";
+      break;
+    case PrimitiveType_LessEqual:
+      kernel_name_ += "LessEqual";
+      break;
+    case PrimitiveType_Greater:
+      kernel_name_ += "Greater";
+      break;
+    case PrimitiveType_GreaterEqual:
+      kernel_name_ += "GreaterEqual";
+      break;
+    default:
+      MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
+      return RET_ERROR;
+  }
+
+  switch (arithmetic_parameter->activation_type_) {
+    case schema::ActivationType_NO_ACTIVATION:
+      break;
+    case schema::ActivationType_RELU:
+      activation_min_ = 0.f;
+      break;
+    case schema::ActivationType_RELU6:
+      activation_min_ = 0.f;
+      activation_max_ = 6.f;
+      break;
+    default:
+      MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_;
+      return RET_ERROR;
+  }
+  return RET_OK;
 }
 
-void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
+void ArithmeticOpenCLKernel::SetGlobalLocal() {
   if (element_flag_) {
-    local_size_ = {16, 16};
+    local_range_ = {};
     auto out_shape = out_tensors_[0]->shape();
     if (out_shape.size() == 2) {
       size_t H = out_shape[0];
       size_t W = UP_DIV(out_shape[1], C4NUM);
-      global_size_ = {W, H};
+      global_range_ = {W, H};
     } else {
       size_t H = out_shape[0] * out_shape[1];
       size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM);
-      global_size_ = {W, H};
+      global_range_ = {W, H};
    }
   } else {
-    local_size_ = {};
+    local_range_ = {};
     auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
-    global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
-                    static_cast<size_t>(out_shape[1] * out_shape[0])};
+    global_range_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
+                     static_cast<size_t>(out_shape[1] * out_shape[0])};
   }
 }
 
@@ -137,7 +207,7 @@ int ArithmeticOpenCLKernel::InitWeights() {
   return RET_OK;
 }
 
-int ArithmeticOpenCLKernel::SetArgs() {
+void ArithmeticOpenCLKernel::SetConstArgs() {
   int arg_idx = 3;
   if (!element_flag_) {
     cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2],
@@ -157,124 +227,37 @@
     }
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag);
   } else {
-    cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
+    cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])};
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
   }
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
-  return RET_OK;
 }
 
-int ArithmeticOpenCLKernel::Init() {
-  std::string kernel_name;
-  auto *arithmetic_parameter = reinterpret_cast<ArithmeticParameter *>(op_parameter_);
-
-  if (arithmetic_parameter->broadcasting_) {
-    element_flag_ = false;
-    kernel_name = "BroadcastNHWC4";
-    if (out_tensors_[0]->shape()[0] > 1) {
-      MS_LOG(ERROR) << "Broadcasting don't support N > 1";
-      return RET_ERROR;
-    }
-  } else {
-    kernel_name = "Element";
-  }
-
-  switch (op_parameter_->type_) {
-    case PrimitiveType_Mul:
-      kernel_name += "Mul";
-      break;
-    case PrimitiveType_Add:
-      kernel_name += "Add";
-      break;
-    case PrimitiveType_Sub:
-      kernel_name += "Sub";
-      break;
-    case PrimitiveType_Div:
-      kernel_name += "Div";
-      break;
-    case PrimitiveType_LogicalAnd:
-      kernel_name += "And";
-      break;
-    case PrimitiveType_LogicalOr:
-      kernel_name += "Or";
-      break;
-    case PrimitiveType_Maximum:
-      kernel_name += "Max";
-      break;
-    case PrimitiveType_Minimum:
-      kernel_name += "Min";
-      break;
-    case PrimitiveType_FloorDiv:
-      kernel_name += "FloorDiv";
-      break;
-    case PrimitiveType_FloorMod:
-      kernel_name += "FloorMod";
-      break;
-    case PrimitiveType_SquaredDifference:
-      kernel_name += "SquaredDifference";
-      break;
-    case PrimitiveType_Equal:
-      kernel_name += "Equal";
-      break;
-    case PrimitiveType_NotEqual:
-      kernel_name += "NotEqual";
-      break;
-    case PrimitiveType_Less:
-      kernel_name += "Less";
-      break;
-    case PrimitiveType_LessEqual:
-      kernel_name += "LessEqual";
-      break;
-    case PrimitiveType_Greater:
-      kernel_name += "Greater";
-      break;
-    case PrimitiveType_GreaterEqual:
-      kernel_name += "GreaterEqual";
-      break;
-    default:
-      MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
-      return RET_ERROR;
-  }
-
-  switch (arithmetic_parameter->activation_type_) {
-    case schema::ActivationType_NO_ACTIVATION:
-      break;
-    case schema::ActivationType_RELU:
-      activation_min_ = 0.f;
-      break;
-    case schema::ActivationType_RELU6:
-      activation_min_ = 0.f;
-      activation_max_ = 6.f;
-      break;
-    default:
-      MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_;
-      return RET_ERROR;
-  }
-
+int ArithmeticOpenCLKernel::Prepare() {
   lite::STATUS error_code = RET_OK;
 #ifdef PROGRAM_WITH_IL
-  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
+  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_);
 #else
   if (out_mem_type_ == MemType::IMG) {
-    kernel_name += "_IMG";
+    kernel_name_ += "_IMG";
  } else {
-    kernel_name += "_BUF";
+    kernel_name_ += "_BUF";
  }
   std::string program_name = "Arithmetic";
   std::set<std::string> build_options;
   std::string source = arithmetic_source;
   ocl_runtime_->LoadSource(program_name, source);
-  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_, build_options);
 #endif
   if (error_code != RET_OK) {
     return error_code;
   }
-  Image2dGetWorkGroupSize();
+  SetGlobalLocal();
   InitWeights();
-  SetArgs();
-  MS_LOG(DEBUG) << kernel_name << " Init Done!";
+  SetConstArgs();
+  MS_LOG(DEBUG) << kernel_name_ << " Init Done!";
   return RET_OK;
 }
 
@@ -287,64 +270,44 @@ int ArithmeticOpenCLKernel::Run() {
   auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1];
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
   return RET_OK;
 }
 
-kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                                  const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                                  const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel =
-    new (std::nothrow) ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "Create OpenCL Arithmetic kernel failed!";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: Arithmetic";
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLKernelCreator<ArithmeticOpenCLKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
index 6173e66e526..9a5095f2c9e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -18,6 +18,7 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_
 
 #include <vector>
+#include <string>
 #include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h"
 #include "src/runtime/kernel/opencl/opencl_kernel.h"
 
@@ -30,24 +31,21 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
   ~ArithmeticOpenCLKernel() override = default;
 
-  int Init() override;
   int Run() override;
+  int Prepare() override;
+  int CheckSpecs() override;
   int InitWeights() override;
-  int SetArgs();
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
 
 private:
-  std::vector<size_t> InitGlobalSize() const;
-  void Image2dGetWorkGroupSize();
-
   cl::Kernel kernel_;
   bool element_flag_{true};
   float activation_min_{-FLT_MAX};
   float activation_max_{FLT_MAX};
   std::vector<std::vector<int>> inputs_nhwc_shapes_;
   std::vector<void *> inputs_weight_ptrs_;
-
-  std::vector<size_t> local_size_;
-  std::vector<size_t> global_size_;
+  std::string kernel_name_;
 };
 } // namespace mindspore::kernel
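The dozens of near-identical per-op creators deleted in this patch are replaced by one templated OpenCLKernelCreator. Its definition is not part of this diff; a minimal sketch of what such a template can look like, assuming it mirrors the deleted creators and runs CheckSpecs() before committing to the kernel:

template <class T>
kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                        const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                        const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                        const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) T(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
  if (kernel == nullptr) {
    free(opParameter);
    return nullptr;
  }
  // Cheap validation before any GPU resources are allocated (assumed; the
  // real creator lives in opencl_kernel.h and may do more wiring than this).
  if (kernel->CheckSpecs() != RET_OK) {
    delete kernel;
    return nullptr;
  }
  return kernel;
}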
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
index 3ccc3dc2aef..5040ab150c3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -31,13 +31,17 @@ using mindspore::schema::PrimitiveType_DeConv2D;
 
 namespace mindspore::kernel {
 
-int Conv2dTransposeOpenCLKernel::Init() {
+int Conv2dTransposeOpenCLKernel::CheckSpecs() {
   ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
   if (param->pad_l_ != param->pad_r_ || param->kernel_h_ - param->stride_h_ != 2 * param->pad_l_ ||
       param->pad_u_ != param->pad_d_ || param->kernel_w_ - param->stride_w_ != 2 * param->pad_u_) {
     MS_LOG(ERROR) << "only support kernel - stride == 2 * pad";
     return RET_ERROR;
   }
+  return RET_OK;
+}
+
+int Conv2dTransposeOpenCLKernel::Prepare() {
   std::string kernel_name = "conv2d_transpose_NHWC4";
   enable_fp16_ = ocl_runtime_->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
@@ -49,12 +53,56 @@ int Conv2dTransposeOpenCLKernel::Init() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-  PadWeight();
+  InitWeights();
+  SetGlobalLocal();
+  SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return mindspore::lite::RET_OK;
 }
 
-void Conv2dTransposeOpenCLKernel::PadWeight() {
+void Conv2dTransposeOpenCLKernel::SetGlobalLocal() {
+  ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
+  int co = out_tensors_[0]->shape()[3];
+  int co4 = UP_DIV(co, C4NUM);
+  int stride_h = param->stride_h_;
+  int stride_w = param->stride_w_;
+  int oh = out_tensors_[0]->shape()[1];
+  int ow = out_tensors_[0]->shape()[2];
+  local_size_ = {16, 1, 16};
+  global_size_ = {(size_t)UP_ROUND(oh / 2, stride_h), (size_t)UP_ROUND(ow / 2, stride_w), (size_t)co4};
+  AlignGlobalLocal(global_size_, local_size_);
+}
+
+void Conv2dTransposeOpenCLKernel::SetConstArgs() {
+  int arg_cnt = 2;
+  ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
+  int ci = in_tensors_[0]->shape()[3];
+  int co = out_tensors_[0]->shape()[3];
+  int kh = param->kernel_h_;
+  int kw = param->kernel_w_;
+  int pad_h = param->pad_l_;
+  int pad_w = param->pad_u_;
+  int stride_h = param->stride_h_;
+  int stride_w = param->stride_w_;
+  int oh = out_tensors_[0]->shape()[1];
+  int ow = out_tensors_[0]->shape()[2];
+  int h = in_tensors_[0]->shape()[1];
+  int w = in_tensors_[0]->shape()[2];
+  cl_int2 kernel_size = {kh, kw};
+  cl_int2 stride = {stride_h, stride_w};
+  cl_int2 padding = {pad_h, pad_w};
+  cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1};
+  cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
+}
+
+int Conv2dTransposeOpenCLKernel::InitWeights() {
   ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
   int ci = in_tensors_[0]->shape()[3];
   int co = out_tensors_[0]->shape()[3];
@@ -138,67 +186,18 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
     }
   }
   allocator->UnmapBuffer(bias_);
+  return RET_OK;
 }
 
 int Conv2dTransposeOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  int ci = in_tensors_[0]->shape()[3];
-  int co = out_tensors_[0]->shape()[3];
-  int co4 = UP_DIV(co, C4NUM);
-  int kh = param->kernel_h_;
-  int kw = param->kernel_w_;
-  int pad_h = param->pad_l_;
-  int pad_w = param->pad_u_;
-  int stride_h = param->stride_h_;
-  int stride_w = param->stride_w_;
-  int oh = out_tensors_[0]->shape()[1];
-  int ow = out_tensors_[0]->shape()[2];
-  int h = in_tensors_[0]->shape()[1];
-  int w = in_tensors_[0]->shape()[2];
-  // local size should less than MAX_GROUP_SIZE
-  std::vector<size_t> local = {16, 1, 16};
-  std::vector<size_t> global = {(size_t)UP_ROUND(oh / 2, stride_h), (size_t)UP_ROUND(ow / 2, stride_w), (size_t)co4};
-
-  cl_int2 kernel_size = {kh, kw};
-  cl_int2 stride = {stride_h, stride_w};
-  cl_int2 padding = {pad_h, pad_w};
-  cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1};
-  cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1};
   int arg_cnt = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
   return mindspore::lite::RET_OK;
 }
 
-kernel::LiteKernel *OpenCLConv2dTransposeKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                       const std::vector<lite::Tensor *> &outputs,
-                                                       OpParameter *opParameter, const lite::InnerContext *ctx,
-                                                       const kernel::KernelKey &desc,
-                                                       const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel =
-    new (std::nothrow) Conv2dTransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != mindspore::lite::RET_OK) {
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DeConv2D, OpenCLConv2dTransposeKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, OpenCLConv2dTransposeKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DeConv2D, OpenCLKernelCreator<Conv2dTransposeOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, OpenCLKernelCreator<Conv2dTransposeOpenCLKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
index 69afe8f77f4..8cb78c21dba 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h
@@ -32,12 +32,14 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
   ~Conv2dTransposeOpenCLKernel() override = default;
 
-  int Init() override;
   int Run() override;
+  int Prepare() override;
+  int CheckSpecs() override;
+  int InitWeights() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
 
 private:
-  void PadWeight();
-
   cl::Kernel kernel_;
   void *padWeight_{nullptr};
   void *bias_{nullptr};
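SetGlobalLocal() above calls AlignGlobalLocal(global_size_, local_size_), whose definition is outside this diff. Its presumed behaviour is the standard ND-range fix-up: round each global dimension up to a multiple of the local dimension so the work-group size divides the range evenly (the kernels' own boundary checks discard the padding work-items). A sketch:

#include <cstddef>
#include <vector>

std::vector<size_t> AlignGlobal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
  std::vector<size_t> aligned(global.size());
  for (size_t i = 0; i < global.size(); ++i) {
    aligned[i] = (global[i] + local[i] - 1) / local[i] * local[i];  // UP_ROUND to a multiple of local[i]
  }
  return aligned;
}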
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
index a881860277d..031d9d2ffe5 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -34,7 +34,11 @@ using mindspore::schema::PrimitiveType_FullConnection;
 namespace mindspore::kernel {
 
 int FullConnectionOpenCLKernel::Init() {
-  std::string kernel_name = "FullConnection_NHWC4";
+  // deleted soon
+  return CheckSpecs();
+}
+
+int FullConnectionOpenCLKernel::CheckSpecs() {
   auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
   transposeA = param->a_transpose_;
   if (transposeA) {
@@ -48,9 +52,6 @@ int FullConnectionOpenCLKernel::Init() {
     MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4";
     return RET_ERROR;
   }
-  // call default move constructor(elemwised moved)
-  inShape = Image2DInfo(in_tensors_[0]);
-  outShape = Image2DInfo(out_tensors_[0]);
   switch (param->act_type_) {
     case ActType_No:
       break;
@@ -65,6 +66,13 @@ int FullConnectionOpenCLKernel::Init() {
       MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
       return RET_ERROR;
   }
+  return RET_OK;
+}
+
+int FullConnectionOpenCLKernel::Prepare() {
+  std::string kernel_name = "FullConnection_NHWC4";
+  inShape = Image2DInfo(in_tensors_[0]);
+  outShape = Image2DInfo(out_tensors_[0]);
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
@@ -74,13 +82,14 @@ int FullConnectionOpenCLKernel::Init() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-
-  PadWeight();
+  InitWeights();
+  SetConstArgs();
+  SetGlobalLocal();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
-void FullConnectionOpenCLKernel::PadWeight() {
+int FullConnectionOpenCLKernel::InitWeights() {
   auto allocator = ocl_runtime_->GetAllocator();
   int ci = inShape.C;
   int ci4 = UP_DIV(ci, C4NUM);
@@ -167,48 +176,37 @@ int FullConnectionOpenCLKernel::InitWeights() {
     }
   }
   allocator->UnmapBuffer(bias_);
+  return RET_OK;
 }
 
-int FullConnectionOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << this->name() << " Running!";
+void FullConnectionOpenCLKernel::SetGlobalLocal() {
   std::vector<size_t> local = {32, 4, 1};
   std::vector<size_t> global = {UP_DIV(outShape.C, C4NUM), 4, outShape.N};
-  int arg_count = 0;
+  AlignGlobalLocal(global, local);
+}
+
+void FullConnectionOpenCLKernel::SetConstArgs() {
+  int arg_count = 2;
   cl_int4 in_shape = {static_cast<int>(inShape.N), static_cast<int>(inShape.H), static_cast<int>(inShape.W),
                       static_cast<int>(inShape.C)};
   cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)};
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_min_);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_max_);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+}
+
+int FullConnectionOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running!";
+  int arg_count = 0;
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
   return RET_OK;
 }
 
-kernel::LiteKernel *OpenCLFullConnectionKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                      const std::vector<lite::Tensor *> &outputs,
-                                                      OpParameter *opParameter, const lite::InnerContext *ctx,
-                                                      const kernel::KernelKey &desc,
-                                                      const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel =
-    new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLKernelCreator<FullConnectionOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLKernelCreator<FullConnectionOpenCLKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
index e7cfcddde4a..1455ff70db5 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
@@ -31,12 +31,15 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
   ~FullConnectionOpenCLKernel() override = default;
 
-  int Init() override;
   int Run() override;
+  int Prepare() override;
+  int CheckSpecs() override;
+  int InitWeights() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
+  int Init() override;
 
 private:
-  void PadWeight();
-
   cl::Kernel kernel_;
   void *padWeight_{nullptr};
   void *bias_{nullptr};
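The renamed InitWeights() (formerly PadWeight) repacks the [co][ci] weight matrix into CI4 x CO4 tiles of 4x4 floats, zero-padding ragged edges, so the kernel reads one FLT16 per loop step. A simplified host-side sketch of that packing (the real code also handles fp16 and the transpose flags; PackWeight and its layout assumptions are illustrative):

#include <vector>

#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y)-1) / (y))

std::vector<float> PackWeight(const std::vector<float> &src, int ci, int co) {
  int ci4 = UP_DIV(ci, C4NUM), co4 = UP_DIV(co, C4NUM);
  std::vector<float> dst(ci4 * co4 * C4NUM * C4NUM, 0.0f);  // zero padding for the edges
  float *p = dst.data();
  for (int i = 0; i < ci4; ++i) {          // tile row: block of 4 input channels
    for (int j = 0; j < co4; ++j) {        // tile column: block of 4 output channels
      for (int c = 0; c < C4NUM; ++c) {    // within tile: output channel -> s0123/s4567/...
        for (int k = 0; k < C4NUM; ++k) {  // within tile: input channel
          int src_ci = i * C4NUM + k, src_co = j * C4NUM + c;
          *p++ = (src_ci < ci && src_co < co) ? src[src_co * ci + src_ci] : 0.0f;
        }
      }
    }
  }
  return dst;  // kernel indexes tiles as weight[i * co4 + gidx]
}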
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
index de2194d3115..d7d8ba071b2 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -30,8 +30,7 @@ using mindspore::schema::PrimitiveType_MatMul;
 
 namespace mindspore::kernel {
 
-int MatMulOpenCLKernel::Init() {
-  std::string kernel_name = "MatMul_NHWC4";
+int MatMulOpenCLKernel::CheckSpecs() {
   auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
   transposeA = param->a_transpose_;
   if (transposeA) {
@@ -45,6 +44,11 @@ int MatMulOpenCLKernel::Init() {
     MS_LOG(ERROR) << "matmul only support input shape size=2 or 4.";
     return mindspore::lite::RET_ERROR;
   }
+  return RET_OK;
+}
+
+int MatMulOpenCLKernel::Prepare() {
+  std::string kernel_name = "MatMul_NHWC4";
   dims = in_tensors_[0]->shape().size();
   for (int i = 0; i < dims; i++) {
     inShape[MAX_DIMS - dims + i] = in_tensors_[0]->shape()[i];
@@ -61,13 +65,14 @@ int MatMulOpenCLKernel::Init() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-
-  PadWeight();
+  InitWeights();
+  SetConstArgs();
+  SetGlobalLocal();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return mindspore::lite::RET_OK;
 }
 
-void MatMulOpenCLKernel::PadWeight() {
+int MatMulOpenCLKernel::InitWeights() {
   // ABMCI @ ABCICO = ABMCO
   auto allocator = ocl_runtime_->GetAllocator();
   int ci = inShape[3];
@@ -128,45 +133,36 @@ int MatMulOpenCLKernel::InitWeights() {
     }
   }
   allocator->UnmapBuffer(padWeight_);
+  return RET_OK;
 }
 
-int MatMulOpenCLKernel::Run() {
-  MS_LOG(DEBUG) << this->name() << " Running!";
+void MatMulOpenCLKernel::SetGlobalLocal() {
   // local size should less than MAX_GROUP_SIZE
   std::vector<size_t> local = {32, 4, 1};
   std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM),
                                 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]),
                                 static_cast<size_t>(outShape[2])};
-  int arg_count = 0;
+  AlignGlobalLocal(global, local);
+}
+
+void MatMulOpenCLKernel::SetConstArgs() {
+  int arg_count = 2;
   cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
   cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+}
+
+int MatMulOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running!";
+  int arg_count = 0;
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c());
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
   return mindspore::lite::RET_OK;
 }
 
-kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                              const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != mindspore::lite::RET_OK) {
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
index 8c41c2fd585..66855a4b3fa 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h
@@ -31,12 +31,14 @@ class MatMulOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
   ~MatMulOpenCLKernel() override = default;
 
-  int Init() override;
   int Run() override;
+  int Prepare() override;
+  int CheckSpecs() override;
+  int InitWeights() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
 
 private:
-  void PadWeight();
-
  cl::Kernel kernel_;
  void *padWeight_{nullptr};
  bool enable_fp16_{false};
return mindspore::lite::RET_ERROR; - } else { - kernel_name += "_IMG"; - } + kernel_name += "_IMG"; std::set build_options; + std::string source = pooling2d_source; + std::string program_name = "Pooling2d"; ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif - InitGlobalSize(); + SetConstArgs(); + SetGlobalLocal(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return mindspore::lite::RET_OK; } -void PoolingOpenCLKernel::InitGlobalSize() { +void PoolingOpenCLKernel::SetGlobalLocal() { const size_t global_x = out_tensors_[0]->shape()[1]; const size_t global_y = out_tensors_[0]->shape()[2]; const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); - global_size_ = {global_z, global_y, global_x}; - int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); - local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); - global_size_ = GetCommonGlobalSize(local_size_, global_size_); + global_range_ = {global_z, global_y, global_x}; + local_range_ = {}; } -int PoolingOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; - +void PoolingOpenCLKernel::SetConstArgs() { int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices}; cl_int4 output_shape = {out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], out_tensors_[0]->shape()[3], @@ -109,40 +99,24 @@ int PoolingOpenCLKernel::Run() { cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_}; cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_}; cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_}; - - int arg_idx = 0; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); + int arg_idx = 2; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding); +} - ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); +int PoolingOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + int arg_idx = 0; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); return mindspore::lite::RET_OK; } -kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = new (std::nothrow) PoolingOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!"; - free(opParameter); - return nullptr; - } - auto ret = kernel->Init(); - if (RET_OK != ret) { - MS_LOG(ERROR) << "Init OpenCL Pooling kernel failed!"; - delete kernel; - return nullptr; - } - return kernel; -} - -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator) +REG_KERNEL(kGPU, 
kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLKernelCreator) } // namespace kernel } // namespace mindspore diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h index 17b73312971..537f80bba03 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h @@ -31,14 +31,15 @@ class PoolingOpenCLKernel : public OpenCLKernel { : OpenCLKernel(parameter, inputs, outputs), parameter_(reinterpret_cast(parameter)) {} ~PoolingOpenCLKernel() override = default; - int Init() override; int Run() override; + int Prepare() override; + int CheckSpecs() override; + void SetConstArgs() override; + void SetGlobalLocal() override; private: - void InitGlobalSize(); PoolingParameter *parameter_; cl::Kernel kernel_; - bool enable_fp16_{false}; std::vector local_size_; std::vector global_size_; }; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc index 28e14005b76..d0def98fe2b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc @@ -40,14 +40,19 @@ using mindspore::schema::ReduceMode_ReduceSumSquare; namespace mindspore::kernel { -int ReduceOpenCLKernel::Init() { - InitNHWCShape(); - auto reduce_param = reinterpret_cast(op_parameter_); - if (reduce_param == nullptr) { - return RET_NULL_PTR; +std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) { + static const std::map reduce_type2str{{ReduceMode_ReduceMean, "mean"}, + {ReduceMode_ReduceSum, "sum"}}; + auto result_iter = reduce_type2str.find(type); + if (result_iter != reduce_type2str.end()) { + return result_iter->second; } - std::map reduce_type2str{{ReduceMode_ReduceMean, "mean"}, {ReduceMode_ReduceSum, "sum"}}; - if (reduce_type2str.find(reduce_param->mode_) == reduce_type2str.end()) { + return ""; +} + +int ReduceOpenCLKernel::CheckSpecs() { + auto reduce_param = reinterpret_cast(op_parameter_); + if (GetReduceTypeStr(reduce_param->mode_).empty()) { MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; return RET_PARAM_INVALID; } @@ -67,7 +72,17 @@ int ReduceOpenCLKernel::Init() { MS_LOG(ERROR) << "reduce axis (2,3) should keep dims"; return RET_PARAM_INVALID; } - std::string kernel_name = reduce_type2str.at(reduce_param->mode_); + return RET_OK; +} + +int ReduceOpenCLKernel::Prepare() { + outShape = Image2DInfo(out_tensors_[0]); + auto reduce_param = reinterpret_cast(op_parameter_); + if (reduce_param == nullptr) { + return RET_NULL_PTR; + } + + std::string kernel_name = GetReduceTypeStr(reduce_param->mode_); if (wc_reduce_) { kernel_name += "_WC"; } @@ -77,7 +92,6 @@ int ReduceOpenCLKernel::Init() { kernel_name += "_local"; } kernel_name += "_NHWC4"; - enable_fp16_ = ocl_runtime_->GetFp16Enable(); #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); @@ -88,32 +102,26 @@ int ReduceOpenCLKernel::Init() { ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif + SetConstArgs(); + SetGlobalLocal(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return mindspore::lite::RET_OK; } - -void ReduceOpenCLKernel::InitNHWCShape() { - std::vector shapex = out_tensors_[0]->shape(); - size_t n = 1, h = 1, w = 1, c = 1; - if (shapex.size() == 2) { - 
n = shapex[0]; - c = shapex[1]; - } else if (shapex.size() == 4) { - n = shapex[0]; - h = shapex[1]; - w = shapex[2]; - c = shapex[3]; - } - nhwc_shape_ = {n, h, w, c}; -} - -int ReduceOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; +void ReduceOpenCLKernel::SetConstArgs() { std::vector shapex = in_tensors_[0]->shape(); int h = shapex[1]; int w = shapex[2]; int c = shapex[3]; int c4 = UP_DIV(c, C4NUM); + cl_int4 size = {h, w, c4, c}; + int arg_idx = 2; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size); +} +void ReduceOpenCLKernel::SetGlobalLocal() { + std::vector shapex = in_tensors_[0]->shape(); + int h = shapex[1]; + int c = shapex[3]; + int c4 = UP_DIV(c, C4NUM); std::vector local = {}; if (use_local_) { local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; @@ -122,35 +130,20 @@ int ReduceOpenCLKernel::Run() { if (wc_reduce_) { global = {static_cast(h), 1, 1}; } - cl_int4 size = {h, w, c4, c}; + AlignGlobalLocal(global, local); +} + +int ReduceOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size); - ocl_runtime_->RunKernel(kernel_, global, local, nullptr); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); return mindspore::lite::RET_OK; } -kernel::LiteKernel *OpenCLReduceKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = new (std::nothrow) ReduceOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; - free(opParameter); - return nullptr; - } - auto ret = kernel->Init(); - if (ret != mindspore::lite::RET_OK) { - delete kernel; - return nullptr; - } - return kernel; -} - -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLReduceKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLReduceKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLReduceKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLReduceKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h index 56b73b14636..6553e24a003 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h @@ -18,7 +18,7 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ #include - +#include #include "src/lite_kernel.h" #include "src/runtime/kernel/opencl/opencl_kernel.h" #include "nnacl/reduce_parameter.h" @@ -31,14 +31,16 @@ class ReduceOpenCLKernel : public OpenCLKernel { : OpenCLKernel(parameter, inputs, outputs) {} ~ReduceOpenCLKernel() override = default; - int Init() override; int Run() override; - void InitNHWCShape(); + int Prepare() override; + int CheckSpecs() 
override; + void SetConstArgs() override; + void SetGlobalLocal() override; private: + static std::string GetReduceTypeStr(int type); cl::Kernel kernel_; - bool enable_fp16_{false}; - std::vector nhwc_shape_; + Image2DInfo outShape = Image2DInfo(nullptr); bool use_local_{false}; bool wc_reduce_{false}; static const size_t LOCAL_CACHE_THREAD{16}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h index 9c8d97823ca..44185184ea4 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h @@ -32,7 +32,6 @@ class ReshapeOpenCLKernel : public OpenCLKernel { int Run() override; int Prepare() override; - int CheckSpecs() override; void SetConstArgs() override; void SetGlobalLocal() override; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc index ef862b43202..b50738ce1d4 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc @@ -32,27 +32,32 @@ using mindspore::schema::PrimitiveType_Resize; namespace mindspore::kernel { -int ResizeOpenCLKernel::Init() { - auto resize_param = reinterpret_cast(op_parameter_); - if (resize_param == nullptr) { - return RET_NULL_PTR; - } - alignCorner = resize_param->align_corners_; - preserveAspectRatio = resize_param->preserve_aspect_ratio_; +int ResizeOpenCLKernel::CheckSpecs() { auto in_shape = in_tensors_[0]->shape(); auto out_shape = out_tensors_[0]->shape(); if (in_shape.size() != 4 || out_shape.size() != 4 || in_shape[0] != out_shape[0] || in_shape[3] != out_shape[3]) { MS_LOG(ERROR) << "resize op only support 4D and axes HW"; return RET_PARAM_INVALID; } + auto resize_param = reinterpret_cast(op_parameter_); + if (resize_param->method_ != schema::ResizeMethod_LINEAR && resize_param->method_ != schema::ResizeMethod_NEAREST) { + MS_LOG(ERROR) << "unsupported resize method:" << resize_param->method_; + return RET_PARAM_INVALID; + } + return RET_OK; +} + +int ResizeOpenCLKernel::Prepare() { + auto resize_param = reinterpret_cast(op_parameter_); + alignCorner = resize_param->align_corners_; + preserveAspectRatio = resize_param->preserve_aspect_ratio_; + auto in_shape = in_tensors_[0]->shape(); + auto out_shape = out_tensors_[0]->shape(); std::string kernel_name = "resize"; if (resize_param->method_ == schema::ResizeMethod_LINEAR) { kernel_name += "_bilinear"; } else if (resize_param->method_ == schema::ResizeMethod_NEAREST) { kernel_name += "_nearest_neighbor"; - } else { - MS_LOG(ERROR) << "unsupported resize method:" << resize_param->method_; - return RET_PARAM_INVALID; } kernel_name += "_NHWC4"; #ifdef PROGRAM_WITH_IL @@ -64,6 +69,8 @@ int ResizeOpenCLKernel::Init() { ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif + SetConstArgs(); + SetGlobalLocal(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } @@ -74,8 +81,7 @@ float ResizeOpenCLKernel::getResizeScaleFactor(int input_size, int output_size) : static_cast(input_size) / static_cast(output_size); } -int ResizeOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; +void ResizeOpenCLKernel::SetConstArgs() { auto in_shape = in_tensors_[0]->shape(); auto out_shape = out_tensors_[0]->shape(); int n = out_shape[0]; @@ -85,39 +91,30 @@ int ResizeOpenCLKernel::Run() { int c4 = UP_DIV(c, C4NUM); 
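// A convention that recurs through this refactor: every SetConstArgs() starts its
// index at 2 because Run() now binds the per-invocation tensor arguments itself,
// input image at arg 0 and output image at arg 1. Shape, stride and padding
// constants sit at indices 2 and up and are uploaded once from Prepare() instead
// of on every Run(). Division of labor, sketched for a hypothetical kernel that
// follows the pattern (in_shape_/out_shape_ are illustrative member names):
//
//   void SomeOpenCLKernel::SetConstArgs() {  // called once, from Prepare()
//     int arg_idx = 2;                       // args 0 and 1 are reserved for in/out tensors
//     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_shape_);
//     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_shape_);
//   }
//
//   int SomeOpenCLKernel::Run() {            // called per inference
//     int arg_idx = 0;
//     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
//     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
//     ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
//     return mindspore::lite::RET_OK;
//   }
//
// This is why MatMul's padWeight_ buffer moves from argument 1 to argument 2 in
// the hunk above, and why the OpenCL kernel signatures are reordered to put the
// input and output images first.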
float scale_h = getResizeScaleFactor(in_tensors_[0]->shape()[1], out_tensors_[0]->shape()[1]); float scale_w = getResizeScaleFactor(in_tensors_[0]->shape()[2], out_tensors_[0]->shape()[2]); - std::vector local = {}; - std::vector global = {static_cast(c4), static_cast(w), static_cast(h)}; cl_int4 in_size = {in_shape[0], in_shape[1], in_shape[2], UP_DIV(in_shape[3], C4NUM)}; cl_int4 out_size = {n, h, w, c4}; cl_float2 scale = {scale_h, scale_w}; - int arg_idx = 0; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); + int arg_idx = 2; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_size); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_size); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale); - ocl_runtime_->RunKernel(kernel_, global, local, nullptr); +} + +void ResizeOpenCLKernel::SetGlobalLocal() { + local_range_ = {}; + auto out_shape = Image2DInfo(out_tensors_[0]); + global_range_ = {out_shape.Slice, out_shape.W, out_shape.H}; +} + +int ResizeOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + int arg_idx = 0; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); return RET_OK; } -kernel::LiteKernel *OpenCLResizeKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = new (std::nothrow) ResizeOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; - free(opParameter); - return nullptr; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - delete kernel; - return nullptr; - } - return kernel; -} - -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLResizeKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLResizeKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h index 3c59fa4d65c..935ff3abc35 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h @@ -31,8 +31,11 @@ class ResizeOpenCLKernel : public OpenCLKernel { : OpenCLKernel(parameter, inputs, outputs) {} ~ResizeOpenCLKernel() override = default; - int Init() override; int Run() override; + int Prepare() override; + int CheckSpecs() override; + void SetConstArgs() override; + void SetGlobalLocal() override; private: float getResizeScaleFactor(int input_size, int output_size); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index 4147602d7a6..ada387fcc1e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -42,51 +42,17 @@ std::vector SoftmaxOpenCLKernel::GetMaskForLastChannel(int channels) { return mask; } -int SoftmaxOpenCLKernel::InitGlobalSize() { - size_t global_x, global_y; - const size_t 
global_z = 1; - if (axis_ == 1) { - global_x = UP_DIV(nhwc_shape_[3], C4NUM); - global_y = nhwc_shape_[2]; - } else if (axis_ == 2) { - global_x = UP_DIV(nhwc_shape_[3], C4NUM); - global_y = nhwc_shape_[1]; - } else if (axis_ == 3) { - global_x = nhwc_shape_[2]; - global_y = nhwc_shape_[1]; - } else { - global_x = 1; - global_y = 1; - } - global_size_ = {global_x, global_y, global_z}; - return lite::RET_OK; -} - -int SoftmaxOpenCLKernel::SetWorkGroupSize() { - // set work group size - InitGlobalSize(); - int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); - local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); - global_size_ = GetCommonGlobalSize(local_size_, global_size_); - return lite::RET_OK; -} - -int SoftmaxOpenCLKernel::SetWorkGroupSize1x1() { - local_size_ = {32, 1, 1}; - global_size_ = {32, 1, 1}; - return lite::RET_OK; -} - -int SoftmaxOpenCLKernel::Init() { - std::string kernel_name = "SoftMax"; - std::string program_name = "SoftMax"; - auto softmax_param = reinterpret_cast(op_parameter_); - axis_ = softmax_param->axis_; +int SoftmaxOpenCLKernel::CheckSpecs() { + axis_ = parameter_->axis_; auto in_shape = in_tensors_[0]->shape(); if (in_shape.size() > 4) { MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported shape size: " << in_shape.size(); return RET_ERROR; } + if (in_shape[0] > 1) { + MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported multi-batch."; + return RET_ERROR; + } if (axis_ < 0) { axis_ = in_shape.size() + axis_; } @@ -95,11 +61,15 @@ int SoftmaxOpenCLKernel::Init() { MS_LOG(ERROR) << "Init `Softmax` kernel failed: softmax axis should be H W or C"; return RET_ERROR; } - nhwc_shape_ = GetNHWCShape(in_shape); + return RET_OK; +} + +int SoftmaxOpenCLKernel::Prepare() { + std::string kernel_name = "SoftMax"; + + out_shape = Image2DInfo(out_tensors_[0]); std::string source = softmax_source; - enable_fp16_ = ocl_runtime_->GetFp16Enable(); - // framework not set this param yet! just use default. 
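// Image2DInfo replaces the per-kernel NHWC bookkeeping (InitNHWCShape, nhwc_shape_,
// GetNHWCShape) deleted in the reduce and softmax hunks. Judging by the fields this
// patch reads from it (N, H, W, C, Slice), it derives the image-2D layout directly
// from a tensor, so the old shape arithmetic collapses to something like:
//
//   Image2DInfo info(out_tensors_[0]);
//   size_t c4 = info.Slice;                        // plays the role of UP_DIV(C, C4NUM)
//   global_range_ = {info.Slice, info.W, info.H};  // resize-style dispatch over slices
//
// and each kernel drops its own UP_DIV(channel, C4NUM) computation.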
- if (nhwc_shape_[1] == 1 && nhwc_shape_[2] == 1 && axis_ == 3) { + if (out_shape.H == 1 && out_shape.W == 1 && axis_ == 3) { // support 4d tensor onexone_flag_ = true; kernel_name += "1x1"; @@ -112,62 +82,63 @@ int SoftmaxOpenCLKernel::Init() { kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); #else std::set build_options; + std::string program_name = "SoftMax"; ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif + SetConstArgs(); + SetGlobalLocal(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return lite::RET_OK; } -int SoftmaxOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; +void SoftmaxOpenCLKernel::SetGlobalLocal() { + if (onexone_flag_) { + local_size_ = {32}; + global_size_ = {32}; + } else { + size_t global_x, global_y; + if (axis_ == 1) { + global_x = out_shape.Slice; + global_y = out_shape.W; + } else if (axis_ == 2) { + global_x = out_shape.Slice; + global_y = out_shape.H; + } else if (axis_ == 3) { + global_x = out_shape.W; + global_y = out_shape.H; + } else { + global_x = 1; + global_y = 1; + } + global_size_ = {global_x, global_y}; + local_size_ = {}; + } + AlignGlobalLocal(global_size_, local_size_); +} - int arg_idx = 0; - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - int channel = nhwc_shape_[3]; - int c4 = UP_DIV(channel, C4NUM); +void SoftmaxOpenCLKernel::SetConstArgs() { + int arg_idx = 2; + int channel = out_shape.C; + int c4 = out_shape.Slice; auto mask_ = GetMaskForLastChannel(channel); cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]}; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask); - cl_int4 input_shape = {nhwc_shape_[0], nhwc_shape_[1], nhwc_shape_[2], c4}; + cl_int4 input_shape = {static_cast(out_shape.N), static_cast(out_shape.H), static_cast(out_shape.W), + c4}; ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape); - if (onexone_flag_) { - SetWorkGroupSize1x1(); - } else { - SetWorkGroupSize(); - } +} +int SoftmaxOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + int arg_idx = 0; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); // run opengl kernel - ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); return lite::RET_OK; } -kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = new (std::nothrow) SoftmaxOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; - free(opParameter); - delete kernel; - return nullptr; - } - if (inputs[0]->shape()[0] > 1) { - MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported multi-batch."; - delete kernel; - return nullptr; - } - auto ret = kernel->Init(); - if (ret != mindspore::lite::RET_OK) { - MS_LOG(ERROR) << "Init `Softmax` kernel failed!"; - delete kernel; - return nullptr; - } - return kernel; -} - -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SoftMax, OpenCLSoftMaxKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SoftMax, 
OpenCLSoftMaxKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SoftMax, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SoftMax, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h index 988c8355719..b67936f701f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h @@ -33,8 +33,11 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { } ~SoftmaxOpenCLKernel() override = default; - int Init() override; int Run() override; + int Prepare() override; + int CheckSpecs() override; + void SetConstArgs() override; + void SetGlobalLocal() override; private: int InitGlobalSize(); @@ -47,9 +50,8 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { bool onexone_flag_{false}; std::vector local_size_; std::vector global_size_; - bool enable_fp16_{false}; int axis_{0}; - std::vector nhwc_shape_; + Image2DInfo out_shape = Image2DInfo(nullptr); }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index e744da8e136..ce34c0b42a8 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -33,9 +33,7 @@ using mindspore::schema::PrimitiveType_Transpose; namespace mindspore::kernel { -int TransposeOpenCLKernel::Init() { - std::string kernel_name = "transpose"; - enable_fp16_ = ocl_runtime_->GetFp16Enable(); +int TransposeOpenCLKernel::CheckSpecs() { auto param = reinterpret_cast(op_parameter_); if (in_tensors_[0]->shape().size() != 4 || in_tensors_[0]->shape()[0] > 1) { MS_LOG(ERROR) << "Transpose only support 4d tensor and n = 1 yet."; @@ -43,16 +41,24 @@ int TransposeOpenCLKernel::Init() { } if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && param->perm_[3] == 2) { - kernel_name += "_0312"; type = TransposeType::AXIS0312; } else if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && param->perm_[3] == 1) { - kernel_name += "_0231"; type = TransposeType::AXIS0231; } else { MS_LOG(ERROR) << "unsupported transpose axes."; return mindspore::lite::RET_ERROR; } + return RET_OK; +} + +int TransposeOpenCLKernel::Prepare() { + std::string kernel_name = "transpose"; + if (type == TransposeType::AXIS0312) { + kernel_name += "_0312"; + } else if (type == TransposeType::AXIS0231) { + kernel_name += "_0231"; + } if (in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) { // just for input kernel_name += "_oversize"; @@ -68,58 +74,49 @@ int TransposeOpenCLKernel::Init() { ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif - + SetConstArgs(); + SetGlobalLocal(); MS_LOG(DEBUG) << kernel_name << " Init Done!"; return mindspore::lite::RET_OK; } -int TransposeOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; +void TransposeOpenCLKernel::SetConstArgs() { std::vector shapex = out_tensors_[0]->shape(); size_t n = shapex[0]; // n=1 size_t h = shapex[1]; size_t w = shapex[2]; size_t c = shapex[3]; - size_t c4 = UP_DIV(c, 4); - std::vector local = {}; - std::vector global; - if (type == TransposeType::AXIS0312) { // NHWC -> NCHW - global = {UP_DIV(h, 
C4NUM), w, c4}; - } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC - global = {h, UP_DIV(w, C4NUM), c4}; - } + int arg_idx = 2; cl_int4 shape = {static_cast(n), static_cast(h), static_cast(w), static_cast(c)}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape); +} + +void TransposeOpenCLKernel::SetGlobalLocal() { + std::vector shapex = out_tensors_[0]->shape(); + size_t h = shapex[1]; + size_t w = shapex[2]; + size_t c = shapex[3]; + size_t c4 = UP_DIV(c, 4); + if (type == TransposeType::AXIS0312) { // NHWC -> NCHW + global_range_ = {UP_DIV(h, C4NUM), w, c4}; + } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC + global_range_ = {h, UP_DIV(w, C4NUM), c4}; + } +} + +int TransposeOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape); - ocl_runtime_->RunKernel(kernel_, global, local, nullptr); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); return mindspore::lite::RET_OK; } -kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector &inputs, - const std::vector &outputs, OpParameter *opParameter, - const lite::InnerContext *ctx, const kernel::KernelKey &desc, - const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = - new (std::nothrow) TransposeOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; - free(opParameter); - return nullptr; - } - auto ret = kernel->Init(); - if (ret != mindspore::lite::RET_OK) { - delete kernel; - return nullptr; - } - return kernel; -} - -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Transpose, OpenCLTransposeKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Transpose, OpenCLTransposeKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nhwc2Nchw, OpenCLTransposeKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nhwc2Nchw, OpenCLTransposeKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nchw2Nhwc, OpenCLTransposeKernelCreator) -REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nchw2Nhwc, OpenCLTransposeKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Transpose, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Transpose, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nhwc2Nchw, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nhwc2Nchw, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nchw2Nhwc, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nchw2Nhwc, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h index e4dcc9c7b69..1d893f4733b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h @@ -34,12 +34,14 @@ class TransposeOpenCLKernel : public OpenCLKernel { : OpenCLKernel(parameter, inputs, outputs) {} ~TransposeOpenCLKernel() override = default; - int Init() override; int Run() override; + int Prepare() override; + int CheckSpecs() override; + void SetConstArgs() override; + void SetGlobalLocal() override; private: cl::Kernel 
kernel_; - bool enable_fp16_{false}; TransposeType type{TransposeType::AXIS0312}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index b4257798944..43932895cca 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -213,15 +213,15 @@ int SubGraphOpenCLKernel::Init() { } nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end()); - UpdateTensorDataType(); - - MallocTensorWithReuse(); - ret = SubGraphKernel::Prepare(); if (ret != RET_OK) { MS_LOG(ERROR) << "OpenCL prepare fail"; return ret; } + + UpdateTensorDataType(); + + MallocTensorWithReuse(); return RET_OK; } diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc index 41265efc59e..2f175d091e0 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/activation_tests.cc @@ -14,772 +14,109 @@ * limitations under the License. */ #include - +#include #include "src/common/log_adapter.h" #include "common/common_test.h" #include "mindspore/lite/src/common/file_utils.h" #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" -#include "mindspore/lite/src/runtime/opencl/opencl_allocator.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" -#include "mindspore/lite/nnacl/fp32/activation.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h" - -using mindspore::kernel::LiteKernel; -using mindspore::kernel::SubGraphOpenCLKernel; -using mindspore::lite::RET_ERROR; -using mindspore::lite::RET_OK; -using mindspore::lite::Tensor; -using mindspore::schema::ActivationType_HSWISH; -using mindspore::schema::ActivationType_LEAKY_RELU; -using mindspore::schema::ActivationType_RELU; -using mindspore::schema::ActivationType_RELU6; -using mindspore::schema::ActivationType_SIGMOID; -using mindspore::schema::ActivationType_SWISH; -using mindspore::schema::ActivationType_TANH; -using mindspore::schema::PrimitiveType_Activation; +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" namespace mindspore { -class TestActivationOpenCL : public mindspore::CommonTest {}; +class TestActivationOpenCL : public mindspore::CommonTest { + public: + TestActivationOpenCL() {} +}; -class TestActivationOpenCLTanh : public mindspore::CommonTest {}; - -void LoadActivationData(void *dst, size_t dst_size, const std::string &file_path) { - if (file_path.empty()) { - memset(dst, 0x00, dst_size); - } else { - auto src_data = reinterpret_cast(mindspore::lite::ReadFile(file_path.c_str(), &dst_size)); - memcpy(dst, src_data, dst_size); - } -} - -template -void CompareRes(lite::Tensor *output_tensor, const std::string &standard_answer_file) { - auto *output_data = reinterpret_cast(output_tensor->data_c()); - size_t output_size = output_tensor->Size(); - auto expect_data = reinterpret_cast(mindspore::lite::ReadFile(standard_answer_file.c_str(), &output_size)); - constexpr float atol = 0.001; - for (int i = 0; i < output_tensor->ElementsNum(); ++i) { - if (std::fabs(output_data[i] - expect_data[i]) > atol) { - printf("error at idx[%d] expect=%f output=%f\n", i, expect_data[i], output_data[i]); - printf("error at idx[%d] expect=%f output=%f\n", i, expect_data[i], output_data[i]); - 
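// The activation tests deleted below each repeated roughly 150 lines of manual
// tensor allocation, kernel->Init(), subgraph wiring, binary fixtures under
// /data/local/tmp, and a printf-based CompareRes. The rewrite folds all of that
// into one data-driven RunTestCaseActivation helper, so adding a case becomes a
// few lines with inline expected values (shape and data here are illustrative):
//
//   std::vector<float> in = {-1.0f, 2.0f};
//   std::vector<float> expect = {0.0f, 2.0f};  // ReLU applied to `in`
//   RunTestCaseActivation(in.data(), {1, 1, 1, 2}, expect.data(), {1, 1, 1, 2},
//                         false /* enable_fp16 */, schema::ActivationType_RELU);
//
// Fixture files on the device are no longer needed, and the same case runs in
// fp32 and fp16 with tolerances chosen per precision.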
printf("error at idx[%d] expect=%f output=%f\n\n\n", i, expect_data[i], output_data[i]); - return; - } - } - printf("compare success!\n"); - printf("compare success!\n"); - printf("compare success!\n\n\n"); -} - -template -void printf_tensor(const std::string &str, mindspore::lite::Tensor *in_data) { - MS_LOG(INFO) << str; - auto input_data = reinterpret_cast(in_data->data_c()); - for (int i = 0; i < in_data->ElementsNum(); ++i) { - printf("%f ", input_data[i]); - } - printf("\n"); - MS_LOG(INFO) << "Print tensor done"; -} - -TEST_F(TestActivationOpenCL, ReluFp_dim4) { - std::string in_file = "/data/local/tmp/in_data.bin"; - std::string out_file = "/data/local/tmp/relu.bin"; - MS_LOG(INFO) << "Relu Begin test!"; +void RunTestCaseActivation(void *input_data0, const std::vector &input_shape, void *output_data, + const std::vector &out_shape, bool enable_fp16, int act_type) { auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); ocl_runtime->Init(); + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); - auto data_type = kNumberTypeFloat16; - ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); - bool enable_fp16 = ocl_runtime->GetFp16Enable(); - MS_LOG(INFO) << "Init tensors."; - std::vector input_shape = {1, 9}; - schema::Format format = schema::Format_NC; - auto tensor_type = lite::Tensor::CONST_TENSOR; - auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (input_tensor == nullptr) { - MS_LOG(ERROR) << "new input tensor error!"; - return; - } - auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (output_tensor == nullptr) { - MS_LOG(ERROR) << "new output tensor error!"; - delete input_tensor; - return; - } - std::vector inputs{input_tensor}; - std::vector outputs{output_tensor}; - inputs[0]->MallocData(allocator); - LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); - if (enable_fp16) { - printf_tensor("ReluFp16:--input data---", inputs[0]); - } else { - printf_tensor("ReluFp32:--input data---", inputs[0]); - } - - auto *param = new (std::nothrow) ActivationParameter(); + auto param = static_cast(malloc(sizeof(ActivationParameter))); if (param == nullptr) { - MS_LOG(ERROR) << "New ActivationParameter fail."; - delete input_tensor; - delete output_tensor; + MS_LOG(ERROR) << "param_ptr create error."; return; } - param->type_ = ActivationType_RELU; - auto *kernel = - new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Kernel:Relu create fail."; - delete param; - delete input_tensor; - delete output_tensor; + param->op_parameter_.type_ = schema::PrimitiveType_Activation; + param->type_ = act_type; + auto tensor_x_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape); + auto tensor_x = tensor_x_ptr.get(); + if (tensor_x == nullptr) { + MS_LOG(ERROR) << "tensor_x create error."; return; } - auto ret = kernel->Init(); - if (ret != RET_OK) { - delete param; - delete kernel; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Init relu fail."; + auto tensor_out_ptr = + std::make_unique(TypeId(enable_fp16 ? 
kNumberTypeFloat16 : kNumberTypeFloat32), out_shape); + auto tensor_out = tensor_out_ptr.get(); + if (tensor_out == nullptr) { + MS_LOG(ERROR) << "tensor_out create error."; return; } - MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; - std::vector kernels{kernel}; - auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (sub_graph == nullptr) { - delete kernel; - delete param; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; + std::vector inputs{tensor_x}; + std::vector outputs{tensor_out}; + auto op_kernel = kernel::OpenCLKernelCreator( + inputs, outputs, reinterpret_cast(param), nullptr, kernel::KernelKey(), nullptr); + if (op_kernel == nullptr) { + MS_LOG(ERROR) << "op_kernel create error."; return; } + inputs[0]->MallocData(allocator); + + std::vector kernels{op_kernel}; + + std::vector inputs_g{tensor_x}; + auto pGraph_ptr = std::make_unique(inputs_g, outputs, kernels, kernels, kernels); + auto pGraph = pGraph_ptr.get(); + if (pGraph == nullptr) { + MS_LOG(ERROR) << "pGraph create error."; + return; + } + pGraph->Init(); + memcpy(inputs[0]->MutableData(), input_data0, tensor_x->ElementsNum() * dtype_size); + pGraph->Run(); + if (enable_fp16) { + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-3), + 2e-2); + } else { + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-5)); + } - MS_LOG(INFO) << "Initialize sub_graph."; - ret = sub_graph->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init sub_graph error."; - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - return; + for (auto t : inputs) { + t->set_data(nullptr); } - MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; - ret = sub_graph->Run(); - if (ret != RET_OK) { - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; - return; + for (auto t : outputs) { + t->set_data(nullptr); } - if (enable_fp16) { - printf_tensor("ReluFp16--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } else { - printf_tensor("ReluFp32--output data--", outputs[0]); - CompareRes(output_tensor, out_file); - } - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; + MS_LOG(INFO) << "TestActivation passed"; } -TEST_F(TestActivationOpenCL, Relu6Fp_dim4) { - std::string in_file = "/data/local/tmp/in_data.bin"; - std::string out_file = "/data/local/tmp/relu6.bin"; - MS_LOG(INFO) << "Relu6 Begin test!"; - auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); - auto data_type = kNumberTypeFloat16; - ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); - bool enable_fp16 = ocl_runtime->GetFp16Enable(); - ocl_runtime->Init(); - - MS_LOG(INFO) << "Init tensors."; - std::vector input_shape = {1, 9}; - schema::Format format = schema::Format_NC; - auto tensor_type = lite::Tensor::CONST_TENSOR; - auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (input_tensor == nullptr) { - MS_LOG(ERROR) << "new input tensor error!"; - return; - } - auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (output_tensor == nullptr) { - MS_LOG(ERROR) << "new output tensor error!"; - delete input_tensor; - return; - } - std::vector inputs{input_tensor}; - std::vector 
outputs{output_tensor}; - auto allocator = ocl_runtime->GetAllocator(); - inputs[0]->MallocData(allocator); - MS_LOG(INFO) << "Initialize input data"; - LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); - if (enable_fp16) { - printf_tensor("Relu6:FP16--input data--", inputs[0]); - } else { - printf_tensor("Relu6:FP32--input data--", inputs[0]); - } - - auto *param = new (std::nothrow) ActivationParameter(); - if (param == nullptr) { - MS_LOG(ERROR) << "New ActivationParameter fail."; - delete input_tensor; - delete output_tensor; - return; - } - param->type_ = ActivationType_RELU6; - auto *kernel = - new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Kernel:Relu6 create fail."; - delete param; - delete input_tensor; - delete output_tensor; - return; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - delete param; - delete kernel; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Init relu6 fail."; - return; - } - MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; - std::vector kernels{kernel}; - auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (sub_graph == nullptr) { - delete kernel; - delete param; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; - return; - } - - MS_LOG(INFO) << "Initialize sub_graph."; - ret = sub_graph->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init sub_graph error."; - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - return; - } - MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; - ret = sub_graph->Run(); - if (ret != RET_OK) { - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; - return; - } - - if (enable_fp16) { - printf_tensor("Relu6:FP16--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } else { - printf_tensor("Relu6:FP32--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; +TEST_F(TestActivationOpenCL, ActivationReLUFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {-1.0f, 1.0f, 2.0f, 3.0f, -1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 9.0f}; + std::vector output_data = {0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 3.0f, 0.0f, 5.0f, 0.0f, 7.0f, 9.0f}; + RunTestCaseActivation(input_data.data(), in_shape0, output_data.data(), out_shape, false, + schema::ActivationType_RELU); } -TEST_F(TestActivationOpenCL, SigmoidFp_dim4) { - std::string in_file = "/data/local/tmp/in_data.bin"; - std::string out_file = "/data/local/tmp/sigmoid.bin"; - MS_LOG(INFO) << "Sigmoid Begin test!"; - auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); - ocl_runtime->Init(); - auto data_type = kNumberTypeFloat32; - ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); - bool enable_fp16 = ocl_runtime->GetFp16Enable(); - - MS_LOG(INFO) << "Init tensors."; - std::vector input_shape = {1, 9}; - schema::Format format = schema::Format_NC; - auto tensor_type = lite::Tensor::CONST_TENSOR; - auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (input_tensor == nullptr) { - MS_LOG(ERROR) << "new input tensor 
error!"; - return; - } - auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (output_tensor == nullptr) { - MS_LOG(ERROR) << "new output tensor error!"; - delete input_tensor; - return; - } - std::vector inputs{input_tensor}; - std::vector outputs{output_tensor}; - auto allocator = ocl_runtime->GetAllocator(); - inputs[0]->MallocData(allocator); - MS_LOG(INFO) << "Initialize input data"; - LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); - if (enable_fp16) { - printf_tensor("Sigmoid:FP16--input data--", inputs[0]); - } else { - printf_tensor("Sigmoid:FP32--input data--", inputs[0]); - } - - auto *param = new (std::nothrow) ActivationParameter(); - if (param == nullptr) { - MS_LOG(ERROR) << "New ActivationParameter fail."; - delete input_tensor; - delete output_tensor; - return; - } - param->type_ = ActivationType_SIGMOID; - auto *kernel = - new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Kernel:Sigmoid create fail."; - delete param; - delete input_tensor; - delete output_tensor; - return; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - delete param; - delete kernel; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Init sigmoid fail."; - return; - } - MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; - std::vector kernels{kernel}; - auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (sub_graph == nullptr) { - delete kernel; - delete param; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; - return; - } - - MS_LOG(INFO) << "Initialize sub_graph."; - ret = sub_graph->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init sub_graph error."; - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - return; - } - MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; - ret = sub_graph->Run(); - if (ret != RET_OK) { - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; - return; - } - - if (enable_fp16) { - printf_tensor("Sigmoid:FP16--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } else { - printf_tensor("Sigmoid:FP32--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; -} - -TEST_F(TestActivationOpenCL, LeakyReluFp_dim4) { - std::string in_file = "/data/local/tmp/in_data.bin"; - std::string out_file = "/data/local/tmp/leaky_relu.bin"; - MS_LOG(INFO) << "Leaky relu Begin test!"; - auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); - ocl_runtime->Init(); - auto data_type = kNumberTypeFloat16; - ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); - bool enable_fp16 = ocl_runtime->GetFp16Enable(); - - MS_LOG(INFO) << "Init tensors."; - std::vector input_shape = {1, 9}; - auto tensor_type = lite::Tensor::CONST_TENSOR; - schema::Format format = schema::Format_NC; - auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (input_tensor == nullptr) { - MS_LOG(ERROR) << "new input tensor error!"; - return; - } - auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (output_tensor == nullptr) { - MS_LOG(ERROR) << "new output tensor 
error!"; - delete input_tensor; - return; - } - std::vector inputs{input_tensor}; - std::vector outputs{output_tensor}; - auto allocator = ocl_runtime->GetAllocator(); - inputs[0]->MallocData(allocator); - MS_LOG(INFO) << "Initialize input data"; - LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); - if (enable_fp16) { - printf_tensor("Leaky Relu:FP16--input data--", inputs[0]); - } else { - printf_tensor("Leaky Relu:FP32--input data--", inputs[0]); - } - - auto *param = new (std::nothrow) ActivationParameter(); - if (param == nullptr) { - MS_LOG(ERROR) << "New ActivationParameter fail."; - delete input_tensor; - delete output_tensor; - return; - } - param->alpha_ = 0.3f; - param->type_ = ActivationType_LEAKY_RELU; - auto *kernel = - new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Kernel:leaky relu create fail."; - delete param; - delete input_tensor; - delete output_tensor; - return; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - delete param; - delete kernel; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Init leaky relu fail."; - return; - } - MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; - std::vector kernels{kernel}; - auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (sub_graph == nullptr) { - delete kernel; - delete param; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; - return; - } - - MS_LOG(INFO) << "Initialize sub_graph."; - ret = sub_graph->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init sub_graph error."; - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - return; - } - MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; - ret = sub_graph->Run(); - if (ret != RET_OK) { - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; - return; - } - if (enable_fp16) { - printf_tensor("Leaky Relu:FP16--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } else { - printf_tensor("Leaky Relu:FP32--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } - delete param; - delete input_tensor; - delete output_tensor; -} - -TEST_F(TestActivationOpenCLTanh, TanhFp_dim4) { - std::string in_file = "/data/local/tmp/test_data/in_tanhfp16.bin"; - std::string out_file = "/data/local/tmp/test_data/out_tanhfp16.bin"; - MS_LOG(INFO) << "Tanh Begin test!"; - auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); - ocl_runtime->Init(); - auto data_type = kNumberTypeFloat16; - ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); - bool enable_fp16 = ocl_runtime->GetFp16Enable(); - - MS_LOG(INFO) << "Init tensors."; - std::vector input_shape = {1, 2, 3, 9}; - schema::Format format = schema::Format_NHWC; - auto tensor_type = lite::Tensor::CONST_TENSOR; - auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (input_tensor == nullptr) { - MS_LOG(ERROR) << "new input tensor error!"; - return; - } - auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); - if (output_tensor == nullptr) { - MS_LOG(ERROR) << "new output tensor error!"; - delete input_tensor; - return; - } - std::vector inputs{input_tensor}; - std::vector outputs{output_tensor}; - auto allocator = 
ocl_runtime->GetAllocator(); - inputs[0]->MallocData(allocator); - MS_LOG(INFO) << "Initialize input data"; - LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); - if (enable_fp16) { - printf_tensor("Tanh:FP16--input data--", inputs[0]); - } else { - printf_tensor("Tanh:FP32--input data--", inputs[0]); - } - - auto param = reinterpret_cast(malloc(sizeof(ActivationParameter))); - if (param == nullptr) { - MS_LOG(ERROR) << "New ActivationParameter fail."; - delete input_tensor; - delete output_tensor; - return; - } - param->type_ = ActivationType_TANH; - auto *kernel = - new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Kernel:Tanh create fail."; - delete param; - delete input_tensor; - delete output_tensor; - return; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - delete param; - delete kernel; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Init tanh fail."; - return; - } - MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; - std::vector kernels{kernel}; - auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (sub_graph == nullptr) { - delete kernel; - delete param; - delete input_tensor; - delete output_tensor; - MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; - return; - } - - MS_LOG(INFO) << "Initialize sub_graph."; - ret = sub_graph->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init sub_graph error."; - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - return; - } - MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; - ret = sub_graph->Run(); - if (ret != RET_OK) { - delete param; - delete input_tensor; - delete output_tensor; - delete sub_graph; - MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; - return; - } - - if (enable_fp16) { - printf_tensor("Tanh:FP16--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } else { - printf_tensor("Tanh:FP32--output data---", outputs[0]); - CompareRes(output_tensor, out_file); - } - input_tensor->set_data(nullptr); - delete input_tensor; - output_tensor->set_data(nullptr); - delete output_tensor; - delete sub_graph; -} - -TEST_F(TestActivationOpenCL, SwishFp16_dim4) { - size_t input_size; - std::string in_file = "/data/local/tmp/test_data/in_swishfp16.bin"; - std::string out_file = "/data/local/tmp/test_data/out_swishfp16.bin"; - auto input_data = reinterpret_cast(mindspore::lite::ReadFile(in_file.c_str(), &input_size)); - MS_LOG(INFO) << "Swish Begin test!"; - auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper(); - auto runtime = ocl_runtime.GetInstance(); - runtime->Init(); - auto data_type = kNumberTypeFloat16; - runtime->SetFp16Enable(data_type == kNumberTypeFloat16); - bool enable_fp16 = runtime->GetFp16Enable(); - - MS_LOG(INFO) << "Init tensors."; - std::vector input_shape = {1, 2, 3, 9}; - schema::Format format = schema::Format_NHWC; - auto tensor_type = lite::Tensor::CONST_TENSOR; - auto input_tensor = Tensor(data_type, input_shape, format, tensor_type); - auto output_tensor = Tensor(data_type, input_shape, format, tensor_type); - - std::vector inputs{&input_tensor}; - std::vector outputs{&output_tensor}; - auto allocator = runtime->GetAllocator(); - inputs[0]->MallocData(allocator); - MS_LOG(INFO) << "Initialize input data"; - memcpy(inputs[0]->data_c(), input_data, input_size); - if (enable_fp16) { - printf_tensor("Swish:FP16--input data--", inputs[0]); - } else { - 
printf_tensor("Swish:FP32--input data--", inputs[0]); - } - - auto param = reinterpret_cast(malloc(sizeof(ActivationParameter))); - if (param == nullptr) { - MS_LOG(ERROR) << "New ActivationParameter fail."; - return; - } - param->type_ = ActivationType_SWISH; - auto *kernel = - new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Kernel:Swish create fail."; - delete param; - return; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - delete param; - delete kernel; - MS_LOG(ERROR) << "Init Swish fail."; - return; - } - MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; - std::vector kernels{kernel}; - auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (sub_graph == nullptr) { - delete kernel; - delete param; - MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; - return; - } - - MS_LOG(INFO) << "Initialize sub_graph."; - ret = sub_graph->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init sub_graph error."; - delete sub_graph; - return; - } - - MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; - ret = sub_graph->Run(); - if (ret != RET_OK) { - delete param; - delete sub_graph; - MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; - return; - } - CompareRes(&output_tensor, out_file); - delete sub_graph; -} - -TEST_F(TestActivationOpenCL, HSwishFp16_dim4) { - MS_LOG(INFO) << " begin test "; - auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); - auto runtime = runtime_wrapper.GetInstance(); - runtime->Init(); - auto allocator = runtime->GetAllocator(); - - std::vector input_shape = {1, 1, 2, 4}; - std::vector output_shape = {1, 1, 2, 4}; - auto data_type = kNumberTypeFloat32; - - auto tensor_type = lite::Tensor::CONST_TENSOR; - schema::Format format = schema::Format_NHWC; - float input_data[] = {-3.0, -2.0, -1.0, 0.0, 1.0, 5.0, 6.0, 7.0}; - float correctOutput[] = {-0, -0.33333334, -0.33333334, 0, 0.6666667, 5, 6, 7}; - - MS_LOG(INFO) << "Init tensors."; - auto output_tensor = Tensor(data_type, input_shape, format, tensor_type); - auto in_tensor = Tensor(data_type, output_shape, format, tensor_type); - std::vector inputs{&in_tensor}; - std::vector outputs{&output_tensor}; - runtime->SetFp16Enable(data_type == kNumberTypeFloat16); - - MS_LOG(INFO) << "Initialize input data"; - auto param = reinterpret_cast(malloc(sizeof(ActivationParameter))); - if (param == nullptr) { - MS_LOG(ERROR) << "New ActivationParameter fail."; - return; - } - param->type_ = ActivationType_HSWISH; - auto *kernel = - new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast(param), inputs, outputs); - if (kernel == nullptr) { - MS_LOG(ERROR) << "Kernel:HSwish create fail."; - delete param; - return; - } - auto ret = kernel->Init(); - if (ret != RET_OK) { - delete param; - delete kernel; - MS_LOG(ERROR) << "Init HSwish fail."; - return; - } - inputs[0]->MallocData(allocator); - memcpy(inputs[0]->data_c(), input_data, sizeof(input_data)); - MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; - std::vector kernels{kernel}; - auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (sub_graph == nullptr) { - delete kernel; - delete param; - MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; - return; - } - - MS_LOG(INFO) << "Initialize sub_graph."; - ret = sub_graph->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Init sub_graph error."; - delete sub_graph; - 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
index 327e92c267b..2f751d42cfb 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -75,14 +75,12 @@ void RunTestCaseArithmetic(void *input_data0, const std::vector<int> &input_shap
   }
   std::vector<lite::Tensor *> inputs{tensor_x, tensor_w};
   std::vector<lite::Tensor *> outputs{tensor_out};
-  auto op_kernel_ptr =
-    std::make_unique<kernel::ArithmeticOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto op_kernel = op_kernel_ptr.release();
+  auto op_kernel = kernel::OpenCLKernelCreator<kernel::ArithmeticOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
   if (op_kernel == nullptr) {
     MS_LOG(ERROR) << "op_kernel create error.";
     return;
   }
-  op_kernel->Init();
   inputs[0]->MallocData(allocator);
 
   std::vector<kernel::LiteKernel *> kernels{op_kernel};
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
index 7283c60468e..0f82879c472 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
@@ -99,15 +99,13 @@ void RunTestCaseConv2dTranspose(const std::vector<int> &shape, void *input_data,
   opParameter->pad_l_ = pad;
   opParameter->input_channel_ = ci;
   opParameter->output_channel_ = co;
-  auto op_kernel_ptr = std::make_unique<kernel::Conv2dTransposeOpenCLKernel>(
-    reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  auto op_kernel = op_kernel_ptr.release();
+  auto op_kernel = kernel::OpenCLKernelCreator<kernel::Conv2dTransposeOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(opParameter), nullptr, kernel::KernelKey(), nullptr);
   if (op_kernel == nullptr) {
     MS_LOG(ERROR) << "op_kernel create error.";
     return;
   }
   op_kernel->set_name("DeConv");
-  op_kernel->Init();
   inputs[0]->MallocData(allocator);
 
   std::vector<kernel::LiteKernel *> kernels{op_kernel};
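The same mechanical change repeats through the remaining test files: the manual make_unique-plus-release construction and the explicit Init() call are replaced by one call to the templated kernel::OpenCLKernelCreator. Judging from the removed Init() calls and the retained nullptr checks, the creator presumably runs Init() itself and returns nullptr on any failure. The resulting pattern, with the template argument written out for readability:

// Creator-based construction as used in arithmetic_tests.cc; ctx and
// primitive are not needed by the unit tests, hence the nullptrs.
auto op_kernel = kernel::OpenCLKernelCreator<kernel::ArithmeticOpenCLKernel>(
  inputs, outputs, reinterpret_cast<OpParameter *>(param),
  /*ctx=*/nullptr, kernel::KernelKey(), /*primitive=*/nullptr);
if (op_kernel == nullptr) {  // presumably covers both construction failure and a failed Init()
  MS_LOG(ERROR) << "op_kernel create error.";
  return;
}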
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
index 1538f6ecd4d..d6b3b39a560 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/fullconnection_tests.cc
@@ -63,6 +63,7 @@ void RunTestCaseFullConnection(const std::vector<int> &shape, void *input_data,
   param->a_transpose_ = false;
   param->b_transpose_ = true;
   param->has_bias_ = true;
+  param->act_type_ = ActType_No;
   auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
                                                      input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC);
   auto tensor_x = tensor_x_ptr.get();
@@ -98,14 +99,12 @@ void RunTestCaseFullConnection(const std::vector<int> &shape, void *input_data,
   }
   std::vector<lite::Tensor *> inputs{tensor_x, tensor_w, tensor_bias};
   std::vector<lite::Tensor *> outputs{tensor_out};
-  auto op_kernel_ptr =
-    std::make_unique<kernel::FullConnectionOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto op_kernel = op_kernel_ptr.release();
+  auto op_kernel = kernel::OpenCLKernelCreator<kernel::FullConnectionOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
   if (op_kernel == nullptr) {
     MS_LOG(ERROR) << "op_kernel create error.";
     return;
   }
-  op_kernel->Init();
   inputs[0]->MallocData(allocator);
 
   std::vector<kernel::LiteKernel *> kernels{op_kernel};
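Aside from the creator change, the one behavioral fix in fullconnection_tests.cc is the added act_type_ line. Because param is allocated with malloc rather than value-initialized, every field the kernel reads must be assigned; the plausible intent, spelled out:

// param comes from malloc, so its fields start as garbage; without this line
// the kernel could read a random act_type_ and fuse an unintended activation.
param->a_transpose_ = false;
param->b_transpose_ = true;
param->has_bias_ = true;
param->act_type_ = ActType_No;  // the line this hunk adds: explicitly "no fused activation"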
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
index 284b89cd59a..61c11ae2a25 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/matmul_tests.cc
@@ -87,14 +87,12 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we
   }
   std::vector<lite::Tensor *> inputs{tensor_x, tensor_w};
   std::vector<lite::Tensor *> outputs{tensor_out};
-  auto op_kernel_ptr =
-    std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto op_kernel = op_kernel_ptr.release();
+  auto op_kernel = kernel::OpenCLKernelCreator<kernel::MatMulOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
   if (op_kernel == nullptr) {
     MS_LOG(ERROR) << "op_kernel create error.";
     return;
   }
-  op_kernel->Init();
   inputs[0]->MallocData(allocator);
 
   std::vector<kernel::LiteKernel *> kernels{op_kernel};
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc
deleted file mode 100644
index d9011dca6d2..00000000000
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <iostream>
-#include <memory>
-#include "src/common/log_adapter.h"
-#include "common/common_test.h"
-#include "mindspore/lite/src/common/file_utils.h"
-#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
-#include "mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h"
-#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h"
-
-namespace mindspore {
-
-class TestMaxPoolingOpenCL : public mindspore::CommonTest {};
-
-void InitMaxPoolingParam(PoolingParameter *param) {
-  param->input_batch_ = 1;
-  param->input_h_ = 2;
-  param->input_w_ = 2;
-  param->input_channel_ = 4;
-
-  param->output_batch_ = 1;
-  param->output_h_ = 1;
-  param->output_w_ = 1;
-  param->output_channel_ = 4;
-
-  param->window_h_ = 2;
-  param->window_w_ = 2;
-
-  param->stride_h_ = 2;
-  param->stride_w_ = 2;
-
-  param->pad_u_ = 0;
-  param->pad_d_ = 0;
-  param->pad_l_ = 0;
-  param->pad_r_ = 0;
-
-  param->pool_mode_ = PoolMode_MaxPool;
-}
-
-void RunTestCaseMaxPooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
-  ocl_runtime->Init();
-  size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
-  ocl_runtime->SetFp16Enable(enable_fp16);
-  auto allocator = ocl_runtime->GetAllocator();
-  int n = shape[0];
-  int h = shape[1];
-  int w = shape[2];
-  int c = shape[3];
-  int oh = shape[4];
-  int ow = shape[5];
-  auto param = static_cast<PoolingParameter *>(malloc(sizeof(PoolingParameter)));
-  if (param == nullptr) {
-    MS_LOG(ERROR) << "param create error.";
-    return;
-  }
-  InitMaxPoolingParam(param);
-  std::vector<int> input_shape = {n, h, w, c};
-  auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                     input_shape, schema::Format_NHWC);
-  auto tensor_x = tensor_x_ptr.get();
-  if (tensor_x == nullptr) {
-    MS_LOG(ERROR) << "tensor_x create error.";
-    return;
-  }
-  std::vector<int> out_shape = {n, oh, ow, c};
-  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       out_shape, schema::Format_NHWC);
-  auto tensor_out = tensor_out_ptr.get();
-  if (tensor_out == nullptr) {
-    MS_LOG(ERROR) << "tensor_out create error.";
-    return;
-  }
-  std::vector<lite::Tensor *> inputs{tensor_x};
-  std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel_ptr =
-    std::make_unique<kernel::PoolingOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto arith_kernel = arith_kernel_ptr.release();
-  if (arith_kernel == nullptr) {
-    MS_LOG(ERROR) << "arith_kernel create error.";
-    return;
-  }
-  arith_kernel->Init();
-
-  inputs[0]->MallocData(allocator);
-
-  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
-  auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels);
-  auto pGraph = pGraph_ptr.get();
-  if (pGraph == nullptr) {
-    MS_LOG(ERROR) << "pGraph create error.";
-    return;
-  }
-  pGraph->Init();
-  memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size);
-  pGraph->Run();
-
-  if (enable_fp16) {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
-                  2e-2);
-  } else {
-    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
-  }
-  for (auto t : inputs) {
-    t->set_data(nullptr);
-  }
-  for (auto t : outputs) {
-    t->set_data(nullptr);
-  }
-
-  MS_LOG(INFO) << "Test MaxPool2d passed";
-}
-
-TEST_F(TestMaxPoolingOpenCL, MaxPoolingFp32) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 4;
-  int oh = 1;
-  int ow = 1;
-  std::vector<int> shape = {n, h, w, c, oh, ow};
-  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
-                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
-  std::vector<float> output_data = {12.0f, 13.0f, 14.0f, 15.0f};
-
-  RunTestCaseMaxPooling(shape, input_data.data(), output_data.data(), false);
-}
-
-TEST_F(TestMaxPoolingOpenCL, MaxPoolingFp16) {
-  int n = 1;
-  int h = 2;
-  int w = 2;
-  int c = 4;
-  int oh = 1;
-  int ow = 1;
-  std::vector<int> shape = {n, h, w, c, oh, ow};
-  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
-                                       8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
-  std::vector<float16_t> output_data = {12.0f, 13.0f, 14.0f, 15.0f};
-
-  RunTestCaseMaxPooling(shape, input_data.data(), output_data.data(), true);
-}
-}  // namespace mindspore
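With max_pooling_tests.cc deleted, the rename that follows folds both pool modes into one parameterized runner: InitPoolingParam no longer hard-codes pool_mode_, and the mode becomes an argument of RunTestCasePooling. The call sites then reduce to the pattern below, mirroring the tests added at the end of the renamed file:

// One runner, two modes; enable_fp16 and the PoolMode are the only knobs.
RunTestCasePooling(shape, input_data.data(), output_data.data(), /*enable_fp16=*/false, PoolMode_AvgPool);
RunTestCasePooling(shape, input_data.data(), output_data.data(), /*enable_fp16=*/false, PoolMode_MaxPool);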
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/pooling_tests.cc
similarity index 72%
rename from mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc
rename to mindspore/lite/test/ut/src/runtime/kernel/opencl/pooling_tests.cc
index ed796e3116e..593a970559c 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/avg_pooling_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/pooling_tests.cc
@@ -25,9 +25,9 @@ namespace mindspore {
 
-class TestAvgPoolingOpenCL : public mindspore::CommonTest {};
+class TestPoolingOpenCL : public mindspore::CommonTest {};
 
-void InitAvgPoolingParam(PoolingParameter *param) {
+void InitPoolingParam(PoolingParameter *param) {
   param->input_batch_ = 1;
   param->input_h_ = 2;
   param->input_w_ = 2;
@@ -48,11 +48,10 @@ void InitAvgPoolingParam(PoolingParameter *param) {
   param->pad_d_ = 0;
   param->pad_l_ = 0;
   param->pad_r_ = 0;
-
-  param->pool_mode_ = PoolMode_AvgPool;
 }
 
-void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) {
+void RunTestCasePooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16,
+                        PoolMode pool_mode) {
   auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
   ocl_runtime->Init();
   size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float);
@@ -69,7 +68,8 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void
     MS_LOG(ERROR) << "param create error.";
     return;
   }
-  InitAvgPoolingParam(param);
+  InitPoolingParam(param);
+  param->pool_mode_ = pool_mode;
   std::vector<int> input_shape = {n, h, w, c};
   auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
                                                      input_shape, schema::Format_NHWC);
@@ -88,14 +88,12 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void
   }
   std::vector<lite::Tensor *> inputs{tensor_x};
   std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel_ptr =
-    std::make_unique<kernel::PoolingOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto arith_kernel = arith_kernel_ptr.release();
+  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::PoolingOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
   if (arith_kernel == nullptr) {
     MS_LOG(ERROR) << "arith_kernel create error.";
     return;
   }
-  arith_kernel->Init();
 
   inputs[0]->MallocData(allocator);
 
@@ -127,7 +125,7 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void
   MS_LOG(INFO) << "Test AvgPool2d passed";
 }
 
-TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp32) {
+TEST_F(TestPoolingOpenCL, AvgPoolingFp32) {
   int n = 1;
   int h = 2;
   int w = 2;
@@ -139,10 +137,10 @@ TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp32) {
                                    8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
   std::vector<float> output_data = {6.0f, 7.0f, 8.0f, 9.0f};
 
-  RunTestCaseAvgPooling(shape, input_data.data(), output_data.data(), false);
+  RunTestCasePooling(shape, input_data.data(), output_data.data(), false, PoolMode_AvgPool);
 }
 
-TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp16) {
+TEST_F(TestPoolingOpenCL, AvgPoolingFp16) {
   int n = 1;
   int h = 2;
   int w = 2;
@@ -154,6 +152,36 @@ TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp16) {
                                        8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
   std::vector<float16_t> output_data = {6.0f, 7.0f, 8.0f, 9.0f};
 
-  RunTestCaseAvgPooling(shape, input_data.data(), output_data.data(), true);
+  RunTestCasePooling(shape, input_data.data(), output_data.data(), true, PoolMode_AvgPool);
+}
+
+TEST_F(TestPoolingOpenCL, MaxPoolingFp32) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 4;
+  int oh = 1;
+  int ow = 1;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
+                                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+  std::vector<float> output_data = {12.0f, 13.0f, 14.0f, 15.0f};
+
+  RunTestCasePooling(shape, input_data.data(), output_data.data(), false, PoolMode_MaxPool);
+}
+
+TEST_F(TestPoolingOpenCL, MaxPoolingFp16) {
+  int n = 1;
+  int h = 2;
+  int w = 2;
+  int c = 4;
+  int oh = 1;
+  int ow = 1;
+  std::vector<int> shape = {n, h, w, c, oh, ow};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
+                                       8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+  std::vector<float16_t> output_data = {12.0f, 13.0f, 14.0f, 15.0f};
+
+  RunTestCasePooling(shape, input_data.data(), output_data.data(), true, PoolMode_MaxPool);
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc
index 5a3c0e216e3..80b9a06a42b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc
@@ -75,14 +75,12 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *ou
   }
   std::vector<lite::Tensor *> inputs{tensor_x};
   std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel_ptr =
-    std::make_unique<kernel::ReduceOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto arith_kernel = arith_kernel_ptr.release();
+  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ReduceOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
   if (arith_kernel == nullptr) {
     MS_LOG(ERROR) << "arith_kernel create error.";
     return;
   }
-  arith_kernel->Init();
 
   inputs[0]->MallocData(allocator);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
index 7f7340ad085..dfe91c1ef34 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reshape_tests.cc
@@ -54,13 +54,12 @@ void RunTestCaseReshape(const std::vector<int> &shape_in, const std::vector<int>
   }
   std::vector<lite::Tensor *> inputs{tensor_x};
   std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel_ptr = std::make_unique<kernel::ReshapeOpenCLKernel>(nullptr, inputs, outputs);
-  auto arith_kernel = arith_kernel_ptr.release();
+  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ReshapeOpenCLKernel>(inputs, outputs, nullptr, nullptr,
+                                                                               kernel::KernelKey(), nullptr);
   if (arith_kernel == nullptr) {
     MS_LOG(ERROR) << "arith_kernel create error.";
     return;
   }
-  arith_kernel->Init();
 
   inputs[0]->MallocData(allocator);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/resize_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/resize_tests.cc
index be76efb40d8..137cdd5c619 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/resize_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/resize_tests.cc
@@ -69,14 +69,12 @@ void RunTestCaseResize(const std::vector<int> &shape, void *input_data, void *ou
   }
   std::vector<lite::Tensor *> inputs{tensor_x};
   std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel_ptr =
-    std::make_unique<kernel::ResizeOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto arith_kernel = arith_kernel_ptr.release();
+  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ResizeOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
   if (arith_kernel == nullptr) {
     MS_LOG(ERROR) << "arith_kernel create error.";
     return;
   }
-  arith_kernel->Init();
 
   inputs[0]->MallocData(allocator);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc
index eb39c950b49..893da875063 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/softmax_tests.cc
@@ -75,14 +75,12 @@ void RunTestCaseSoftmax(const std::vector<int> &shape, void *input_data, void *o
     return;
   }
   opParameter->axis_ = axis;
-  auto arith_kernel_ptr =
-    std::make_unique<kernel::SoftmaxOpenCLKernel>(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  auto arith_kernel = arith_kernel_ptr.release();
+  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::SoftmaxOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(opParameter), nullptr, kernel::KernelKey(), nullptr);
   if (arith_kernel == nullptr) {
     MS_LOG(ERROR) << "arith_kernel create error.";
     return;
   }
-  arith_kernel->Init();
 
   inputs[0]->MallocData(allocator);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
index c5aec7176ec..90719a1e205 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
@@ -67,14 +67,12 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
   }
   std::vector<lite::Tensor *> inputs{tensor_x};
   std::vector<lite::Tensor *> outputs{tensor_out};
-  auto arith_kernel_ptr =
-    std::make_unique<kernel::TransposeOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs);
-  auto arith_kernel = arith_kernel_ptr.release();
+  auto arith_kernel = kernel::OpenCLKernelCreator<kernel::TransposeOpenCLKernel>(
+    inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr);
   if (arith_kernel == nullptr) {
     MS_LOG(ERROR) << "arith_kernel create error.";
     return;
   }
-  arith_kernel->Init();
 
   inputs[0]->MallocData(allocator);
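After this patch, the per-file runners differ only in the kernel class they instantiate, so a further consolidation suggests itself. Purely illustrative, not part of the patch:

// Illustrative consolidation (not in the patch): one templated wrapper for
// every creator-based test kernel construction.
template <typename KernelT>
kernel::LiteKernel *CreateOpenCLTestKernel(const std::vector<lite::Tensor *> &inputs,
                                           const std::vector<lite::Tensor *> &outputs, OpParameter *param) {
  return kernel::OpenCLKernelCreator<KernelT>(inputs, outputs, param, nullptr, kernel::KernelKey(), nullptr);
}

// Usage, e.g. in transpose_tests.cc:
//   auto arith_kernel = CreateOpenCLTestKernel<kernel::TransposeOpenCLKernel>(
//     inputs, outputs, reinterpret_cast<OpParameter *>(param));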