From d831f8df5e5e83d15a393ad731c41cc5d85dd3ee Mon Sep 17 00:00:00 2001 From: pengyongrong Date: Mon, 3 Aug 2020 10:55:53 +0800 Subject: [PATCH] add judgement for CL --- build.sh | 8 +- .../kernel/opencl/cl/fp16/depthwise_conv2d.cl | 44 +-- .../runtime/kernel/opencl/cl/fp16/matmul.cl | 7 +- .../kernel/opencl/cl/fp32/avg_pool2d.cl | 3 +- .../runtime/kernel/opencl/cl/fp32/concat.cl | 2 +- .../kernel/opencl/cl/fp32/convolution.cl | 295 ++++++++---------- .../kernel/opencl/cl/fp32/depthwise_conv2d.cl | 130 +++----- .../runtime/kernel/opencl/cl/fp32/matmul.cl | 7 +- .../runtime/kernel/opencl/cl/fp32/softmax.cl | 17 +- 9 files changed, 194 insertions(+), 319 deletions(-) diff --git a/build.sh b/build.sh index e8312e2f3ae..bd11f4fa043 100755 --- a/build.sh +++ b/build.sh @@ -510,12 +510,8 @@ gene_ocl_program() { build_opencl() { cd ${BASEPATH} - if [[ ! -d "third_party/OpenCL-Headers" ]]; then - git submodule update --init third_party/OpenCL-Headers - fi - if [[ ! -d "third_party/OpenCL-CLHPP" ]]; then - git submodule update --init third_party/OpenCL-CLHPP - fi + git submodule update --init third_party/OpenCL-Headers + git submodule update --init third_party/OpenCL-CLHPP if [[ "${OPENCL_OFFLINE_COMPILE}" == "on" ]]; then gene_ocl_program else diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl index 7e327ba0a3a..2725ca9261f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl @@ -13,19 +13,9 @@ __constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void DepthwiseConv2d_NC4HW4( -__global FLT4* src_data, - __global FLT4* filters, -__global FLT4* biases, - float relu_clip1, -__global FLT4* dst_data, - int2 kernel_size, - int2 stride, - int2 padding, - int2 dilation, - int4 src_size, - int4 dst_size -) { +__kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases, + float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -42,31 +32,21 @@ __global FLT4* dst_data, bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { FLT4 f = filters[fx_c]; - FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))]; + FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; r += TO_ACCUM_TYPE(src_final * f); - }; + } fx_c++; } } FLT4 bias_val = biases[Z]; FLT4 res0 = TO_FLT4(r) + bias_val; res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0; + dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0; } -__kernel void DepthwiseConv2d_NHWC4( -__global FLT4* src_data, - __global FLT4* filters, -__global FLT4* biases, - float relu_clip1, -__global FLT4* dst_data, - int2 kernel_size, - int2 stride, - int2 padding, - int2 dilation, - int4 src_size, - int4 dst_size -) { +__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases, + float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -83,9 +63,9 @@ __global FLT4* dst_data, bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { FLT4 f = filters[fx_c]; - FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; + FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; r += TO_ACCUM_TYPE(src_final * f); - }; + } fx_c++; } } @@ -93,4 +73,4 @@ __global FLT4* dst_data, FLT4 res0 = TO_FLT4(r) + bias_val; res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; -} \ No newline at end of file +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl index 10c66fc3626..6b89bb3b6bf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl @@ -1,9 +1,8 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define FLT4 half4 #define FLT16 half16 -__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, - __global FLT4 *buffer, __global FLT4 *bias, int2 offset_ci, - int2 offset_co, int has_bias) { +__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, __global FLT4 *buffer, __global FLT4 *bias, + int2 offset_ci, int2 offset_co, int has_bias) { int2 gid = (int2)(get_global_id(0), get_global_id(1)); int2 lid = (int2)(get_local_id(0), get_local_id(1)); FLT4 s = (FLT4)(0.0f); @@ -29,4 +28,4 @@ __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, buffer[gid.x] = s; // memory pollution? or protected by opencl } -} \ No newline at end of file +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl index 1d2ca1216c7..0e60a4ca1ee 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl @@ -31,7 +31,6 @@ __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, cons __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - __kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { @@ -63,4 +62,4 @@ __kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only imag } float4 result = convert_float4(r / window_size); write_imagef(output, (int2)(X, Y * output_shape.w + Z), result); -} \ No newline at end of file +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl index 2149842a3bd..d457a3e4fa0 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl @@ -51,4 +51,4 @@ __kernel void Concat3input(__global float *input0, __global float *input1, __glo output[index_output] = input2[input_idx]; } } -} \ No newline at end of file +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl index 80074382678..43a5a0306b7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl @@ -1,191 +1,150 @@ #define CI_TILE 4 #define CO_TILE 4 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -//#define __global - -//#pragma OPENCL EXTENSION cl_arm_printf : enable -__kernel void convolution_NHWC_OHWI(__global float *input, - __global float *weight, - __global float *bias, +// #define __global +// #pragma OPENCL EXTENSION cl_arm_printf : enable +__kernel void convolution_NHWC_OHWI(__global float *input, __global float *weight, __global float *bias, __global float *output, - const int4 input_shape, // NHWC - const int4 output_shape, // NHWC - const int4 kernel_stride, // kernelHW_strideHW - const int4 pad) // top bottom left right -{ - int ow = get_global_id(0); - int oh = get_global_id(1); - int co_slice = get_global_id(2); + const int4 input_shape, // NHWC + const int4 output_shape, // NHWC + const int4 kernel_stride, // kernelHW_strideHW + const int4 pad) { + int ow = get_global_id(0); + int oh = get_global_id(1); + int co_slice = get_global_id(2); - int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; - int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; - int KH = kernel_stride.x, KW = kernel_stride.y; - int strideH = kernel_stride.z, strideW = kernel_stride.w; - int padTop = pad.x, padLeft = pad.z; - int CI_SLICES = UP_DIV(CI, CI_TILE); - int CO_SLICES = UP_DIV(CO, CO_TILE); + int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; + int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; + int KH = kernel_stride.x, KW = kernel_stride.y; + int strideH = kernel_stride.z, strideW = kernel_stride.w; + int padTop = pad.x, padLeft = pad.z; + int CI_SLICES = UP_DIV(CI, CI_TILE); + int CO_SLICES = UP_DIV(CO, CO_TILE); - if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) - return; + if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) return; - float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); - for (int kh = 0; kh < KH; ++kh) - { - int ih = kh + oh * strideH - padTop; - for (int kw = 0; kw < KW; ++kw) - { - int iw = kw + ow * strideW - padLeft; - for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) - { - for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) - { - int ci = ci_slice * CI_TILE + ci_inner; - if (ci >= CI) - break; + float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); + for (int kh = 0; kh < KH; ++kh) { + int ih = kh + oh * strideH - padTop; + for (int kw = 0; kw < KW; ++kw) { + int iw = kw + ow * strideW - padLeft; + for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) { + for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) { + int ci = ci_slice * CI_TILE + ci_inner; + if (ci >= CI) break; - int input_idx = ih * IW * CI + iw * CI + ci; - float value = 0; - if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) - value = 0; - else - value = input[input_idx]; + int input_idx = ih * IW * CI + iw * CI + ci; + float value = 0; + if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) + value = 0; + else + value = input[input_idx]; - int CO_OFFSET = KH * KW * CI; - int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + - kh * KW * CI + - kw * CI + - ci; - acc.x += weight[weight_idx + 0 * CO_OFFSET] * value; - acc.y += weight[weight_idx + 1 * CO_OFFSET] * value; - acc.z += weight[weight_idx + 2 * CO_OFFSET] * value; - acc.w += weight[weight_idx + 3 * CO_OFFSET] * value; - } - } + int CO_OFFSET = KH * KW * CI; + int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + kh * KW * CI + kw * CI + ci; + acc.x += weight[weight_idx + 0 * CO_OFFSET] * value; + acc.y += weight[weight_idx + 1 * CO_OFFSET] * value; + acc.z += weight[weight_idx + 2 * CO_OFFSET] * value; + acc.w += weight[weight_idx + 3 * CO_OFFSET] * value; } + } } - int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE); - if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) - { - output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; - output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; - output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3]; - } - else if (CO % CO_TILE == 1) - { - output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; - } - else if (CO % CO_TILE == 2) - { - output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; - } - else if (CO % CO_TILE == 3) - { - output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; - output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; - output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; - } + } + int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE); + if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) { + output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; + output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; + output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; + output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3]; + } else if (CO % CO_TILE == 1) { + output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; + } else if (CO % CO_TILE == 2) { + output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; + output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; + } else if (CO % CO_TILE == 3) { + output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; + output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; + output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; + } } - - -//#pragma OPENCL EXTENSION cl_khr_fp16 : enable -//#define FLT4 half4 +// #pragma OPENCL EXTENSION cl_khr_fp16 : enable +// #define FLT4 half4 #define FLT4 float4 -__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, - __global FLT4 *weight, - __global FLT4 *bias, +__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, __global FLT4 *weight, __global FLT4 *bias, __global FLT4 *output, - const int4 input_shape, // NHWC - const int4 output_shape, // NHWC - const int4 kernel_stride, // kernelHW_strideHW - const int4 pad) // top bottom left right -{ - int oh = get_global_id(0); // [0, OH) - int ow = get_global_id(1); // [0, OW) - int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) ) + const int4 input_shape, // NHWC + const int4 output_shape, // NHWC + const int4 kernel_stride, // kernelHW_strideHW + const int4 pad) { + int oh = get_global_id(0); // [0, OH) + int ow = get_global_id(1); // [0, OW) + int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) ) - int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; - int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; - int CI_SLICES = UP_DIV(CI, CI_TILE); - int CO_SLICES = UP_DIV(CO, CO_TILE); - int KH = kernel_stride.x, KW = kernel_stride.y; - int strideH = kernel_stride.z, strideW = kernel_stride.w; - int padTop = pad.x, padLeft = pad.z; + int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; + int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; + int CI_SLICES = UP_DIV(CI, CI_TILE); + int CO_SLICES = UP_DIV(CO, CO_TILE); + int KH = kernel_stride.x, KW = kernel_stride.y; + int strideH = kernel_stride.z, strideW = kernel_stride.w; + int padTop = pad.x, padLeft = pad.z; - if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES) - return; - if (2 * co_slice + 1 >= CO_SLICES) - { - FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; - for (int kh = 0; kh < KH; ++kh) - { - int ih = kh + oh * strideH - padTop; - for (int kw = 0; kw < KW; ++kw) - { - int iw = kw + ow * strideW - padLeft; - if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) - { - for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) - { - FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice]; - out0_c4 += w0_ic1_oc4[0] * in_c4.x; - out0_c4 += w0_ic1_oc4[1] * in_c4.y; - out0_c4 += w0_ic1_oc4[2] * in_c4.z; - out0_c4 += w0_ic1_oc4[3] * in_c4.w; - w0_ic1_oc4 += 4; - } - } - else - { - w0_ic1_oc4 += 4 * CI_SLICES; - } - } + if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES) return; + if (2 * co_slice + 1 >= CO_SLICES) { + FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; + for (int kh = 0; kh < KH; ++kh) { + int ih = kh + oh * strideH - padTop; + for (int kw = 0; kw < KW; ++kw) { + int iw = kw + ow * strideW - padLeft; + if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice]; + out0_c4 += w0_ic1_oc4[0] * in_c4.x; + out0_c4 += w0_ic1_oc4[1] * in_c4.y; + out0_c4 += w0_ic1_oc4[2] * in_c4.z; + out0_c4 += w0_ic1_oc4[3] * in_c4.w; + w0_ic1_oc4 += 4; + } + } else { + w0_ic1_oc4 += 4 * CI_SLICES; } - output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; + } } - else - { - FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); - __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; - __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE; - for (int kh = 0; kh < KH; ++kh) - { - int ih = kh + oh * strideH - padTop; - for (int kw = 0; kw < KW; ++kw) - { - int iw = kw + ow * strideW - padLeft; - if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) - { - int idx = ih * IW * CI_SLICES + iw * CI_SLICES; - for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) - { - FLT4 in_c4 = input[idx + ci_slice]; + output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; + } else { + FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; + __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE; + for (int kh = 0; kh < KH; ++kh) { + int ih = kh + oh * strideH - padTop; + for (int kw = 0; kw < KW; ++kw) { + int iw = kw + ow * strideW - padLeft; + if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { + int idx = ih * IW * CI_SLICES + iw * CI_SLICES; + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_c4 = input[idx + ci_slice]; - out0_c4 += w0_ic1_oc4[0] * in_c4.x; - out0_c4 += w0_ic1_oc4[1] * in_c4.y; - out0_c4 += w0_ic1_oc4[2] * in_c4.z; - out0_c4 += w0_ic1_oc4[3] * in_c4.w; - w0_ic1_oc4 += 4; + out0_c4 += w0_ic1_oc4[0] * in_c4.x; + out0_c4 += w0_ic1_oc4[1] * in_c4.y; + out0_c4 += w0_ic1_oc4[2] * in_c4.z; + out0_c4 += w0_ic1_oc4[3] * in_c4.w; + w0_ic1_oc4 += 4; - out1_c4 += w1_ic1_oc4[0] * in_c4.x; - out1_c4 += w1_ic1_oc4[1] * in_c4.y; - out1_c4 += w1_ic1_oc4[2] * in_c4.z; - out1_c4 += w1_ic1_oc4[3] * in_c4.w; - w1_ic1_oc4 += 4; - } - } - else - { - w0_ic1_oc4 += 4 * CI_SLICES; - w1_ic1_oc4 += 4 * CI_SLICES; - } - } + out1_c4 += w1_ic1_oc4[0] * in_c4.x; + out1_c4 += w1_ic1_oc4[1] * in_c4.y; + out1_c4 += w1_ic1_oc4[2] * in_c4.z; + out1_c4 += w1_ic1_oc4[3] * in_c4.w; + w1_ic1_oc4 += 4; + } + } else { + w0_ic1_oc4 += 4 * CI_SLICES; + w1_ic1_oc4 += 4 * CI_SLICES; } - output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; - output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1]; + } } -} \ No newline at end of file + output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; + output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1]; + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl index aef7bf8310b..f7944e9a2ba 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl @@ -8,18 +8,9 @@ #define TO_FLT4 convert_float4 #endif __constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void DepthwiseConv2d_IMG_NC4HW4( -__read_only image2d_t src_data, -__global FLT4* filter, -__global FLT4* bias, - float relu_clip1, -__write_only image2d_t dst_data, - int2 kernel_size, - int2 stride, - int2 padding, - int2 dilation, - int4 src_size, - int4 dst_size) { +__kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, + float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size, + int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -36,32 +27,23 @@ __write_only image2d_t dst_data, bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { FLT4 f = filter[fx_c]; - //FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))]; - FLT4 src_final =read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c))); + // FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))]; + FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c))); r += TO_FLT4(src_final * f); - }; + } fx_c++; } } FLT4 bias_val = bias[Z]; FLT4 res0 = TO_FLT4(r) + bias_val; res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - //dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0; + // dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0; write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0); } -__kernel void DepthwiseConv2d_IMG_NHWC4( -__read_only image2d_t src_data, -__global FLT4* filter, -__global FLT4* bias, - float relu_clip1, -__write_only image2d_t dst_data, - int2 kernel_size, - int2 stride, - int2 padding, - int2 dilation, - int4 src_size, - int4 dst_size) { +__kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, + float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size, + int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -78,32 +60,23 @@ __write_only image2d_t dst_data, bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { FLT4 f = filter[fx_c]; - //FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; - FLT4 src_final =read_imagef(src_data, sampler_zero, (int2)(Z+x_c*src_size.z, y_c)); + // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; + FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z + x_c * src_size.z, y_c)); r += TO_FLT4(src_final * f); - }; + } fx_c++; } } FLT4 bias_val = bias[Z]; FLT4 res0 = TO_FLT4(r) + bias_val; res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - //dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; - write_imagef(dst_data, (int2)(X*dst_size.z+Z, Y), res0); + // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; + write_imagef(dst_data, (int2)(X * dst_size.z + Z, Y), res0); } -__kernel void DepthwiseConv2d_IMG_NHWC4_1x1( -__read_only image2d_t src_data, -__global FLT4* filter, -__global FLT4* bias, - float relu_clip1, -__write_only image2d_t dst_data, - int2 kernel_size, - int2 stride, - int2 padding, - int2 dilation, - int4 src_size, - int4 dst_size) { +__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, + float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size, + int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -120,30 +93,21 @@ __write_only image2d_t dst_data, bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { FLT4 f = filter[fx_c]; - //FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; + // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z)); r += TO_FLT4(src_final * f); - }; + } } } FLT4 bias_val = bias[Z]; FLT4 res0 = TO_FLT4(r) + bias_val; res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - //dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; + // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0); } -__kernel void DepthwiseConv2d_BUF_NC4HW4( -__global FLT4* src_data, -__global FLT4* filter, -__global FLT4* bias, - float relu_clip1, -__global FLT4* dst_data, - int2 kernel_size, - int2 stride, - int2 padding, - int2 dilation, - int4 src_size, - int4 dst_size) { +__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, + float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -160,30 +124,21 @@ __global FLT4* dst_data, bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { FLT4 f = filter[fx_c]; - FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))]; + FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; r += TO_FLT4(src_final * f); - }; + } fx_c++; } } FLT4 bias_val = bias[Z]; FLT4 res0 = TO_FLT4(r) + bias_val; res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); - dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0; + dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0; } -__kernel void DepthwiseConv2d_BUF_NHWC4( -__global FLT4* src_data, -__global FLT4* filter, -__global FLT4* bias, - float relu_clip1, -__global FLT4* dst_data, - int2 kernel_size, - int2 stride, - int2 padding, - int2 dilation, - int4 src_size, - int4 dst_size) { +__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, + float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -200,9 +155,9 @@ __global FLT4* dst_data, bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { FLT4 f = filter[fx_c]; - FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; + FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; r += TO_FLT4(src_final * f); - }; + } fx_c++; } } @@ -212,18 +167,9 @@ __global FLT4* dst_data, dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; } -__kernel void DepthwiseConv2d_BUF_NHWC4_1x1( -__global FLT4* src_data, -__global FLT4* filter, -__global FLT4* bias, - float relu_clip1, -__global FLT4* dst_data, - int2 kernel_size, - int2 stride, - int2 padding, - int2 dilation, - int4 src_size, - int4 dst_size) { +__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, + float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size) { int X = get_global_id(0); int Y = get_global_id(1); int Z = get_global_id(2); @@ -240,13 +186,13 @@ __global FLT4* dst_data, bool outside_x = x_c < 0 || x_c >= src_size.x; if (!outside_x && !outside_y) { FLT4 f = filter[fx_c]; - FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; + FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; r += TO_FLT4(src_final * f); - }; + } } } FLT4 bias_val = bias[Z]; FLT4 res0 = TO_FLT4(r) + bias_val; res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; -} \ No newline at end of file +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl index c08e3941275..e851282c4ba 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl @@ -1,8 +1,7 @@ #define FLT4 float4 #define FLT16 float16 -__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, - __global FLT4 *buffer, __global FLT4 *bias, int2 offset_ci, - int2 offset_co, int has_bias) { +__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, __global FLT4 *buffer, __global FLT4 *bias, + int2 offset_ci, int2 offset_co, int has_bias) { int2 gid = (int2)(get_global_id(0), get_global_id(1)); int2 lid = (int2)(get_local_id(0), get_local_id(1)); FLT4 s = (FLT4)(0.0f); @@ -28,4 +27,4 @@ __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, buffer[gid.x] = s; // memory pollution? or protected by opencl } -} \ No newline at end of file +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl index a1649c3dcf8..f1a5c69d94d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl @@ -1,16 +1,13 @@ #define SLICES 4 -int DivideRoundUp(int n, int div) -{ - int q = n / div; - return n % div == 0 ? q : q + 1; +int DivideRoundUp(int n, int div) { + int q = n / div; + return n % div == 0 ? q : q + 1; } -__kernel void SoftMax(__global float4 *input, - __global float4 *output, - const int4 input_shape) { - int X = get_global_id(0); // width - int Y = get_global_id(1); // height +__kernel void SoftMax(__global float4 *input, __global float4 *output, const int4 input_shape) { + int X = get_global_id(0); // width + int Y = get_global_id(1); // height int H = input_shape.y; int W = input_shape.z; int C = input_shape.w; @@ -32,4 +29,4 @@ __kernel void SoftMax(__global float4 *input, float4 result = convert_float4(t); output[(Y * W + X * H) * C + d] = result; } -} \ No newline at end of file +}