add judgement for CL

This commit is contained in:
pengyongrong 2020-08-03 10:55:53 +08:00
parent 6a1e6b01f7
commit d831f8df5e
9 changed files with 194 additions and 319 deletions

View File

@ -510,12 +510,8 @@ gene_ocl_program() {
build_opencl() { build_opencl() {
cd ${BASEPATH} cd ${BASEPATH}
if [[ ! -d "third_party/OpenCL-Headers" ]]; then git submodule update --init third_party/OpenCL-Headers
git submodule update --init third_party/OpenCL-Headers git submodule update --init third_party/OpenCL-CLHPP
fi
if [[ ! -d "third_party/OpenCL-CLHPP" ]]; then
git submodule update --init third_party/OpenCL-CLHPP
fi
if [[ "${OPENCL_OFFLINE_COMPILE}" == "on" ]]; then if [[ "${OPENCL_OFFLINE_COMPILE}" == "on" ]]; then
gene_ocl_program gene_ocl_program
else else

View File

@ -13,19 +13,9 @@
__constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; __constant sampler_t smp_edge = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void DepthwiseConv2d_NC4HW4( __kernel void DepthwiseConv2d_NC4HW4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
__global FLT4* src_data, float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
__global FLT4* filters, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
__global FLT4* biases,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size
) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
int Z = get_global_id(2); int Z = get_global_id(2);
@ -42,31 +32,21 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x; bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) { if (!outside_x && !outside_y) {
FLT4 f = filters[fx_c]; FLT4 f = filters[fx_c];
FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))]; FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
r += TO_ACCUM_TYPE(src_final * f); r += TO_ACCUM_TYPE(src_final * f);
}; }
fx_c++; fx_c++;
} }
} }
FLT4 bias_val = biases[Z]; FLT4 bias_val = biases[Z];
FLT4 res0 = TO_FLT4(r) + bias_val; FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0; dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
} }
__kernel void DepthwiseConv2d_NHWC4( __kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
__global FLT4* src_data, float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
__global FLT4* filters, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
__global FLT4* biases,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size
) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
int Z = get_global_id(2); int Z = get_global_id(2);
@ -83,9 +63,9 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x; bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) { if (!outside_x && !outside_y) {
FLT4 f = filters[fx_c]; FLT4 f = filters[fx_c];
FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
r += TO_ACCUM_TYPE(src_final * f); r += TO_ACCUM_TYPE(src_final * f);
}; }
fx_c++; fx_c++;
} }
} }
@ -93,4 +73,4 @@ __global FLT4* dst_data,
FLT4 res0 = TO_FLT4(r) + bias_val; FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
} }

View File

@ -1,9 +1,8 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define FLT4 half4 #define FLT4 half4
#define FLT16 half16 #define FLT16 half16
__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, __global FLT4 *buffer, __global FLT4 *bias,
__global FLT4 *buffer, __global FLT4 *bias, int2 offset_ci, int2 offset_ci, int2 offset_co, int has_bias) {
int2 offset_co, int has_bias) {
int2 gid = (int2)(get_global_id(0), get_global_id(1)); int2 gid = (int2)(get_global_id(0), get_global_id(1));
int2 lid = (int2)(get_local_id(0), get_local_id(1)); int2 lid = (int2)(get_local_id(0), get_local_id(1));
FLT4 s = (FLT4)(0.0f); FLT4 s = (FLT4)(0.0f);
@ -29,4 +28,4 @@ __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight,
buffer[gid.x] = s; buffer[gid.x] = s;
// memory pollution? or protected by opencl // memory pollution? or protected by opencl
} }
} }

View File

@ -31,7 +31,6 @@ __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, cons
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, __kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size, const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) { const int2 padding) {
@ -63,4 +62,4 @@ __kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only imag
} }
float4 result = convert_float4(r / window_size); float4 result = convert_float4(r / window_size);
write_imagef(output, (int2)(X, Y * output_shape.w + Z), result); write_imagef(output, (int2)(X, Y * output_shape.w + Z), result);
} }

View File

@ -51,4 +51,4 @@ __kernel void Concat3input(__global float *input0, __global float *input1, __glo
output[index_output] = input2[input_idx]; output[index_output] = input2[input_idx];
} }
} }
} }

View File

@ -1,191 +1,150 @@
#define CI_TILE 4 #define CI_TILE 4
#define CO_TILE 4 #define CO_TILE 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
//#define __global // #define __global
// #pragma OPENCL EXTENSION cl_arm_printf : enable
//#pragma OPENCL EXTENSION cl_arm_printf : enable __kernel void convolution_NHWC_OHWI(__global float *input, __global float *weight, __global float *bias,
__kernel void convolution_NHWC_OHWI(__global float *input,
__global float *weight,
__global float *bias,
__global float *output, __global float *output,
const int4 input_shape, // NHWC const int4 input_shape, // NHWC
const int4 output_shape, // NHWC const int4 output_shape, // NHWC
const int4 kernel_stride, // kernelHW_strideHW const int4 kernel_stride, // kernelHW_strideHW
const int4 pad) // top bottom left right const int4 pad) {
{ int ow = get_global_id(0);
int ow = get_global_id(0); int oh = get_global_id(1);
int oh = get_global_id(1); int co_slice = get_global_id(2);
int co_slice = get_global_id(2);
int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
int KH = kernel_stride.x, KW = kernel_stride.y; int KH = kernel_stride.x, KW = kernel_stride.y;
int strideH = kernel_stride.z, strideW = kernel_stride.w; int strideH = kernel_stride.z, strideW = kernel_stride.w;
int padTop = pad.x, padLeft = pad.z; int padTop = pad.x, padLeft = pad.z;
int CI_SLICES = UP_DIV(CI, CI_TILE); int CI_SLICES = UP_DIV(CI, CI_TILE);
int CO_SLICES = UP_DIV(CO, CO_TILE); int CO_SLICES = UP_DIV(CO, CO_TILE);
if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) if (oh >= OH || ow >= OW || co_slice >= CO_SLICES) return;
return;
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
for (int kh = 0; kh < KH; ++kh) for (int kh = 0; kh < KH; ++kh) {
{ int ih = kh + oh * strideH - padTop;
int ih = kh + oh * strideH - padTop; for (int kw = 0; kw < KW; ++kw) {
for (int kw = 0; kw < KW; ++kw) int iw = kw + ow * strideW - padLeft;
{ for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) {
int iw = kw + ow * strideW - padLeft; for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) {
for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) int ci = ci_slice * CI_TILE + ci_inner;
{ if (ci >= CI) break;
for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
{
int ci = ci_slice * CI_TILE + ci_inner;
if (ci >= CI)
break;
int input_idx = ih * IW * CI + iw * CI + ci; int input_idx = ih * IW * CI + iw * CI + ci;
float value = 0; float value = 0;
if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
value = 0; value = 0;
else else
value = input[input_idx]; value = input[input_idx];
int CO_OFFSET = KH * KW * CI; int CO_OFFSET = KH * KW * CI;
int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + kh * KW * CI + kw * CI + ci;
kh * KW * CI + acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
kw * CI + acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
ci; acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
acc.x += weight[weight_idx + 0 * CO_OFFSET] * value; acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
}
}
} }
}
} }
int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE); }
if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE);
{ if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) {
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2]; output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3]; output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3];
} } else if (CO % CO_TILE == 1) {
else if (CO % CO_TILE == 1) output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
{ } else if (CO % CO_TILE == 2) {
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
} output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
else if (CO % CO_TILE == 2) } else if (CO % CO_TILE == 3) {
{ output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0]; output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1]; output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
} }
else if (CO % CO_TILE == 3)
{
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
}
} }
// #pragma OPENCL EXTENSION cl_khr_fp16 : enable
// #define FLT4 half4
//#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//#define FLT4 half4
#define FLT4 float4 #define FLT4 float4
__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, __kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, __global FLT4 *weight, __global FLT4 *bias,
__global FLT4 *weight,
__global FLT4 *bias,
__global FLT4 *output, __global FLT4 *output,
const int4 input_shape, // NHWC const int4 input_shape, // NHWC
const int4 output_shape, // NHWC const int4 output_shape, // NHWC
const int4 kernel_stride, // kernelHW_strideHW const int4 kernel_stride, // kernelHW_strideHW
const int4 pad) // top bottom left right const int4 pad) {
{ int oh = get_global_id(0); // [0, OH)
int oh = get_global_id(0); // [0, OH) int ow = get_global_id(1); // [0, OW)
int ow = get_global_id(1); // [0, OW) int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )
int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )
int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z; int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z; int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
int CI_SLICES = UP_DIV(CI, CI_TILE); int CI_SLICES = UP_DIV(CI, CI_TILE);
int CO_SLICES = UP_DIV(CO, CO_TILE); int CO_SLICES = UP_DIV(CO, CO_TILE);
int KH = kernel_stride.x, KW = kernel_stride.y; int KH = kernel_stride.x, KW = kernel_stride.y;
int strideH = kernel_stride.z, strideW = kernel_stride.w; int strideH = kernel_stride.z, strideW = kernel_stride.w;
int padTop = pad.x, padLeft = pad.z; int padTop = pad.x, padLeft = pad.z;
if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES) if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES) return;
return; if (2 * co_slice + 1 >= CO_SLICES) {
if (2 * co_slice + 1 >= CO_SLICES) FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
{ __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); for (int kh = 0; kh < KH; ++kh) {
__global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; int ih = kh + oh * strideH - padTop;
for (int kh = 0; kh < KH; ++kh) for (int kw = 0; kw < KW; ++kw) {
{ int iw = kw + ow * strideW - padLeft;
int ih = kh + oh * strideH - padTop; if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
for (int kw = 0; kw < KW; ++kw) for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
{ FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice];
int iw = kw + ow * strideW - padLeft; out0_c4 += w0_ic1_oc4[0] * in_c4.x;
if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) out0_c4 += w0_ic1_oc4[1] * in_c4.y;
{ out0_c4 += w0_ic1_oc4[2] * in_c4.z;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) out0_c4 += w0_ic1_oc4[3] * in_c4.w;
{ w0_ic1_oc4 += 4;
FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice]; }
out0_c4 += w0_ic1_oc4[0] * in_c4.x; } else {
out0_c4 += w0_ic1_oc4[1] * in_c4.y; w0_ic1_oc4 += 4 * CI_SLICES;
out0_c4 += w0_ic1_oc4[2] * in_c4.z;
out0_c4 += w0_ic1_oc4[3] * in_c4.w;
w0_ic1_oc4 += 4;
}
}
else
{
w0_ic1_oc4 += 4 * CI_SLICES;
}
}
} }
output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; }
} }
else output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
{ } else {
FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE; __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
__global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE; __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE;
for (int kh = 0; kh < KH; ++kh) for (int kh = 0; kh < KH; ++kh) {
{ int ih = kh + oh * strideH - padTop;
int ih = kh + oh * strideH - padTop; for (int kw = 0; kw < KW; ++kw) {
for (int kw = 0; kw < KW; ++kw) int iw = kw + ow * strideW - padLeft;
{ if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
int iw = kw + ow * strideW - padLeft; int idx = ih * IW * CI_SLICES + iw * CI_SLICES;
if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
{ FLT4 in_c4 = input[idx + ci_slice];
int idx = ih * IW * CI_SLICES + iw * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
{
FLT4 in_c4 = input[idx + ci_slice];
out0_c4 += w0_ic1_oc4[0] * in_c4.x; out0_c4 += w0_ic1_oc4[0] * in_c4.x;
out0_c4 += w0_ic1_oc4[1] * in_c4.y; out0_c4 += w0_ic1_oc4[1] * in_c4.y;
out0_c4 += w0_ic1_oc4[2] * in_c4.z; out0_c4 += w0_ic1_oc4[2] * in_c4.z;
out0_c4 += w0_ic1_oc4[3] * in_c4.w; out0_c4 += w0_ic1_oc4[3] * in_c4.w;
w0_ic1_oc4 += 4; w0_ic1_oc4 += 4;
out1_c4 += w1_ic1_oc4[0] * in_c4.x; out1_c4 += w1_ic1_oc4[0] * in_c4.x;
out1_c4 += w1_ic1_oc4[1] * in_c4.y; out1_c4 += w1_ic1_oc4[1] * in_c4.y;
out1_c4 += w1_ic1_oc4[2] * in_c4.z; out1_c4 += w1_ic1_oc4[2] * in_c4.z;
out1_c4 += w1_ic1_oc4[3] * in_c4.w; out1_c4 += w1_ic1_oc4[3] * in_c4.w;
w1_ic1_oc4 += 4; w1_ic1_oc4 += 4;
} }
} } else {
else w0_ic1_oc4 += 4 * CI_SLICES;
{ w1_ic1_oc4 += 4 * CI_SLICES;
w0_ic1_oc4 += 4 * CI_SLICES;
w1_ic1_oc4 += 4 * CI_SLICES;
}
}
} }
output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0]; }
output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1];
} }
} output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1];
}
}

View File

@ -8,18 +8,9 @@
#define TO_FLT4 convert_float4 #define TO_FLT4 convert_float4
#endif #endif
__constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t sampler_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void DepthwiseConv2d_IMG_NC4HW4( __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
__read_only image2d_t src_data, float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
__global FLT4* filter, int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
__global FLT4* bias,
float relu_clip1,
__write_only image2d_t dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
int Z = get_global_id(2); int Z = get_global_id(2);
@ -36,32 +27,23 @@ __write_only image2d_t dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x; bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) { if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c]; FLT4 f = filter[fx_c];
//FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))]; // FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
FLT4 src_final =read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c))); FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c)));
r += TO_FLT4(src_final * f); r += TO_FLT4(src_final * f);
}; }
fx_c++; fx_c++;
} }
} }
FLT4 bias_val = bias[Z]; FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val; FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
//dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0; // dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0); write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0);
} }
__kernel void DepthwiseConv2d_IMG_NHWC4( __kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
__read_only image2d_t src_data, float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
__global FLT4* filter, int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
__global FLT4* bias,
float relu_clip1,
__write_only image2d_t dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
int Z = get_global_id(2); int Z = get_global_id(2);
@ -78,32 +60,23 @@ __write_only image2d_t dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x; bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) { if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c]; FLT4 f = filter[fx_c];
//FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_final =read_imagef(src_data, sampler_zero, (int2)(Z+x_c*src_size.z, y_c)); FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z + x_c * src_size.z, y_c));
r += TO_FLT4(src_final * f); r += TO_FLT4(src_final * f);
}; }
fx_c++; fx_c++;
} }
} }
FLT4 bias_val = bias[Z]; FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val; FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
//dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
write_imagef(dst_data, (int2)(X*dst_size.z+Z, Y), res0); write_imagef(dst_data, (int2)(X * dst_size.z + Z, Y), res0);
} }
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1( __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
__read_only image2d_t src_data, float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
__global FLT4* filter, int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
__global FLT4* bias,
float relu_clip1,
__write_only image2d_t dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
int Z = get_global_id(2); int Z = get_global_id(2);
@ -120,30 +93,21 @@ __write_only image2d_t dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x; bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) { if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c]; FLT4 f = filter[fx_c];
//FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; // FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z)); FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
r += TO_FLT4(src_final * f); r += TO_FLT4(src_final * f);
}; }
} }
} }
FLT4 bias_val = bias[Z]; FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val; FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
//dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; // dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0); write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0);
} }
__kernel void DepthwiseConv2d_BUF_NC4HW4( __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
__global FLT4* src_data, float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
__global FLT4* filter, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
__global FLT4* bias,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
int Z = get_global_id(2); int Z = get_global_id(2);
@ -160,30 +124,21 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x; bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) { if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c]; FLT4 f = filter[fx_c];
FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))]; FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
r += TO_FLT4(src_final * f); r += TO_FLT4(src_final * f);
}; }
fx_c++; fx_c++;
} }
} }
FLT4 bias_val = bias[Z]; FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val; FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0; dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
} }
__kernel void DepthwiseConv2d_BUF_NHWC4( __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
__global FLT4* src_data, float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
__global FLT4* filter, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
__global FLT4* bias,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
int Z = get_global_id(2); int Z = get_global_id(2);
@ -200,9 +155,9 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x; bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) { if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c]; FLT4 f = filter[fx_c];
FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
r += TO_FLT4(src_final * f); r += TO_FLT4(src_final * f);
}; }
fx_c++; fx_c++;
} }
} }
@ -212,18 +167,9 @@ __global FLT4* dst_data,
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
} }
__kernel void DepthwiseConv2d_BUF_NHWC4_1x1( __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
__global FLT4* src_data, float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
__global FLT4* filter, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
__global FLT4* bias,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
int Z = get_global_id(2); int Z = get_global_id(2);
@ -240,13 +186,13 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x; bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) { if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c]; FLT4 f = filter[fx_c];
FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)]; FLT4 src_final = src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
r += TO_FLT4(src_final * f); r += TO_FLT4(src_final * f);
}; }
} }
} }
FLT4 bias_val = bias[Z]; FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val; FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1)); res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0; dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
} }

View File

@ -1,8 +1,7 @@
#define FLT4 float4 #define FLT4 float4
#define FLT16 float16 #define FLT16 float16
__kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight, __global FLT4 *buffer, __global FLT4 *bias,
__global FLT4 *buffer, __global FLT4 *bias, int2 offset_ci, int2 offset_ci, int2 offset_co, int has_bias) {
int2 offset_co, int has_bias) {
int2 gid = (int2)(get_global_id(0), get_global_id(1)); int2 gid = (int2)(get_global_id(0), get_global_id(1));
int2 lid = (int2)(get_local_id(0), get_local_id(1)); int2 lid = (int2)(get_local_id(0), get_local_id(1));
FLT4 s = (FLT4)(0.0f); FLT4 s = (FLT4)(0.0f);
@ -28,4 +27,4 @@ __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight,
buffer[gid.x] = s; buffer[gid.x] = s;
// memory pollution? or protected by opencl // memory pollution? or protected by opencl
} }
} }

View File

@ -1,16 +1,13 @@
#define SLICES 4 #define SLICES 4
int DivideRoundUp(int n, int div) int DivideRoundUp(int n, int div) {
{ int q = n / div;
int q = n / div; return n % div == 0 ? q : q + 1;
return n % div == 0 ? q : q + 1;
} }
__kernel void SoftMax(__global float4 *input, __kernel void SoftMax(__global float4 *input, __global float4 *output, const int4 input_shape) {
__global float4 *output, int X = get_global_id(0); // width
const int4 input_shape) { int Y = get_global_id(1); // height
int X = get_global_id(0); // width
int Y = get_global_id(1); // height
int H = input_shape.y; int H = input_shape.y;
int W = input_shape.z; int W = input_shape.z;
int C = input_shape.w; int C = input_shape.w;
@ -32,4 +29,4 @@ __kernel void SoftMax(__global float4 *input,
float4 result = convert_float4(t); float4 result = convert_float4(t);
output[(Y * W + X * H) * C + d] = result; output[(Y * W + X * H) * C + d] = result;
} }
} }