!17294 optimize conv and depthwise

From: @yeyunpeng2020
Reviewed-by: @ddwsky,@HilbertDavid
Signed-off-by: @ddwsky
This commit is contained in:
mindspore-ci-bot 2021-05-31 09:40:55 +08:00 committed by Gitee
commit 5b30e3ce51
4 changed files with 935 additions and 11 deletions

View File

@ -308,6 +308,129 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
}
}
__kernel void Conv2D_H2W2C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
const int BlockH = 2;
const int BlockW = 2;
const int BlockC = 1;
DEFINE_ARGS;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int ow1 = ow + 1;
int co_slice0 = co_slice + 0;
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
for (int kh = 0; kh < KH; ++kh) {
int ih0 = kh * dilationH + oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH)
int ih1 = kh * dilationH + oh1 * strideH - padTop;
// check ih0 and ih1
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
for (int kw = 0; kw < KW; ++kw) {
int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2;
int x_idx0 = iw0 * CI_SLICES;
int x_idx1 = iw1 * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
x_idx0++;
x_idx1++;
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x;
out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x;
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y;
out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y;
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z;
out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z;
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w;
out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w;
weight_ptr += 4;
}
}
}
if (bias != 0) {
out_h0_w0_c0 += bias[co_slice0];
out_h0_w1_c0 += bias[co_slice0];
out_h1_w0_c0 += bias[co_slice0];
out_h1_w1_c0 += bias[co_slice0];
}
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h0_w1_c0);
DO_TANH(out_h1_w0_c0);
DO_TANH(out_h1_w1_c0);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
DO_LEAKY_RELU(out_h0_w1_c0, alpha);
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
DO_LEAKY_RELU(out_h1_w1_c0, alpha);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
}
if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
} // end if (oh1 < OH)
} else {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
} // end (oh1 < OH)
}
}
__kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
@ -676,3 +799,741 @@ __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2
} // end if (co_slice1 < CO_SLICES)
}
}
__kernel void Conv2D_H1W1C1_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
const int BlockH = 1;
const int BlockW = 1;
const int BlockC = 1;
DEFINE_ARGS;
int oh0 = oh + 0;
int n_oh0 = n * OH + oh0;
int ow0 = ow + 0;
int co_slice0 = co_slice + 0;
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
int ih0 = oh0 * strideH - padTop;
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int iw0 = ow0 * strideW - padLeft;
int x_idx0 = iw0 * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
x_idx0++;
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
weight_ptr += 4;
}
if (bias != 0) {
out_h0_w0_c0 += bias[co_slice0];
}
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
}
if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
} else {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
}
}
__kernel void Conv2D_H2W1C1_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
const int BlockH = 2;
const int BlockW = 1;
const int BlockC = 1;
DEFINE_ARGS;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int co_slice0 = co_slice + 0;
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
int ih0 = oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH)
int ih1 = oh1 * strideH - padTop;
// check ih0 and ih1
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
int iw0 = ow0 * strideW - padLeft;
int x_idx0 = iw0 * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
x_idx0++;
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
weight_ptr += 4;
}
if (bias != 0) {
out_h0_w0_c0 += bias[co_slice0];
out_h1_w0_c0 += bias[co_slice0];
}
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h1_w0_c0);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
}
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
} // end if (oh1 < OH)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
} // end (oh1 < OH)
#endif
}
__kernel void Conv2D_H2W1C2_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
const int BlockH = 2;
const int BlockW = 1;
const int BlockC = 2;
DEFINE_ARGS;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int co_slice0 = co_slice + 0;
int co_slice1 = co_slice + 1;
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
int ih0 = oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH)
int ih1 = oh1 * strideH - padTop;
// check ih0 and ih1
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
int iw0 = ow0 * strideW - padLeft;
int x_idx0 = iw0 * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
x_idx0++;
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x;
out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x;
out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y;
out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y;
out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z;
out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z;
out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w;
out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w;
weight_ptr += 8;
}
if (bias != 0) {
out_h0_w0_c0 += bias[co_slice0];
out_h1_w0_c0 += bias[co_slice0];
out_h0_w0_c1 += bias[co_slice1];
out_h1_w0_c1 += bias[co_slice1];
}
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h1_w0_c0);
DO_TANH(out_h0_w0_c1);
DO_TANH(out_h1_w0_c1);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
DO_LEAKY_RELU(out_h0_w0_c1, alpha);
DO_LEAKY_RELU(out_h1_w0_c1, alpha);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
}
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
} // end (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
#endif
}
__kernel void Conv2D_H2W2C1_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
const int BlockH = 2;
const int BlockW = 2;
const int BlockC = 1;
DEFINE_ARGS;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int ow1 = ow + 1;
int co_slice0 = co_slice + 0;
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
int ih0 = oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH)
int ih1 = oh1 * strideH - padTop;
// check ih0 and ih1
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
int iw0 = ow0 * strideW - padLeft;
int iw1 = (ow1 < OW) ? ow1 * strideW - padLeft : -2;
int x_idx0 = iw0 * CI_SLICES;
int x_idx1 = iw1 * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
x_idx0++;
x_idx1++;
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x;
out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x;
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y;
out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y;
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z;
out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z;
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w;
out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w;
weight_ptr += 4;
}
if (bias != 0) {
out_h0_w0_c0 += bias[co_slice0];
out_h0_w1_c0 += bias[co_slice0];
out_h1_w0_c0 += bias[co_slice0];
out_h1_w1_c0 += bias[co_slice0];
}
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h0_w1_c0);
DO_TANH(out_h1_w0_c0);
DO_TANH(out_h1_w1_c0);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
DO_LEAKY_RELU(out_h0_w1_c0, alpha);
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
DO_LEAKY_RELU(out_h1_w1_c0, alpha);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
}
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
} // end if (oh1 < OH)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
} // end (oh1 < OH)
#endif
}
__kernel void Conv2D_H2W2C2_1x1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
const int BlockH = 2;
const int BlockW = 2;
const int BlockC = 2;
DEFINE_ARGS;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int ow1 = ow + 1;
int co_slice0 = co_slice + 0;
int co_slice1 = co_slice + 1;
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
int ih0 = oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH)
int ih1 = oh1 * strideH - padTop;
// check ih0 and ih1
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
int iw0 = ow0 * strideW - padLeft;
int iw1 = (ow1 < OW) ? ow1 * strideW - padLeft : -2;
int x_idx0 = iw0 * CI_SLICES;
int x_idx1 = iw1 * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
x_idx0++;
x_idx1++;
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x;
out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x;
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y;
out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y;
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z;
out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z;
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w;
out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w;
out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x;
out_h0_w1_c1 += weight_ptr[4] * in_h0_w1.x;
out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x;
out_h1_w1_c1 += weight_ptr[4] * in_h1_w1.x;
out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y;
out_h0_w1_c1 += weight_ptr[5] * in_h0_w1.y;
out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y;
out_h1_w1_c1 += weight_ptr[5] * in_h1_w1.y;
out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z;
out_h0_w1_c1 += weight_ptr[6] * in_h0_w1.z;
out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z;
out_h1_w1_c1 += weight_ptr[6] * in_h1_w1.z;
out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w;
out_h0_w1_c1 += weight_ptr[7] * in_h0_w1.w;
out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w;
out_h1_w1_c1 += weight_ptr[7] * in_h1_w1.w;
weight_ptr += 8;
}
if (bias != 0) {
out_h0_w0_c0 += bias[co_slice0];
out_h0_w1_c0 += bias[co_slice0];
out_h1_w0_c0 += bias[co_slice0];
out_h1_w1_c0 += bias[co_slice0];
out_h0_w0_c1 += bias[co_slice1];
out_h0_w1_c1 += bias[co_slice1];
out_h1_w0_c1 += bias[co_slice1];
out_h1_w1_c1 += bias[co_slice1];
}
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f));
out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h0_w1_c0);
DO_TANH(out_h1_w0_c0);
DO_TANH(out_h1_w1_c0);
DO_TANH(out_h0_w0_c1);
DO_TANH(out_h0_w1_c1);
DO_TANH(out_h1_w0_c1);
DO_TANH(out_h1_w1_c1);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
DO_LEAKY_RELU(out_h0_w1_c0, alpha);
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
DO_LEAKY_RELU(out_h1_w1_c0, alpha);
DO_LEAKY_RELU(out_h0_w0_c1, alpha);
DO_LEAKY_RELU(out_h0_w1_c1, alpha);
DO_LEAKY_RELU(out_h1_w0_c1, alpha);
DO_LEAKY_RELU(out_h1_w1_c1, alpha);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1));
out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
}
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
} // end (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
#endif
}
__kernel void Conv2D_H2W2C2_Img_1x1(__read_only image2d_t input, __write_only image2d_t output,
__read_only image2d_t weight, __global FLT4 *bias, int4 input_shape,
int4 output_shape, int4 kernel_stride, int4 pad, int2 dilation, int act_type,
float alpha) {
const int BlockH = 2;
const int BlockW = 2;
const int BlockC = 2;
DEFINE_ARGS;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int ow1 = ow + 1;
int co_slice0 = co_slice + 0;
int co_slice1 = co_slice + 1;
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int filter_offset = 0;
int ih0 = oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH)
int ih1 = oh1 * strideH - padTop;
// check ih0 and ih1
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
int iw0 = ow0 * strideW - padLeft;
int iw1 = (ow1 < OW) ? ow1 * strideW - padLeft : -2;
int x_idx0 = iw0 * CI_SLICES;
int x_idx1 = iw1 * CI_SLICES;
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
x_idx0++;
x_idx1++;
FLT4 filter_ci0_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 0));
FLT4 filter_ci1_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 1));
FLT4 filter_ci2_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 2));
FLT4 filter_ci3_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 3));
FLT4 filter_ci0_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 0));
FLT4 filter_ci1_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 1));
FLT4 filter_ci2_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 2));
FLT4 filter_ci3_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 3));
filter_offset += 4;
out_h0_w0_c0 += filter_ci0_co0 * in_h0_w0.x;
out_h0_w1_c0 += filter_ci0_co0 * in_h0_w1.x;
out_h1_w0_c0 += filter_ci0_co0 * in_h1_w0.x;
out_h1_w1_c0 += filter_ci0_co0 * in_h1_w1.x;
out_h0_w0_c0 += filter_ci1_co0 * in_h0_w0.y;
out_h0_w1_c0 += filter_ci1_co0 * in_h0_w1.y;
out_h1_w0_c0 += filter_ci1_co0 * in_h1_w0.y;
out_h1_w1_c0 += filter_ci1_co0 * in_h1_w1.y;
out_h0_w0_c0 += filter_ci2_co0 * in_h0_w0.z;
out_h0_w1_c0 += filter_ci2_co0 * in_h0_w1.z;
out_h1_w0_c0 += filter_ci2_co0 * in_h1_w0.z;
out_h1_w1_c0 += filter_ci2_co0 * in_h1_w1.z;
out_h0_w0_c0 += filter_ci3_co0 * in_h0_w0.w;
out_h0_w1_c0 += filter_ci3_co0 * in_h0_w1.w;
out_h1_w0_c0 += filter_ci3_co0 * in_h1_w0.w;
out_h1_w1_c0 += filter_ci3_co0 * in_h1_w1.w;
out_h0_w0_c1 += filter_ci0_co1 * in_h0_w0.x;
out_h0_w1_c1 += filter_ci0_co1 * in_h0_w1.x;
out_h1_w0_c1 += filter_ci0_co1 * in_h1_w0.x;
out_h1_w1_c1 += filter_ci0_co1 * in_h1_w1.x;
out_h0_w0_c1 += filter_ci1_co1 * in_h0_w0.y;
out_h0_w1_c1 += filter_ci1_co1 * in_h0_w1.y;
out_h1_w0_c1 += filter_ci1_co1 * in_h1_w0.y;
out_h1_w1_c1 += filter_ci1_co1 * in_h1_w1.y;
out_h0_w0_c1 += filter_ci2_co1 * in_h0_w0.z;
out_h0_w1_c1 += filter_ci2_co1 * in_h0_w1.z;
out_h1_w0_c1 += filter_ci2_co1 * in_h1_w0.z;
out_h1_w1_c1 += filter_ci2_co1 * in_h1_w1.z;
out_h0_w0_c1 += filter_ci3_co1 * in_h0_w0.w;
out_h0_w1_c1 += filter_ci3_co1 * in_h0_w1.w;
out_h1_w0_c1 += filter_ci3_co1 * in_h1_w0.w;
out_h1_w1_c1 += filter_ci3_co1 * in_h1_w1.w;
}
if (bias != 0) {
out_h0_w0_c0 += bias[co_slice0];
out_h0_w1_c0 += bias[co_slice0];
out_h1_w0_c0 += bias[co_slice0];
out_h1_w1_c0 += bias[co_slice0];
out_h0_w0_c1 += bias[co_slice1];
out_h0_w1_c1 += bias[co_slice1];
out_h1_w0_c1 += bias[co_slice1];
out_h1_w1_c1 += bias[co_slice1];
}
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f));
out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h0_w1_c0);
DO_TANH(out_h1_w0_c0);
DO_TANH(out_h1_w1_c0);
DO_TANH(out_h0_w0_c1);
DO_TANH(out_h0_w1_c1);
DO_TANH(out_h1_w0_c1);
DO_TANH(out_h1_w1_c1);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0, alpha);
DO_LEAKY_RELU(out_h0_w1_c0, alpha);
DO_LEAKY_RELU(out_h1_w0_c0, alpha);
DO_LEAKY_RELU(out_h1_w1_c0, alpha);
DO_LEAKY_RELU(out_h0_w0_c1, alpha);
DO_LEAKY_RELU(out_h0_w1_c1, alpha);
DO_LEAKY_RELU(out_h1_w0_c1, alpha);
DO_LEAKY_RELU(out_h1_w1_c1, alpha);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1));
out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
}
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
} // end (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
#endif
}

View File

@ -143,8 +143,15 @@ void Conv2DOpenCLKernel::BuildKernel() {
if (filter_type_ == MemType::IMG) {
kernel_name << "_Img";
}
if (KW_ == 1 && KH_ == 1) {
kernel_name << "_1x1";
}
ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
auto build_options_ext = CreateBuildOptionsExtByDType(this->registry_data_type_);
std::string exceed_max_image_width_option =
(OW_ * CO_SLICES_ <= ocl_runtime_->GetMaxImage2DWidth()) ? "" : " -DEXCEDD_MAX_IMAGE2D_WIDTH";
build_options_ext.push_back(exceed_max_image_width_option);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str(), build_options_ext);
}
@ -155,25 +162,63 @@ void Conv2DOpenCLKernel::SetBlockSize() {
}
auto task_size = static_cast<float>(batch_size_ * OH_ * OW_ * CO_SLICES_);
auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits();
bool w_kernel_is_1 =
KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0;
bool h_kernel_is_1 =
KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0;
if (use_fp16_) {
SetMaliFp16BlockSize(task_size_per_cu, w_kernel_is_1, h_kernel_is_1);
} else {
SetMaliFp32BlockSize(task_size_per_cu, w_kernel_is_1, h_kernel_is_1);
}
}
void Conv2DOpenCLKernel::SetMaliFp32BlockSize(int task_size_per_cu, bool w_kernel_is_1, bool h_kernel_is_1) {
int block_size;
if (task_size_per_cu <= 256) {
block_size = 1;
} else if (task_size_per_cu <= 256 * 4) {
block_size = 2;
} else if (task_size_per_cu <= (use_fp16_ ? 256 * 8 : FLT_MAX)) {
} else if (task_size_per_cu <= FLT_MAX) {
block_size = 4;
} else {
block_size = 8;
}
bool w_kernel_is_1 =
KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0;
bool h_kernel_is_1 =
KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0;
if (!w_kernel_is_1 || !h_kernel_is_1) {
block_size = std::min(block_size, 4);
}
if (block_size == 8) {
block_size_ = {2, 2, 2};
} else if (block_size == 4) {
block_size_ = {2, 2, 1};
} else if (block_size == 2) {
block_size_ = {2, 1, 1};
} else {
block_size_ = {1, 1, 1};
}
}
void Conv2DOpenCLKernel::SetMaliFp16BlockSize(int task_size_per_cu, bool w_kernel_is_1, bool h_kernel_is_1) {
int block_size;
if (task_size_per_cu <= 256) {
block_size = 1;
} else if (task_size_per_cu <= 256 * 4) {
block_size = 2;
} else if (task_size_per_cu <= 256 * 8) {
block_size = 4;
} else {
block_size = 8;
}
if (!w_kernel_is_1 || !h_kernel_is_1) {
block_size = std::min(block_size, 4);
}
if (CO_SLICES_ >= 128 && OH_ >= 10 && OW_ >= 10) {
block_size = 8;
}
if (block_size == 8) {
block_size_ = {2, 2, 2};
} else if (block_size == 4) {

View File

@ -105,6 +105,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
private:
void SetBlockSize();
void SetMaliFp16BlockSize(int task_size_per_cu, bool w_kernel_is_1, bool h_kernel_is_1);
void SetMaliFp32BlockSize(int task_size_per_cu, bool w_kernel_is_1, bool h_kernel_is_1);
struct {
int H{1};
int W{1};

View File

@ -244,13 +244,29 @@ void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
global_size_ = {CO4, (size_t)UP_DIV(out_info.W, block_size_.W),
(size_t)UP_DIV(out_info.H, block_size_.H) * out_info.N};
// set local
const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
int z = global_size_[0];
int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size_[2], 8));
int x = std::max(1, std::min(static_cast<int>(global_size_[1]), max_group_size / (y * z)));
local_size_ = std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)});
int local_max = filter_type_ == MemType::IMG ? 64 : 128;
if (ocl_runtime_->DeviceComputeUnits() > 16) {
local_max = 256;
}
const int local_c_max = 16;
const int OH_threshold = 100;
const int OW_threshold = 100;
const int OC_threshold = 64;
size_t local_c = GetMaxDivisor(global_size_[0], local_c_max);
local_c = std::max<size_t>(local_c, 1);
size_t local_hw = local_max / local_c;
size_t local_h;
size_t local_w;
if (out_info.H >= OH_threshold && out_info.W >= OW_threshold && out_info.C <= OC_threshold) { // c -> w -> h
local_w = std::min(global_size_[1], local_hw);
local_h = std::min(local_hw / local_w, global_size_[2]);
} else { // c -> h -> w
local_h = std::min(global_size_[2], local_hw);
local_w = std::min(local_hw / local_h, global_size_[1]);
}
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
local_size_ = {local_c, local_w, local_h};
AlignGlobalLocal(global_size_, local_size_);
}
int DepthwiseConv2dOpenCLKernel::StoreConstData() {