forked from mindspore-Ecosystem/mindspore
!15068 fix depthwise_conv2d multi batch bug
From: @yeyunpeng2020 Reviewed-by: @ddwsky,@zhanghaibo5 Signed-off-by: @ddwsky
This commit is contained in:
commit
cb3571a1ca
|
@ -7,10 +7,11 @@ __kernel void DepthwiseConv2d_IMG_NHWC4(__write_only image2d_t dst_data, __read_
|
|||
int X = get_global_id(1);
|
||||
int Y = get_global_id(2);
|
||||
int Z = get_global_id(0);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
if (X >= dst_size.x || Y >= (dst_size.y * dst_size.w) || Z >= dst_size.z) return;
|
||||
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
int x_offset = X * stride.x + padding.x;
|
||||
int y_offset = Y * stride.y + padding.y;
|
||||
int batch = Y / dst_size.y;
|
||||
int y_offset = (Y % dst_size.y) * stride.y + padding.y;
|
||||
int fx_c = Z * kernel_size.x * kernel_size.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = y_offset + ky * dilation.y;
|
||||
|
@ -20,7 +21,7 @@ __kernel void DepthwiseConv2d_IMG_NHWC4(__write_only image2d_t dst_data, __read_
|
|||
bool outside_x = x_c < 0 || x_c >= src_size.x;
|
||||
if (!outside_x && !outside_y) {
|
||||
FLT4 flt_p = READ_IMAGE(filter, smp_zero, (int2)(ky * kernel_size.x + kx, Z));
|
||||
FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
|
||||
FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c + batch * src_size.y));
|
||||
r += TO_FLT4(src_p * flt_p);
|
||||
}
|
||||
fx_c++;
|
||||
|
@ -39,10 +40,11 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__write_only image2d_t dst_data, __r
|
|||
int X = get_global_id(1);
|
||||
int Y = get_global_id(2);
|
||||
int Z = get_global_id(0);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
if (X >= dst_size.x || Y >= (dst_size.y * dst_size.w) || Z >= dst_size.z) return;
|
||||
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
int x_offset = X * stride.x + padding.x;
|
||||
int y_offset = Y * stride.y + padding.y;
|
||||
int batch = Y / dst_size.y;
|
||||
int y_offset = (Y % dst_size.y) * stride.y + padding.y;
|
||||
int fx_c = Z;
|
||||
{
|
||||
int y_c = y_offset;
|
||||
|
@ -52,7 +54,7 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__write_only image2d_t dst_data, __r
|
|||
bool outside_x = x_c < 0 || x_c >= src_size.x;
|
||||
if (!outside_x && !outside_y) {
|
||||
FLT4 flt_p = READ_IMAGE(filter, smp_zero, (int2)(0, Z));
|
||||
FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
|
||||
FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c + batch * src_size.y));
|
||||
r += TO_FLT4(src_p * flt_p);
|
||||
}
|
||||
}
|
||||
|
@ -67,22 +69,27 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b222(__write_only image2d_t dst_data, __
|
|||
int2 padding, int2 dilation, int4 src_size, int4 dst_size,
|
||||
float relu_clip_min, float relu_clip_max) {
|
||||
int X = get_global_id(1) * 2;
|
||||
int Y = get_global_id(2) * 2;
|
||||
int heightOfBlock = dst_size.y + (dst_size.y & 0x1);
|
||||
int batch = get_global_id(2) / (heightOfBlock >> 1);
|
||||
int Y = (get_global_id(2) * 2 - heightOfBlock * batch) + batch * dst_size.y;
|
||||
int Z = get_global_id(0) * 2;
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
if (X >= dst_size.x || Y >= (dst_size.y * dst_size.w) || Z >= dst_size.z) return;
|
||||
FLT4 r[8] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
|
||||
(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
|
||||
(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
|
||||
int x_offset = X * stride.x + padding.x;
|
||||
int y_offset = Y * stride.y + padding.y;
|
||||
int y_offset = (Y - batch * dst_size.y) * stride.y + padding.y;
|
||||
int f_len = kernel_size.x * kernel_size.y;
|
||||
int fx_c = Z * f_len;
|
||||
bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1);
|
||||
bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1);
|
||||
bool last_y =
|
||||
((get_global_id(2) - batch * (heightOfBlock >> 1)) == ((heightOfBlock >> 1) - 1)) && ((dst_size.y & 0x1) == 1);
|
||||
bool last_c = (get_global_id(0) == (dst_size.z + 1) / 2) && ((dst_size.z & 0x1) == 1);
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = y_offset + ky * dilation.y;
|
||||
int y_c_a1 = y_c + stride.y;
|
||||
y_c = y_c < 0 || y_c >= src_size.y ? src_size.y * src_size.z : y_c;
|
||||
y_c_a1 = y_c_a1 < 0 || y_c_a1 >= src_size.y ? src_size.y * src_size.z : y_c_a1;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = x_offset + kx * dilation.x;
|
||||
int x_c_a1 = x_c + stride.x;
|
||||
|
@ -92,25 +99,28 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b222(__write_only image2d_t dst_data, __
|
|||
FLT4 flt_p1 = filter[fx_c + f_len];
|
||||
{
|
||||
FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c));
|
||||
FLT4 src_p00_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_sign + x_c * src_size.z, y_c));
|
||||
FLT4 src_p00_c1 =
|
||||
READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_sign + x_c * src_size.z, y_c + batch * src_size.y));
|
||||
r[0] += TO_FLT4(src_p00_c0 * flt_p0);
|
||||
r[1] += TO_FLT4(src_p00_c1 * flt_p1);
|
||||
}
|
||||
{
|
||||
FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c));
|
||||
FLT4 src_p01_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_a1_sign + x_c_a1 * src_size.z, y_c));
|
||||
FLT4 src_p01_c1 =
|
||||
READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_a1_sign + x_c_a1 * src_size.z, y_c + batch * src_size.y));
|
||||
r[2] += TO_FLT4(src_p01_c0 * flt_p0);
|
||||
r[3] += TO_FLT4(src_p01_c1 * flt_p1);
|
||||
}
|
||||
{
|
||||
FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1));
|
||||
FLT4 src_p10_c1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c_a1));
|
||||
FLT4 src_p10_c1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c_a1 + batch * src_size.y));
|
||||
r[4] += TO_FLT4(src_p10_c0 * flt_p0);
|
||||
r[5] += TO_FLT4(src_p10_c1 * flt_p1);
|
||||
}
|
||||
{
|
||||
FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
|
||||
FLT4 src_p11_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
|
||||
FLT4 src_p11_c1 = READ_IMAGE(src_data, smp_zero,
|
||||
(int2)((Z + 1) * x_a1_sign + x_c_a1 * src_size.z, y_c_a1 + batch * src_size.y));
|
||||
r[6] += TO_FLT4(src_p11_c0 * flt_p0);
|
||||
r[7] += TO_FLT4(src_p11_c1 * flt_p1);
|
||||
}
|
||||
|
@ -161,33 +171,40 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__write_only image2d_t dst_data, __
|
|||
int2 padding, int2 dilation, int4 src_size, int4 dst_size,
|
||||
float relu_clip_min, float relu_clip_max) {
|
||||
int X = get_global_id(1) * 2;
|
||||
int Y = get_global_id(2) * 2;
|
||||
int heightOfBlock = dst_size.y + (dst_size.y & 0x1);
|
||||
int batch = get_global_id(2) / (heightOfBlock >> 1);
|
||||
int Y = (get_global_id(2) * 2 - heightOfBlock * batch) + batch * dst_size.y;
|
||||
int Z = get_global_id(0);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
if (X >= dst_size.x || Y >= (dst_size.y * dst_size.w) || Z >= dst_size.z) return;
|
||||
FLT4 r[4] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
|
||||
(FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
|
||||
int x_offset = X * stride.x + padding.x;
|
||||
int y_offset = Y * stride.y + padding.y;
|
||||
int y_offset = (Y - batch * dst_size.y) * stride.y + padding.y;
|
||||
int f_len = kernel_size.x * kernel_size.y;
|
||||
int fx_c = Z * f_len;
|
||||
bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1);
|
||||
bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1);
|
||||
bool last_y =
|
||||
((get_global_id(2) - batch * (heightOfBlock >> 1)) == ((heightOfBlock >> 1) - 1)) && ((dst_size.y & 0x1) == 1);
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = y_offset + ky * dilation.y;
|
||||
int y_c_a1 = y_c + stride.y;
|
||||
y_c = y_c < 0 || y_c >= src_size.y ? src_size.y * src_size.w : y_c;
|
||||
y_c_a1 = y_c_a1 < 0 || y_c_a1 >= src_size.y ? src_size.y * src_size.w : y_c_a1;
|
||||
for (int kx = 0; kx < kernel_size.x; ++kx) {
|
||||
int x_c = x_offset + kx * dilation.x;
|
||||
int x_c_a1 = x_c + stride.x;
|
||||
int x_sign = x_c < 0 ? -1 : 1;
|
||||
int x_a1_sign = x_c_a1 < 0 ? -1 : 1;
|
||||
FLT4 flt_p0 = filter[fx_c];
|
||||
FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c));
|
||||
FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c + batch * src_size.y));
|
||||
r[0] += TO_FLT4(src_p00_c0 * flt_p0);
|
||||
FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c));
|
||||
FLT4 src_p01_c0 =
|
||||
READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c + batch * src_size.y));
|
||||
r[1] += TO_FLT4(src_p01_c0 * flt_p0);
|
||||
FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1));
|
||||
FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1 + batch * src_size.y));
|
||||
r[2] += TO_FLT4(src_p10_c0 * flt_p0);
|
||||
FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
|
||||
FLT4 src_p11_c0 =
|
||||
READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1 + batch * src_size.y));
|
||||
r[3] += TO_FLT4(src_p11_c0 * flt_p0);
|
||||
|
||||
fx_c++;
|
||||
|
@ -217,17 +234,20 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1_b221(__write_only image2d_t dst_data
|
|||
int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size,
|
||||
float relu_clip_min, float relu_clip_max) {
|
||||
int X = get_global_id(1) * 2;
|
||||
int Y = get_global_id(2) * 2;
|
||||
int heightOfBlock = dst_size.y + (dst_size.y & 0x1);
|
||||
int batch = get_global_id(2) / (heightOfBlock >> 1);
|
||||
int Y = (get_global_id(2) * 2 - heightOfBlock * batch) + batch * dst_size.y;
|
||||
int Z = get_global_id(0);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
if (X >= dst_size.x || Y >= (dst_size.y * dst_size.w) || Z >= dst_size.z) return;
|
||||
FLT4 r[4] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
|
||||
(FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
|
||||
int x_offset = X * stride.x + padding.x;
|
||||
int y_offset = Y * stride.y + padding.y;
|
||||
int y_offset = (Y - batch * dst_size.y) * stride.y + padding.y;
|
||||
int f_len = kernel_size.x * kernel_size.y;
|
||||
int fx_c = Z * f_len;
|
||||
bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1);
|
||||
bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1);
|
||||
bool last_y =
|
||||
((get_global_id(2) - batch * (heightOfBlock >> 1)) == ((heightOfBlock >> 1) - 1)) && ((dst_size.y & 0x1) == 1);
|
||||
int y_c = y_offset;
|
||||
int y_c_a1 = y_c + stride.y;
|
||||
int x_c = x_offset;
|
||||
|
@ -235,13 +255,17 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1_b221(__write_only image2d_t dst_data
|
|||
int x_sign = x_c < 0 ? -1 : 1;
|
||||
int x_a1_sign = x_c_a1 < 0 ? -1 : 1;
|
||||
FLT4 flt_p0 = filter[fx_c];
|
||||
FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c));
|
||||
y_c = y_c < 0 || y_c >= src_size.y ? src_size.y * src_size.z : y_c;
|
||||
y_c_a1 = y_c_a1 < 0 || y_c_a1 >= src_size.y ? src_size.y * src_size.z : y_c_a1;
|
||||
FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c + batch * src_size.y));
|
||||
r[0] += TO_FLT4(src_p00_c0 * flt_p0);
|
||||
FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c));
|
||||
FLT4 src_p01_c0 =
|
||||
READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c + batch * src_size.y));
|
||||
r[1] += TO_FLT4(src_p01_c0 * flt_p0);
|
||||
FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1));
|
||||
FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1 + batch * src_size.y));
|
||||
r[2] += TO_FLT4(src_p10_c0 * flt_p0);
|
||||
FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
|
||||
FLT4 src_p11_c0 =
|
||||
READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1 + batch * src_size.y));
|
||||
r[3] += TO_FLT4(src_p11_c0 * flt_p0);
|
||||
|
||||
r[0] += bias[Z];
|
||||
|
@ -270,10 +294,11 @@ __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *dst_data, __global FLT4
|
|||
int X = get_global_id(0);
|
||||
int Y = get_global_id(1);
|
||||
int Z = get_global_id(2);
|
||||
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
|
||||
if (X >= dst_size.x || Y >= (dst_size.y * dst_size.w) || Z >= dst_size.z) return;
|
||||
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
|
||||
int x_offset = X * stride.x + padding.x;
|
||||
int y_offset = Y * stride.y + padding.y;
|
||||
int batch = Y / dst_size.y;
|
||||
int y_offset = (Y - batch * dst_size.y) * stride.y + padding.y;
|
||||
int fx_c = Z * kernel_size.x * kernel_size.y;
|
||||
for (int ky = 0; ky < kernel_size.y; ++ky) {
|
||||
int y_c = y_offset + ky * dilation.y;
|
||||
|
@ -283,7 +308,7 @@ __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *dst_data, __global FLT4
|
|||
bool outside_x = x_c < 0 || x_c >= src_size.x;
|
||||
if (!outside_x && !outside_y) {
|
||||
FLT4 flt_p = filter[fx_c];
|
||||
FLT4 src_p = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
|
||||
FLT4 src_p = src_data[(((Z)*src_size.y + (y_c + batch * src_size.y)) * src_size.x + (x_c))];
|
||||
r += TO_FLT4(src_p * flt_p);
|
||||
}
|
||||
fx_c++;
|
||||
|
|
|
@ -238,7 +238,7 @@ void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
|
|||
// set global
|
||||
size_t CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C);
|
||||
global_size_ = {CO4, (size_t)UP_DIV(out_info.W, block_size_.W),
|
||||
(size_t)UP_DIV(out_info.H * out_info.N, block_size_.H)};
|
||||
(size_t)UP_DIV(out_info.H, block_size_.H) * out_info.N};
|
||||
// set local
|
||||
const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
|
||||
int z = global_size_[0];
|
||||
|
|
Loading…
Reference in New Issue