!24251 ms opencl performance drop bugfix

Merge pull request !24251 from Greatpan/br003
This commit is contained in:
i-robot 2021-09-28 00:56:54 +00:00 committed by Gitee
commit 9d0553255a
1 changed files with 9 additions and 64 deletions

View File

@ -59,26 +59,14 @@ __kernel void SoftmaxAxis1_NHWC4(__read_only image2d_t input, __write_only image
if (n >= input_shape.x || X >= W || Y >= C4) return;
// get max
float input_max = 0.0f;
for (int d = 0; d < H; ++d) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, n * H + d)));
input_max = max(input_max, t.x);
input_max = max(input_max, t.y);
input_max = max(input_max, t.z);
input_max = max(input_max, t.w);
}
float4 input_max_f4 = (float4)(input_max, input_max, input_max, input_max);
// get sum
float4 sum = 0.0f;
for (int d = 0; d < H; ++d) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, n * H + d)));
sum += exp(t - input_max_f4);
sum += exp(t);
}
for (int d = 0; d < H; ++d) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, n * H + d)));
result = exp(result - input_max_f4) / sum;
result = exp(result) / sum;
WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, n * H + d), OUT_FLT4(result));
}
}
@ -94,26 +82,14 @@ __kernel void SoftmaxAxis2_NHWC4(__read_only image2d_t input, __write_only image
if (n >= input_shape.x || X >= H || Y >= C4) return;
// get max
float input_max = 0.0f;
for (int d = 0; d < W; ++d) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, n * H + X)));
input_max = max(input_max, t.x);
input_max = max(input_max, t.y);
input_max = max(input_max, t.z);
input_max = max(input_max, t.w);
}
float4 input_max_f4 = (float4)(input_max, input_max, input_max, input_max);
// get sum
float4 sum = 0.0f;
for (int d = 0; d < W; ++d) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, n * H + X)));
sum += exp(t - input_max);
sum += exp(t);
}
for (int d = 0; d < W; ++d) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, n * H + X)));
result = exp(result - input_max) / sum;
result = exp(result) / sum;
WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, n * H + X), OUT_FLT4(result));
}
}
@ -124,49 +100,18 @@ __kernel void Softmax1x1_NHWC4(__read_only image2d_t input, __write_only image2d
int n = get_global_id(1);
if (n >= input_shape.x) return;
int C4 = input_shape.w;
// get max
float4 last = convert_float4(READ_IMAGE(input, smp_zero, (int2)(C4 - 1, n)));
float input_max = last.x;
if (mask.y > 0.5f) input_max = max(input_max, last.y);
if (mask.z > 0.5f) input_max = max(input_max, last.z);
if (mask.w > 0.5f) input_max = max(input_max, last.w);
for (size_t i = tid; i < C4 - 1; i += 32) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, n)));
input_max = max(input_max, t.x);
input_max = max(input_max, t.y);
input_max = max(input_max, t.z);
input_max = max(input_max, t.w);
}
__local float4 tmp[8];
__local float *tmpx1 = (__local float *)tmp;
tmpx1[tid] = input_max;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
input_max = max(input_max, tmpx1[0]);
input_max = max(input_max, tmpx1[1]);
input_max = max(input_max, tmpx1[2]);
input_max = max(input_max, tmpx1[3]);
input_max = max(input_max, tmpx1[4]);
input_max = max(input_max, tmpx1[5]);
input_max = max(input_max, tmpx1[6]);
input_max = max(input_max, tmpx1[7]);
tmpx1[0] = input_max;
}
barrier(CLK_GLOBAL_MEM_FENCE);
float4 input_max_f4 = (float4)(tmpx1[0], tmpx1[0], tmpx1[0], tmpx1[0]);
// get sum
float sum = 0.0f;
for (size_t i = tid; i < C4 - 1; i += 32) {
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, n)));
sum += dot((float4)(1.0f), exp(src - input_max_f4));
sum += dot((float4)(1.0f), exp(src));
}
if ((C4 - 1) % 32 == tid) {
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(C4 - 1, n)));
sum += dot(convert_float4(mask), exp(src - input_max_f4));
sum += dot(convert_float4(mask), exp(src));
}
__local float4 tmp[8];
__local float *tmpx1 = (__local float *)tmp;
tmpx1[tid] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid == 0) {
@ -184,7 +129,7 @@ __kernel void Softmax1x1_NHWC4(__read_only image2d_t input, __write_only image2d
sum = tmpx1[0];
for (size_t i = tid; i < C4; i += 32) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, n)));
result = exp(result - input_max_f4) * sum;
result = exp(result) * sum;
WRITE_IMAGEOUT(output, (int2)(i, n), OUT_FLT4(result));
}
}