!13249 [MS][LITE][GPU]op support n>1

From: @chenzupeng
Reviewed-by: @ddwsky,@zhanghaibo5
Signed-off-by: @zhanghaibo5
This commit is contained in:
mindspore-ci-bot 2021-03-13 10:59:54 +08:00 committed by Gitee
commit 75faccfcdc
13 changed files with 248 additions and 87 deletions

View File

@ -7,12 +7,13 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
int X = get_global_id(2); // N*H
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
int N = X / output_shape.y;
X = X % output_shape.y;
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
return;
}
@ -23,28 +24,30 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside_y = y_c < 0 || y_c >= input_shape.y;
bool outside_y = y_c < 0 || y_c >= input_shape.z;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
bool outside = outside_y || x_c < 0 || x_c >= input_shape.y;
r +=
!outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)) : (FLT4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result);
}
__kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
int X = get_global_id(2); // N*H
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
int N = X / output_shape.y;
X = X % output_shape.y;
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
return;
}
@ -55,28 +58,30 @@ __kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
bool outside_y = y_c < 0 || y_c >= input_shape.y;
bool outside_y = y_c < 0 || y_c >= input_shape.z;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f);
bool outside = outside_y || x_c < 0 || x_c >= input_shape.y;
r +=
!outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)) : (FLT4)(0.0f);
window_size += !outside ? 1.0f : 0.0f;
}
}
FLT4 result = TO_FLT4(divide_no_check(r, window_size));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(result, (FLT4)(0.f)));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(result, (FLT4)(0.f)));
}
__kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
int X = get_global_id(2); // N*H
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
int N = X / output_shape.y;
X = X % output_shape.y;
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
return;
}
@ -85,27 +90,28 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) continue;
if (y_c < 0 || y_c >= input_shape.z) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) continue;
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
if (x_c < 0 || x_c >= input_shape.y) continue;
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c));
maximum = max(src, maximum);
}
}
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), maximum);
}
__kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, const int4 output_shape, const int2 stride,
const int2 kernel_size, const int2 padding) {
// axis to dst tensor coordinate
int X = get_global_id(2);
int Y = get_global_id(1);
int Z = get_global_id(0);
int X = get_global_id(2); // N*H
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
int N = X / output_shape.y;
X = X % output_shape.y;
// boundary check
if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) {
return;
}
@ -114,13 +120,13 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o
int ys = Y * stride.y - padding.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = ys + ky;
if (y_c < 0 || y_c >= input_shape.y) continue;
if (y_c < 0 || y_c >= input_shape.z) continue;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = xs + kx;
if (x_c < 0 || x_c >= input_shape.x) continue;
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
if (x_c < 0 || x_c >= input_shape.y) continue;
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c));
maximum = max(src, maximum);
}
}
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f)));
WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(maximum, (FLT4)(0.f)));
}

View File

@ -4,16 +4,18 @@
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void resize_nearest_neighbor_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data,
int4 in_size, int4 out_size, float2 scale_factor) {
int X = get_global_id(2); // H
int X = get_global_id(2); // H * N
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
if (X >= out_size.y || Y >= out_size.z || Z >= out_size.w) {
if (X >= out_size.x * out_size.y || Y >= out_size.z || Z >= out_size.w) {
return;
}
int N = X / out_size.y;
X = X % out_size.y;
int src_x = (int)(X * scale_factor.x);
int src_y = (int)(Y * scale_factor.y);
WRITE_IMAGE(dst_data, (int2)(Y * out_size.w + Z, X),
READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, src_x)));
WRITE_IMAGE(dst_data, (int2)(Y * out_size.w + Z, N * out_size.y + X),
READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, N * in_size.y + src_x)));
}
__kernel void resize_nearest_neighbor_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data,
@ -32,25 +34,27 @@ __kernel void resize_nearest_neighbor_NC4HW4(__read_only image2d_t src_data, __w
__kernel void resize_bilinear_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 in_size,
int4 out_size, float2 scale_factor) {
int X = get_global_id(2); // H
int X = get_global_id(2); // H * N
int Y = get_global_id(1); // W
int Z = get_global_id(0); // C4
if (X >= out_size.y || Y >= out_size.z || Z >= out_size.w) {
if (X >= out_size.x * out_size.y || Y >= out_size.z || Z >= out_size.w) {
return;
}
int N = X / out_size.y;
X = X % out_size.y;
float scale_x = X * scale_factor.x;
float scale_y = Y * scale_factor.y;
int src_x = (int)(scale_x);
int src_y = (int)(scale_y);
int src_x_1 = min(src_x + 1, in_size.y - 1);
int src_y_1 = min(src_y + 1, in_size.z - 1);
FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, src_x));
FLT4 src1 = READ_IMAGE(src_data, smp_zero, (int2)(src_y_1 * in_size.w + Z, src_x));
FLT4 src2 = READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, src_x_1));
FLT4 src3 = READ_IMAGE(src_data, smp_zero, (int2)(src_y_1 * in_size.w + Z, src_x_1));
FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, N * in_size.y + src_x));
FLT4 src1 = READ_IMAGE(src_data, smp_zero, (int2)(src_y_1 * in_size.w + Z, N * in_size.y + src_x));
FLT4 src2 = READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, N * in_size.y + src_x_1));
FLT4 src3 = READ_IMAGE(src_data, smp_zero, (int2)(src_y_1 * in_size.w + Z, N * in_size.y + src_x_1));
FLT4 result =
mix(mix(src0, src1, TO_FLT(scale_y - src_y)), mix(src2, src3, TO_FLT(scale_y - src_y)), TO_FLT(scale_x - src_x));
WRITE_IMAGE(dst_data, (int2)(Y * out_size.w + Z, X), result);
WRITE_IMAGE(dst_data, (int2)(Y * out_size.w + Z, N * out_size.y + X), result);
}
__kernel void resize_bilinear_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 in_size,

View File

@ -8,20 +8,21 @@ __kernel void SoftmaxAxis3_NHWC4(__read_only image2d_t input, __write_only image
const int4 input_shape) {
int X = get_global_id(1); // H
int Y = get_global_id(0); // W
int n = get_global_id(2); // N
int H = input_shape.y;
int W = input_shape.z;
int C4 = input_shape.w;
if (X >= H || Y >= W) return;
if (n >= input_shape.x || X >= H || Y >= W) return;
// get max
float4 last = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X)));
float4 last = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, n * H + X)));
float input_max = last.x;
if (mask.y > 0.5f) input_max = max(input_max, last.y);
if (mask.z > 0.5f) input_max = max(input_max, last.z);
if (mask.w > 0.5f) input_max = max(input_max, last.w);
for (int d = 0; d < C4 - 1; ++d) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X)));
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, n * H + X)));
input_max = max(input_max, t.x);
input_max = max(input_max, t.y);
input_max = max(input_max, t.z);
@ -31,41 +32,42 @@ __kernel void SoftmaxAxis3_NHWC4(__read_only image2d_t input, __write_only image
float sum = 0.0f;
for (int d = 0; d < C4 - 1; ++d) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X)));
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, n * H + X)));
sum += dot(exp(t - input_max_f4), (float4)(1.f));
}
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X)));
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, n * H + X)));
sum += dot(exp(min(t - input_max_f4, 0)), mask);
for (int d = 0; d < C4 - 1; ++d) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X)));
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, n * H + X)));
result = exp(result - input_max_f4) / sum;
WRITE_IMAGE(output, (int2)(Y * C4 + d, X), TO_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(Y * C4 + d, n * H + X), OUT_FLT4(result));
}
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X)));
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, n * H + X)));
result = exp(min(result - input_max_f4, 0)) / sum;
result = result * mask;
WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, X), OUT_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, n * H + X), OUT_FLT4(result));
}
__kernel void SoftmaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
const int4 input_shape) {
int X = get_global_id(1); // W
int Y = get_global_id(0); // C4
int n = get_global_id(2); // N
int H = input_shape.y;
int W = input_shape.z;
int C4 = input_shape.w;
if (X >= W || Y >= C4) return;
if (n >= input_shape.x || X >= W || Y >= C4) return;
float4 sum = 0.0f;
for (int d = 0; d < H; ++d) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d)));
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, n * H + d)));
sum += exp(t);
}
for (int d = 0; d < H; ++d) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d)));
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, n * H + d)));
result = exp(result) / sum;
WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, d), OUT_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, n * H + d), OUT_FLT4(result));
}
}
@ -73,35 +75,38 @@ __kernel void SoftmaxAxis2_NHWC4(__read_only image2d_t input, __write_only image
const int4 input_shape) {
int X = get_global_id(1); // H
int Y = get_global_id(0); // C4
int n = get_global_id(2); // n
int H = input_shape.y;
int W = input_shape.z;
int C4 = input_shape.w;
if (X >= H || Y >= C4) return;
if (n >= input_shape.x || X >= H || Y >= C4) return;
float4 sum = 0.0f;
for (int d = 0; d < W; ++d) {
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X)));
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, n * H + X)));
sum += exp(t);
}
for (int d = 0; d < W; ++d) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X)));
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, n * H + X)));
result = exp(result) / sum;
WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, X), OUT_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, n * H + X), OUT_FLT4(result));
}
}
__kernel void Softmax1x1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
const int4 input_shape) {
int tid = get_local_id(0);
int n = get_global_id(1);
if (n >= input_shape.x) return;
int C4 = input_shape.w;
float sum = 0.0f;
for (size_t i = tid; i < C4 - 1; i += 32) {
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, n)));
sum += dot((float4)(1.0f), exp(src));
}
if ((C4 - 1) % 32 == tid) {
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(C4 - 1, 0)));
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(C4 - 1, n)));
sum += dot(convert_float4(mask), exp(src));
}
@ -123,8 +128,8 @@ __kernel void Softmax1x1_NHWC4(__read_only image2d_t input, __write_only image2d
barrier(CLK_LOCAL_MEM_FENCE);
sum = tmpx1[0];
for (size_t i = tid; i < C4; i += 32) {
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, n)));
result = exp(result) * sum;
WRITE_IMAGEOUT(output, (int2)(i, 0), OUT_FLT4(result));
WRITE_IMAGEOUT(output, (int2)(i, n), OUT_FLT4(result));
}
}

View File

@ -4,8 +4,11 @@ __kernel void space_to_batch_nd_NHWC4(__read_only image2d_t src_data, __write_on
int4 dst_size, int2 block_size, int4 paddings) {
int X = get_global_id(0); // c
int Y = get_global_id(1); // w
int Z = get_global_id(2); // h
if (X >= dst_size.x || Y >= dst_size.y || Y >= dst_size.z) {
int Z = get_global_id(2); // h * n_i
// (N,H*BH,W*BW,C) to (BH*BW*N,H,W,C)
int N_I = Z / dst_size.z;
Z = Z % dst_size.z;
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z || N_I >= src_size.w) {
return;
}
for (int i = 0; i < block_size.x; ++i) {
@ -13,8 +16,10 @@ __kernel void space_to_batch_nd_NHWC4(__read_only image2d_t src_data, __write_on
int w_org = Y * block_size.y + j - paddings.z;
int h_org = Z * block_size.x + i - paddings.x;
FLT4 res_data = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
res_data = READ_IMAGE(src_data, smp_zero, (int2)(w_org * dst_size.x + X, h_org));
WRITE_IMAGE(dst_data, (int2)(Y * dst_size.x + X, (i * block_size.y + j) * dst_size.z + Z), res_data);
if (h_org >= 0 && h_org < src_size.z)
res_data = READ_IMAGE(src_data, smp_zero, (int2)(w_org * dst_size.x + X, N_I * src_size.z + h_org));
WRITE_IMAGE(dst_data, (int2)(Y * dst_size.x + X, ((i * block_size.y + j) * src_size.w + N_I) * dst_size.z + Z),
res_data);
}
}
}

View File

@ -42,6 +42,10 @@ int PoolingOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
if (in_tensors_[0]->shape().size() != 4) {
MS_LOG(ERROR) << "Only support 4d tensor.";
return RET_ERROR;
}
if (parameter_->pool_mode_ != PoolMode_MaxPool && parameter_->pool_mode_ != PoolMode_AvgPool) {
MS_LOG(ERROR) << "Init `Pooling2d` kernel failed, unsupported pool mode!";
return RET_ERROR;
@ -88,7 +92,7 @@ int PoolingOpenCLKernel::Prepare() {
}
void PoolingOpenCLKernel::SetGlobalLocal() {
const size_t global_x = out_tensors_[0]->shape()[1];
const size_t global_x = out_tensors_[0]->shape()[1] * out_tensors_[0]->shape()[0];
const size_t global_y = out_tensors_[0]->shape()[2];
const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
global_size_ = {global_z, global_y, global_x};
@ -98,8 +102,8 @@ void PoolingOpenCLKernel::SetGlobalLocal() {
void PoolingOpenCLKernel::SetConstArgs() {
int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM);
cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices};
cl_int4 output_shape = {out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], out_tensors_[0]->shape()[3],
cl_int4 input_shape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], slices};
cl_int4 output_shape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
slices};
cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_};
cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_};

View File

@ -106,7 +106,7 @@ void ResizeOpenCLKernel::SetConstArgs() {
void ResizeOpenCLKernel::SetGlobalLocal() {
local_size_ = {};
auto out_shape = GpuTensorInfo(out_tensors_[0]);
global_size_ = {out_shape.Slice, out_shape.W, out_shape.H};
global_size_ = {out_shape.Slice, out_shape.W, out_shape.H * out_shape.N};
AlignGlobalLocal(global_size_, local_size_);
}

View File

@ -54,10 +54,6 @@ int SoftmaxOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << "Init Softmax kernel failed: Unsupported shape size: " << in_shape.size();
return RET_ERROR;
}
if (in_shape[0] > 1) {
MS_LOG(ERROR) << "Init Softmax kernel failed: Unsupported multi-batch.";
return RET_ERROR;
}
if (axis_ < 0) {
axis_ = in_shape.size() + axis_;
}
@ -104,8 +100,8 @@ int SoftmaxOpenCLKernel::Prepare() {
void SoftmaxOpenCLKernel::SetGlobalLocal() {
if (onexone_flag_) {
local_size_ = {32};
global_size_ = {32};
local_size_ = {32, 1};
global_size_ = {32, out_shape_.N};
} else {
size_t global_x, global_y;
if (axis_ == 1) {
@ -121,7 +117,7 @@ void SoftmaxOpenCLKernel::SetGlobalLocal() {
global_x = 1;
global_y = 1;
}
global_size_ = {global_x, global_y};
global_size_ = {global_x, global_y, out_shape_.N};
local_size_ = {};
}
AlignGlobalLocal(global_size_, local_size_);

View File

@ -85,7 +85,8 @@ void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() {
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()};
local_size_ = {1, 1, 1};
global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1],
(size_t)dst_size.s[2] * (size_t)(in_tensors_[0]->Batch())};
OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
}

View File

@ -239,10 +239,11 @@ int OpenCLSubGraph::Init() {
MS_ASSERT(tensor);
tensor->set_allocator(allocator_);
}
std::map<std::string, std::function<int(void)>> pass_manager{
std::vector<std::pair<std::string, std::function<int(void)>>> pass_manager{
{"FusionPass", std::bind(&OpenCLSubGraph::FusionPass, this)},
{"InsertOpsPass", std::bind(&OpenCLSubGraph::InsertOpsPass, this)},
{"UpdateTensorDataTypePass", std::bind(&OpenCLSubGraph::UpdateTensorDataTypePass, this)},
{"FusionPass", std::bind(&OpenCLSubGraph::FusionPass, this)}};
};
for (auto iv : pass_manager) {
auto ret = iv.second();
if (ret != RET_OK) {

View File

@ -23,7 +23,7 @@ class TestOpenCL_Pooling : public CommonTest {};
namespace {
// PrimitiveType_Pooling: src/ops/populate/pooling_populate.cc
OpParameter *CreateParameter(PoolMode pool_mode, int window_h, int window_w, int stride_h, int stride_w, int pad_u,
int pad_d, int pad_l, int pad_r, RoundMode round_mode = RoundMode_No,
int pad_d, int pad_l, int pad_r, RoundMode round_mode = RoundMode_Floor,
ActType act_type = ActType_No) {
auto *param = test::CreateParameter<PoolingParameter>(schema::PrimitiveType_MaxPoolFusion);
param->global_ = false;
@ -65,4 +65,27 @@ TEST_F(TestOpenCL_Pooling, Max) {
}
}
TEST_F(TestOpenCL_Pooling, AvgMultiBatch) {
std::vector<int> input_shape = {2, 2, 2, 4};
std::vector<int> output_shape = {2, 1, 1, 4};
float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
float output_data[] = {6, 7, 8, 9, 6, 7, 8, 9};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(PoolMode_AvgPool, 2, 2, 2, 2, 0, 0, 0, 0);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
}
}
TEST_F(TestOpenCL_Pooling, MaxMultiBatch) {
std::vector<int> input_shape = {2, 2, 2, 4};
std::vector<int> output_shape = {2, 1, 1, 4};
float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
float output_data[] = {12, 13, 14, 15, 12, 13, 14, 15};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(PoolMode_MaxPool, 2, 2, 2, 2, 0, 0, 0, 0);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
}
}
} // namespace mindspore::lite::opencl::test

View File

@ -83,4 +83,53 @@ TEST_F(TestOpenCL_Resize, NEAREST) {
}
}
TEST_F(TestOpenCL_Resize, BilinearBatch) {
schema::ResizeMethod method = schema::ResizeMethod_LINEAR;
int oh = 4;
int ow = 4;
bool align_corners = false;
std::vector<int> input_shape = {2, 2, 2, 1};
std::vector<int> output_shape = {2, oh, ow, 1};
float input_data[] = {0, 1, 2, 3, 0, 1, 2, 3};
float output_data[] = {0, 0.5, 1, 1, 1, 1.5, 2, 2, 2, 2.5, 3, 3, 2, 2.5, 3, 3,
0, 0.5, 1, 1, 1, 1.5, 2, 2, 2, 2.5, 3, 3, 2, 2.5, 3, 3};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(method, oh, ow, align_corners);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
}
}
TEST_F(TestOpenCL_Resize, Bilinear_AlignCornersBatch) {
schema::ResizeMethod method = schema::ResizeMethod_LINEAR;
int oh = 3;
int ow = 3;
bool align_corners = true;
std::vector<int> input_shape = {2, 2, 2, 1};
std::vector<int> output_shape = {2, oh, ow, 1};
float input_data[] = {0, 1, 2, 3, 0, 1, 2, 3};
float output_data[] = {0, 0.5, 1, 1, 1.5, 2, 2, 2.5, 3, 0, 0.5, 1, 1, 1.5, 2, 2, 2.5, 3};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(method, oh, ow, align_corners);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
}
}
TEST_F(TestOpenCL_Resize, NEARESTBatch) {
schema::ResizeMethod method = schema::ResizeMethod_NEAREST;
int oh = 4;
int ow = 4;
bool align_corners = false;
std::vector<int> input_shape = {2, 2, 2, 1};
std::vector<int> output_shape = {2, oh, ow, 1};
float input_data[] = {0, 1, 2, 3, 0, 1, 2, 3};
float output_data[] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3,
0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(method, oh, ow, align_corners);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
}
}
} // namespace mindspore::lite::opencl::test

View File

@ -71,4 +71,47 @@ TEST_F(TestOpenCL_SoftMax, 4D_axis1) {
}
}
TEST_F(TestOpenCL_SoftMax, 2D_axis1_N) {
int axis = 1;
std::vector<int> input_shape = {2, 10};
std::vector<int> output_shape = input_shape;
float input_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float output_data[] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable,
fp16_enable ? 2e-2 : 1e-5);
}
}
TEST_F(TestOpenCL_SoftMax, 4D_axis3_N) {
int axis = 3;
std::vector<int> input_shape = {2, 2, 1, 5};
std::vector<int> output_shape = input_shape;
float input_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float output_data[] = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable,
fp16_enable ? 2e-2 : 1e-5);
}
}
TEST_F(TestOpenCL_SoftMax, 4D_axis1_N) {
int axis = 1;
std::vector<int> input_shape = {2, 2, 1, 1};
std::vector<int> output_shape = input_shape;
float input_data[] = {1, 1, 1, 1};
float output_data[] = {0.5, 0.5, 0.5, 0.5};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable,
fp16_enable ? 2e-2 : 1e-5);
}
}
} // namespace mindspore::lite::opencl::test

View File

@ -32,6 +32,7 @@ OpParameter *CreateParameter(const std::vector<int> &block_sizes, const std::vec
for (int i = 0; i < paddings.size(); ++i) {
param->paddings_[i] = paddings[i];
}
param->m_ = 2;
return reinterpret_cast<OpParameter *>(param);
}
@ -83,4 +84,27 @@ TEST_F(TestOpenCL_SpaceToBatch, H2W2Pad2222) {
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
}
}
TEST_F(TestOpenCL_SpaceToBatch, H2W2Pad2222MultiBatch) {
std::vector<int> input_shape{2, 6, 6, 1};
std::vector<int> block_sizes = {2, 2};
std::vector<int> paddings = {2, 2, 2, 2};
auto output_shape = InferShape(input_shape, block_sizes, paddings);
float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71};
float output_data[] = {0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 12, 14, 16, 0, 0, 24, 26, 28, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 36, 38, 40, 0, 0, 48, 50, 52, 0, 0, 60, 62, 64, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 13, 15, 17, 0, 0, 25, 27, 29, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 37, 39, 41, 0, 0, 49, 51, 53, 0, 0, 61, 63, 65, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 18, 20, 22, 0, 0, 30, 32, 34, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 42, 44, 46, 0, 0, 54, 56, 58, 0, 0, 66, 68, 70, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 7, 9, 11, 0, 0, 19, 21, 23, 0, 0, 31, 33, 35, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 43, 45, 47, 0, 0, 55, 57, 59, 0, 0, 67, 69, 71, 0, 0, 0, 0, 0, 0};
for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(block_sizes, paddings);
TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable);
}
}
} // namespace mindspore::lite::opencl::test