diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl index defaae81f56..214d89b2320 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl @@ -35,12 +35,27 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, WRITE_IMAGE(output, (int2)(X, Y), in_c4); } -__kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { +__kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape, const int c4, + const int last_c4) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= img_shape.x || Y >= img_shape.y) return; + int C4 = X % c4; FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); - in_c4 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-in_c4)); + if (C4 < c4 - 1) { + in_c4 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-in_c4)); + } else { + in_c4.x = (FLT)(1.f) / ((FLT)(1.f) + exp(-in_c4.x)); + if (last_c4 > 1) { + in_c4.y = (FLT)(1.f) / ((FLT)(1.f) + exp(-in_c4.y)); + } + if (last_c4 > 2) { + in_c4.z = (FLT)(1.f) / ((FLT)(1.f) + exp(-in_c4.z)); + } + if (last_c4 > 3) { + in_c4.w = (FLT)(1.f) / ((FLT)(1.f) + exp(-in_c4.w)); + } + } WRITE_IMAGE(output, (int2)(X, Y), in_c4); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl index c0f53be9047..ae3cb51dc16 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl @@ -266,7 +266,7 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only __kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, float act_min, float act_max) { + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -281,14 +281,21 @@ __kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only i int b_w = Y < b_shape.z ? Y : b_shape.z - 1; int b_h = Z < b_shape.y ? Z : b_shape.y - 1; FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_h)); - FLT4 result = a + b; + FLT4 result; + if (broadcastC_flag == 0) { + result = a + b; + } else if (broadcastC_flag == 1) { + result = a.x + b; + } else { + result = a + b.x; + } result = clamp(result, (FLT)(act_min), (FLT)(act_max)); WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } __kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, float act_min, float act_max) { + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -303,14 +310,21 @@ __kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only i int b_w = Y < b_shape.z ? Y : b_shape.z - 1; int b_h = Z < b_shape.y ? Z : b_shape.y - 1; FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_h)); - FLT4 result = a - b; + FLT4 result; + if (broadcastC_flag == 0) { + result = a - b; + } else if (broadcastC_flag == 1) { + result = a.x - b; + } else { + result = a - b.x; + } result = clamp(result, (FLT)(act_min), (FLT)(act_max)); WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } __kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, float act_min, float act_max) { + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -325,14 +339,21 @@ __kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only i int b_w = Y < b_shape.z ? Y : b_shape.z - 1; int b_h = Z < b_shape.y ? Z : b_shape.y - 1; FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_h)); - FLT4 result = a * b; + FLT4 result; + if (broadcastC_flag == 0) { + result = a * b; + } else if (broadcastC_flag == 1) { + result = a.x * b; + } else { + result = a * b.x; + } result = clamp(result, (FLT)(act_min), (FLT)(act_max)); WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } __kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, float act_min, float act_max) { + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -347,7 +368,14 @@ __kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only i int b_w = Y < b_shape.z ? Y : b_shape.z - 1; int b_h = Z < b_shape.y ? Z : b_shape.y - 1; FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_h)); - FLT4 result = a / b; + FLT4 result; + if (broadcastC_flag == 0) { + result = a / b; + } else if (broadcastC_flag == 1) { + result = a.x / b; + } else { + result = a / b.x; + } result = clamp(result, (FLT)(act_min), (FLT)(act_max)); WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc index 805f7242873..fd3b6a4a2bf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc @@ -70,6 +70,12 @@ int ActivationOpenClKernel::SetArgs() { if (type_ == ActivationType_LEAKY_RELU) { ocl_runtime_->SetKernelArg(kernel_, arg_idx++, alpha_); } + if (type_ == ActivationType_SIGMOID) { + int c4 = outShape.Slice; + int last_c4 = outShape.C % 4 == 0 ? 4 : outShape.C % 4; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, c4); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, last_c4); + } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 659b6bb4d2b..71bfd3749f8 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -148,6 +148,13 @@ int ArithmeticOpenCLKernel::SetArgs() { auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)}; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); + int broadcastC_flag = 0; // do not need broadcast in C4 + if (inputs_nhwc_shapes_[0][3] == 1 && inputs_nhwc_shapes_[1][3] != 1) { + broadcastC_flag = 1; // BroadCast C4 in input0 + } else if (inputs_nhwc_shapes_[0][3] != 1 && inputs_nhwc_shapes_[1][3] == 1) { + broadcastC_flag = 2; // BroadCast C4 in input1 + } + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag); } else { cl_int2 output_shape{static_cast(global_size_[0]), static_cast(global_size_[1])}; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);