diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
index c3cc89ad1f1..7301135455c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
@@ -2,10 +2,10 @@
 #define INT4 int4
 #define INT2 int2
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
-__kernel void batch_normalization(__read_only image2d_t input, __read_only image2d_t scale,
-                                  __read_only image2d_t offset, __read_only image2d_t mean,
-                                  __read_only image2d_t variance, __write_only image2d_t output, const INT4 input_shape,
-                                  float epsilon) {
+__kernel void Batch_normalization_NHWC4(__read_only image2d_t input, __read_only image2d_t scale,
+                                        __read_only image2d_t offset, __read_only image2d_t mean,
+                                        __read_only image2d_t variance, __write_only image2d_t output,
+                                        const INT4 input_shape, float epsilon) {
   int X = get_global_id(0);  // H
   int Y = get_global_id(1);  // W
   int Z = get_global_id(2);  // C/4
@@ -25,3 +25,27 @@ __kernel void batch_normalization(__read_only image2d_t input, __read_only image
   result.w = result_scale.w * ((result.w - result_mean.w) / sqrt(result_var.w + epsilon)) + result_offset.w;
   WRITE_IMAGE(output, (int2)((Y)*input_shape.w + Z, (X)), result);
 }
+
+__kernel void Batch_normalization_NC4HW4(__read_only image2d_t input, __read_only image2d_t scale,
+                                         __read_only image2d_t offset, __read_only image2d_t mean,
+                                         __read_only image2d_t variance, __write_only image2d_t output,
+                                         const INT4 input_shape, float epsilon) {
+  int X = get_global_id(0);  // H
+  int Y = get_global_id(1);  // W
+  int Z = get_global_id(2);  // C/4
+  if (X >= input_shape.y || Y >= input_shape.z || Z >= input_shape.w) {
+    return;
+  }
+  FLT4 result = READ_IMAGE(input, smp_none, (int2)((Y), (Z * input_shape.y + X)));
+
+  FLT4 result_mean = READ_IMAGE(mean, smp_none, (int2)((0), (Z)));
+  FLT4 result_var = READ_IMAGE(variance, smp_none, (int2)((0), (Z)));
+  FLT4 result_scale = READ_IMAGE(scale, smp_none, (int2)((0), (Z)));
+  FLT4 result_offset = READ_IMAGE(offset, smp_none, (int2)((0), (Z)));
+
+  result.x = result_scale.x * ((result.x - result_mean.x) / sqrt(result_var.x + epsilon)) + result_offset.x;
+  result.y = result_scale.y * ((result.y - result_mean.y) / sqrt(result_var.y + epsilon)) + result_offset.y;
+  result.z = result_scale.z * ((result.z - result_mean.z) / sqrt(result_var.z + epsilon)) + result_offset.z;
+  result.w = result_scale.w * ((result.w - result_mean.w) / sqrt(result_var.w + epsilon)) + result_offset.w;
+  WRITE_IMAGE(output, (int2)((Y), (Z * input_shape.y + X)), result);
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
index 5c98f9dfe06..30c6c347ae3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 #include
-#include
 #include
 #include
 #include "src/kernel_registry.h"
@@ -35,7 +34,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
     im_dst_x = out_tensors_[0]->Width() * CO4;
     im_dst_y = out_tensors_[0]->Height();
   } else {
-    im_dst_y = out_tensors_[0]->Height() * CO4;
+    im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * CO4;
     im_dst_x = out_tensors_[0]->Width();
   }
   size_t img_dtype = CL_FLOAT;
@@ -50,17 +49,29 @@
   return RET_OK;
 }
 int BatchNormOpenCLKernel::Init() {
+  auto in_format = op_format_;
+  if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) {
+    MS_LOG(ERROR) << "input format(" << in_format << ") "
+                  << "format not support!";
+    return RET_ERROR;
+  }
+  in_ori_format_ = in_tensors_[0]->GetFormat();
+  in_tensors_[0]->SetFormat(op_format_);
+  out_ori_format_ = out_tensors_[0]->GetFormat();
+  out_tensors_[0]->SetFormat(op_format_);
+  std::string kernel_name = "Batch_normalization";
+  if (in_format == schema::Format_NC4HW4) {
+    kernel_name += "_NC4HW4";
+  } else if (in_format == schema::Format_NHWC4) {
+    kernel_name += "_NHWC4";
+  }
+
   std::set<std::string> build_options;
   std::string source = batchnorm_source;
-  std::string program_name = "batch_normalization";
-  std::string kernel_name = "batch_normalization";
+  std::string program_name = "Batch_normalization";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->LoadSource(program_name, source);
   ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
-  in_ori_format_ = in_tensors_[0]->GetFormat();
-  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
-  out_ori_format_ = out_tensors_[0]->GetFormat();
-  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
   return RET_OK;
 }

diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
index 7ab29a333a8..e9c90cb7765 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc
@@ -197,7 +197,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) {
     return;
   }
   auto *output_tensor =
-      new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type);
+      new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
   if (output_tensor == nullptr) {
     MS_LOG(INFO) << " init tensor failed ";
     delete tensor_data;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
index c0b744bfc9a..5c8830e26fb 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
@@ -180,8 +180,8 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) {
   MS_LOG(INFO) << " init tensors ";
   constexpr int INPUT_NUM = 3;
   std::array<std::vector<int>, INPUT_NUM> input_shapes = {
-    std::vector<int>{1, 2, 4, 8}, std::vector<int>{1, 2, 4, 8}, std::vector<int>{1, 2, 4, 8}};
-  std::vector<int> output_shape = {3, 2, 4, 8};
+    std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}};
+  std::vector<int> output_shape = {1, 48, 256, 80};
   auto data_type = kNumberTypeFloat32;
   auto tensor_type = schema::NodeType_ValueNode;
   std::vector<lite::tensor::Tensor *> inputs;
@@ -217,7 +217,7 @@
     }
     return;
   }
-  param->axis_ = 0;
+  param->axis_ = 1;
   auto *concat_kernel =
       new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
   if (concat_kernel == nullptr) {
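
Layout note (editor's sketch, not part of the patch): the new Batch_normalization_NC4HW4 kernel computes exactly the same per-channel normalization as the NHWC4 variant; only the image2d_t coordinate of the FLT4 pixel holding channel slice Z at spatial position (X, Y) changes. A minimal host-side C++ sketch of the two mappings, assuming batch == 1 and C4 = ceil(C / 4); the helper names below are illustrative and do not exist in the MindSpore sources:

#include <cstddef>
#include <utility>

// NHWC4 image: width = W * C4, height = H; the x coordinate packs (w, c4).
// Mirrors (Y * input_shape.w + Z, X) used by Batch_normalization_NHWC4.
std::pair<size_t, size_t> Nhwc4Pixel(size_t h, size_t w, size_t c4, size_t C4) {
  return {w * C4 + c4, h};
}

// NC4HW4 image: width = W, height = C4 * H; the y coordinate packs (c4, h).
// Mirrors (Y, Z * input_shape.y + X) used by Batch_normalization_NC4HW4.
std::pair<size_t, size_t> Nc4hw4Pixel(size_t h, size_t w, size_t c4, size_t H) {
  return {w, c4 * H + h};
}

This is also why GetImageSize() now reports Batch() * Height() * CO4 as the image height in the non-NHWC4 branch: in the NC4HW4 layout the channel slices are stacked along the image's y axis rather than packed into its width.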