From 3b9ca7780e0744ade611bbede238f5c224b88f1a Mon Sep 17 00:00:00 2001 From: songhonglei413 Date: Wed, 19 Aug 2020 16:08:56 +0800 Subject: [PATCH] add op_fused_batchnorm_int8 --- mindspore/lite/src/populate_parameter.cc | 2 + .../runtime/kernel/arm/int8/batchnorm_int8.cc | 80 +++++++++++-- .../runtime/kernel/arm/int8/batchnorm_int8.h | 1 + .../kernel/arm/nnacl/batchnorm_parameter.h | 2 + .../kernel/arm/nnacl/int8/batchnorm_int8.c | 6 +- .../kernel/arm/int8/batchnorm_int8_test.cc | 105 +++++++++++++++++- 6 files changed, 184 insertions(+), 12 deletions(-) diff --git a/mindspore/lite/src/populate_parameter.cc b/mindspore/lite/src/populate_parameter.cc index b6c1702a24c..baca2b1d2ea 100644 --- a/mindspore/lite/src/populate_parameter.cc +++ b/mindspore/lite/src/populate_parameter.cc @@ -192,6 +192,7 @@ OpParameter *PopulateBatchNorm(const mindspore::lite::PrimitiveC *primitive) { } batch_norm_param->op_parameter_.type_ = primitive->Type(); batch_norm_param->epsilon_ = param->GetEpsilon(); + batch_norm_param->fused_ = false; return reinterpret_cast(batch_norm_param); } @@ -648,6 +649,7 @@ OpParameter *PopulateFusedBatchNorm(const mindspore::lite::PrimitiveC *primitive batch_norm_param->op_parameter_.type_ = primitive->Type(); auto param = dynamic_cast(primitive); batch_norm_param->epsilon_ = param->GetEpsilon(); + batch_norm_param->fused_ = true; return reinterpret_cast(batch_norm_param); } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc index b89fff55b9c..3c1539b3cc3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc @@ -27,6 +27,7 @@ using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_BatchNorm; +using mindspore::schema::PrimitiveType_FusedBatchNorm; namespace mindspore::kernel { 
BatchnormInt8CPUKernel::~BatchnormInt8CPUKernel() { @@ -82,22 +83,86 @@ int BatchnormInt8CPUKernel::InitConstTensor() { return RET_OK; } +int BatchnormInt8CPUKernel::InitFusedConstTensor() { + auto input = in_tensors_[0]; + auto scale = in_tensors_[1]; + auto offset = in_tensors_[2]; + auto mean = in_tensors_[3]; + auto variance = in_tensors_[4]; + auto output = out_tensors_[0]; + + auto scale_ptr = reinterpret_cast(scale->Data()); + auto offset_ptr = reinterpret_cast(offset->Data()); + auto mean_ptr = reinterpret_cast(mean->Data()); + auto var_ptr = reinterpret_cast(variance->Data()); + + alpha_addr_ = reinterpret_cast(malloc(mean->ElementsNum() * sizeof(float))); + if (alpha_addr_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + beta_addr_ = reinterpret_cast(malloc(variance->ElementsNum() * sizeof(float))); + if (beta_addr_ == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return RET_ERROR; + } + // compute alpha, beta; + // 0. tmp = (S6 * Sqrt(e + S5 * (q5 - Z5))); + // 1. A = S1 * S2 * (q2 - Z2) / tmp; + // 2. 
B = Z6 + (S3 * (q3 - Z3)) / S6 - A * Z1 - (S2 * S4 * (q2 - Z2) * (q4 - Z4)) / tmp; + auto eps = batchnorm_param_->epsilon_; + auto zp_in = input->GetQuantParams().front().zeroPoint; + auto zp_scale = scale->GetQuantParams().front().zeroPoint; + auto zp_offset = offset->GetQuantParams().front().zeroPoint; + auto zp_mean = mean->GetQuantParams().front().zeroPoint; + auto zp_var = variance->GetQuantParams().front().zeroPoint; + auto zp_out = output->GetQuantParams().front().zeroPoint; + auto s_in = input->GetQuantParams().front().scale; + auto s_scale = scale->GetQuantParams().front().scale; + auto s_offset = offset->GetQuantParams().front().scale; + auto s_mean = mean->GetQuantParams().front().scale; + auto s_var = variance->GetQuantParams().front().scale; + auto s_out = output->GetQuantParams().front().scale; + + float mul_12 = s_in * s_scale; + float mul_24 = s_scale * s_mean; + float div_36 = s_offset / s_out; + for (int i = 0; i < batchnorm_param_->channel_; ++i) { + float tmp = s_out * sqrt(eps + s_var * (var_ptr[i] - zp_var)); + float tmp_a = (mul_12 * (scale_ptr[i] - zp_scale)) / tmp; + float tmp_b = zp_out + div_36 * (offset_ptr[i] - zp_offset) - tmp_a * zp_in - + (mul_24 * (scale_ptr[i] - zp_scale) * (mean_ptr[i] - zp_mean)) / tmp; + alpha_addr_[i] = tmp_a; + beta_addr_[i] = tmp_b; + } + return RET_OK; +} + int BatchnormInt8CPUKernel::Init() { auto input_shapes = in_tensors_[0]->shape(); auto n_dim = input_shapes.size(); batchnorm_param_->channel_ = input_shapes[n_dim - 1]; - batchnorm_param_->unit_ = 1; + batchnorm_param_->units_ = 1; for (int i = 0; i < n_dim - 1; i++) { - batchnorm_param_->unit_ *= input_shapes[i]; + batchnorm_param_->units_ *= input_shapes[i]; } batchnorm_param_->op_parameter_.thread_num_ = MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_); - - auto ret = InitConstTensor(); - if (ret != 0) { - MS_LOG(ERROR) << "Batchnorm fp32 InitConstTensor failed."; - return RET_ERROR; + batchnorm_param_->unit_ = 
UP_DIV(batchnorm_param_->units_, batchnorm_param_->op_parameter_.thread_num_); + if (batchnorm_param_->fused_) { + auto ret = InitFusedConstTensor(); + if (ret != 0) { + MS_LOG(ERROR) << "FusedBatchnorm int8 InitFusedConstTensor failed."; + return RET_ERROR; + } + } else { + auto ret = InitConstTensor(); + if (ret != 0) { + MS_LOG(ERROR) << "Batchnorm int8 InitConstTensor failed."; + return RET_ERROR; + } } + return RET_OK; } @@ -165,4 +230,5 @@ kernel::LiteKernel *CpuBatchnormInt8KernelCreator(const std::vectorchannel_; c += param->op_parameter_.thread_num_) { - for (int u = 0; u < param->unit_; u++) { + int unit_st = task_id * param->unit_; + int unit_end = MSMIN((task_id + 1) * param->unit_, param->units_); + for (int u = unit_st; u < unit_end; u++) { + for (int c = 0; c < param->channel_; c++) { int32_t output_tmp = round(input_ptr[u * param->channel_ + c] * alpha_ptr[c] + beta_ptr[c]); output_tmp = output_tmp > 127 ? 127 : output_tmp; output_tmp = output_tmp < -128 ? -128 : output_tmp; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/batchnorm_int8_test.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/batchnorm_int8_test.cc index b49d302b6fa..ee246454560 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/batchnorm_int8_test.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/batchnorm_int8_test.cc @@ -27,6 +27,104 @@ class TestBatchnormInt8 : public mindspore::CommonTest { TestBatchnormInt8() {} }; +TEST_F(TestBatchnormInt8, FusedTest) { + std::vector in_data = {11, 41, 21, 51, 31, 61, -11, -41, -21, -51, -31, -61}; + std::vector in_data1 = {4, 4}; + std::vector in_data2 = {8, 33}; + std::vector in_data3 = {35, 55}; + std::vector in_data4 = {2, 3}; + std::vector inputs_tensor; + std::vector outputs_tensor; + + BatchNormParameter op_param; + op_param.op_parameter_.type_ = schema::PrimitiveType_FusedBatchNorm; + op_param.epsilon_ = 0.001f; + op_param.fused_ = true; + + std::vector shape = {1, 1, 6, 2}; + + 
lite::tensor::QuantArg input_quant_arg; + input_quant_arg.scale = 0.1; + input_quant_arg.zeroPoint = 1; + lite::tensor::QuantArg input_quant_arg_1; + input_quant_arg_1.scale = 0.5; + input_quant_arg_1.zeroPoint = 2; + lite::tensor::QuantArg input_quant_arg_2; + input_quant_arg_2.scale = 0.02; + input_quant_arg_2.zeroPoint = 3; + lite::tensor::QuantArg input_quant_arg_3; + input_quant_arg_3.scale = 0.5; + input_quant_arg_3.zeroPoint = 15; + lite::tensor::QuantArg input_quant_arg_4; + input_quant_arg_4.scale = 0.25; + input_quant_arg_4.zeroPoint = 1; + lite::tensor::QuantArg output_quant_arg; + output_quant_arg.scale = 0.8; + output_quant_arg.zeroPoint = 0; + + lite::tensor::Tensor input0_tensor; + lite::tensor::Tensor input1_tensor; + lite::tensor::Tensor input2_tensor; + lite::tensor::Tensor input3_tensor; + lite::tensor::Tensor input4_tensor; + inputs_tensor.push_back(&input0_tensor); + inputs_tensor.push_back(&input1_tensor); + inputs_tensor.push_back(&input2_tensor); + inputs_tensor.push_back(&input3_tensor); + inputs_tensor.push_back(&input4_tensor); + input0_tensor.SetData(in_data.data()); + input1_tensor.SetData(in_data1.data()); + input2_tensor.SetData(in_data2.data()); + input3_tensor.SetData(in_data3.data()); + input4_tensor.SetData(in_data4.data()); + input0_tensor.set_shape(shape); + input1_tensor.set_shape({2}); + input2_tensor.set_shape({2}); + input3_tensor.set_shape({2}); + input4_tensor.set_shape({2}); + input0_tensor.AddQuantParam(input_quant_arg); + input1_tensor.AddQuantParam(input_quant_arg_1); + input2_tensor.AddQuantParam(input_quant_arg_2); + input3_tensor.AddQuantParam(input_quant_arg_3); + input4_tensor.AddQuantParam(input_quant_arg_4); + + std::vector output(12); + // std::vector corr_out = {-18, -22, -16, -21, -14, -19, -22, -34, -24, -35, -26, -36 }; + std::vector corr_out = {-22, -28, -20, -26, -17, -24, -28, -42, -30, -44, -33, -46}; + lite::tensor::Tensor output0_tensor; + outputs_tensor.push_back(&output0_tensor); + 
output0_tensor.SetData(output.data()); + output0_tensor.set_shape(shape); + output0_tensor.AddQuantParam(output_quant_arg); + + kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_FusedBatchNorm}; + auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); + ASSERT_NE(creator, nullptr); + lite::Context ctx; + ctx.thread_num_ = 3; + kernel::LiteKernel *kernel = + creator(inputs_tensor, outputs_tensor, reinterpret_cast(&op_param), &ctx, desc, nullptr); + ASSERT_NE(kernel, nullptr); + + auto output_tensor_shape = output0_tensor.shape(); + kernel->Run(); + + printf("==================output data=================\n"); + for (int i = 0; i < output0_tensor.ElementsNum(); i++) { + printf("%d, ", output[i]); + } + std::cout << std::endl; + CompareOutputData(output.data(), corr_out.data(), output0_tensor.ElementsNum(), 0.001); + + input0_tensor.SetData(nullptr); + input1_tensor.SetData(nullptr); + input2_tensor.SetData(nullptr); + input3_tensor.SetData(nullptr); + input4_tensor.SetData(nullptr); + output0_tensor.SetData(nullptr); + MS_LOG(INFO) << "TestBathNormFp32 accuracy passed"; +} + TEST_F(TestBatchnormInt8, BNTest) { std::vector in_data = {11, 41, 21, 51, 31, 61, -11, -41, -21, -51, -31, -61}; std::vector in_data1 = {4, 14}; @@ -37,6 +135,7 @@ TEST_F(TestBatchnormInt8, BNTest) { BatchNormParameter op_param; op_param.op_parameter_.type_ = schema::PrimitiveType_BatchNorm; op_param.epsilon_ = 0.001f; + op_param.fused_ = false; std::vector shape = {1, 1, 6, 2}; @@ -50,7 +149,7 @@ TEST_F(TestBatchnormInt8, BNTest) { input_quant_arg_2.scale = 0.1; input_quant_arg_2.zeroPoint = -1; lite::tensor::QuantArg output_quant_arg; - output_quant_arg.scale = 1; + output_quant_arg.scale = 0.5; output_quant_arg.zeroPoint = 0; lite::tensor::Tensor input0_tensor; @@ -70,8 +169,7 @@ TEST_F(TestBatchnormInt8, BNTest) { input2_tensor.AddQuantParam(input_quant_arg_2); std::vector output(12); - // std::vector corr_out1 = {5, 17, 11, 22, 17, 
27, -6, -23, -12, -28, -18, -33}; - std::vector corr_out = {1, 2, 1, 2, 2, 3, -1, -2, -1, -3, -2, -3}; + std::vector corr_out = {1, 3, 2, 4, 3, 5, -2, -5, -3, -6, -4, -7}; lite::tensor::Tensor output0_tensor; outputs_tensor.push_back(&output0_tensor); @@ -87,6 +185,7 @@ TEST_F(TestBatchnormInt8, BNTest) { kernel::LiteKernel *kernel = creator(inputs_tensor, outputs_tensor, reinterpret_cast(&op_param), &ctx, desc, nullptr); ASSERT_NE(kernel, nullptr); + auto output_tensor_shape = output0_tensor.shape(); kernel->Run();