!28227 [MSLITE][DEVELOP] optimize fused batchnorm to scale in runtime

Merge pull request !28227 from yangruoqi713/master
This commit is contained in:
i-robot 2021-12-29 01:14:31 +00:00 committed by Gitee
commit 25d4333693
5 changed files with 156 additions and 25 deletions

View File

@ -17,6 +17,7 @@
#include "src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.h"
#include "nnacl/fp16/batchnorm_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/scale_fp16.h"
#include "src/kernel_registry.h"
using mindspore::lite::KernelRegistrar;
@ -35,6 +36,35 @@ constexpr static int kOutOffsetIdx = 2;
constexpr static int kOutCurrentMeanIdx = 3;
constexpr static int kOutCurrentVarIdx = 4;
// new scale: -scale / sqrt(variance + eps)
// new bias: -scale * mean / sqrt(variance + eps) + bias
int FusedBatchnormFp16CPUKernel::Batchnorm2Scale(const void *scale_data, const void *bias_data, const void *mean_data,
const void *var_data, float eps, int kernel_num) {
auto ret = InitScaleParam();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init scale parameter when converting fused_batchnorm to scale.";
return RET_ERROR;
}
scale_ = malloc(in_tensors_.at(SECOND_INPUT)->Size());
CHECK_NULL_RETURN(scale_);
auto fp16_scale = reinterpret_cast<float16_t *>(scale_);
for (int i = 0; i < kernel_num; i++) {
fp16_scale[i] = (reinterpret_cast<const float16_t *>(scale_data))[i] /
sqrtf((reinterpret_cast<const float16_t *>(var_data))[i] + eps);
}
offset_ = malloc(in_tensors_.at(THIRD_INPUT)->Size());
CHECK_NULL_RETURN(offset_);
auto fp16_offset = reinterpret_cast<float16_t *>(offset_);
for (int i = 0; i < kernel_num; i++) {
fp16_offset[i] = (reinterpret_cast<const float16_t *>(bias_data))[i] -
(reinterpret_cast<const float16_t *>(mean_data))[i] * fp16_scale[i];
}
is_scale_ = true;
return RET_OK;
}
void FusedBatchnormFp16CPUKernel::CalcMeanVar(float16_t *in, float16_t *scale, float16_t *offset, float16_t *save_mean,
float16_t *save_variance) {
auto param = reinterpret_cast<BatchNormParameter *>(op_parameter_);
@ -73,10 +103,17 @@ int FusedBatchnormFp16CPUKernel::DoExecute(int task_id) {
reinterpret_cast<float16_t *>(in_tensors_.at(kInCurrentMeanIdx)->data()),
reinterpret_cast<float16_t *>(in_tensors_.at(kInCurrentVarIdx)->data()));
}
FusedBatchNormFp16(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data()), reinterpret_cast<float16_t *>(scale_),
reinterpret_cast<float16_t *>(offset_), reinterpret_cast<float16_t *>(mean_),
reinterpret_cast<float16_t *>(variance_), param, task_id,
reinterpret_cast<float16_t *>(out_tensors_.at(0)->data()));
if (is_scale_) {
DoScaleFp16(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data()),
reinterpret_cast<float16_t *>(out_tensors_.at(0)->data()), reinterpret_cast<float16_t *>(scale_),
reinterpret_cast<float16_t *>(offset_), task_id, scale_param_);
} else {
FusedBatchNormFp16(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data()), reinterpret_cast<float16_t *>(scale_),
reinterpret_cast<float16_t *>(offset_), reinterpret_cast<float16_t *>(mean_),
reinterpret_cast<float16_t *>(variance_), param, task_id,
reinterpret_cast<float16_t *>(out_tensors_.at(0)->data()));
}
return RET_OK;
}

View File

@ -30,6 +30,8 @@ class FusedBatchnormFp16CPUKernel : public FusedBatchnormCPUKernel {
int DoExecute(int task_id) override;
int Eval() override;
int Batchnorm2Scale(const void *scale_data, const void *bias_data, const void *mean_data, const void *var_data,
float eps, int kernel_num) override;
protected:
void CalcMeanVar(float16_t *in, float16_t *scale, float16_t *offset, float16_t *save_mean, float16_t *save_variance);

View File

@ -39,6 +39,10 @@ class BatchnormCPUKernel : public InnerKernel {
int SetupVirtualBatch(int virtual_batch_multiplier, int param) override;
virtual int InitConstTensor();
virtual int DoExecute(int task_id);
virtual int Batchnorm2Scale(const void *scale_data, const void *bias_data, const void *mean_data,
const void *var_data, float eps, int kernel_num) {
return RET_OK;
}
virtual int set_momentum(float momentum);
virtual float get_momentum();
virtual int RestoreDefaultMomentum();

View File

@ -23,12 +23,19 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_FusedBatchNorm;
namespace mindspore::kernel {
int FusedBatchnormCPUKernel::ReSize() {
CHECK_LESS_RETURN(in_tensors_.size(), DIMENSION_5D);
int FusedBatchnormCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), SIXTH_INPUT);
CHECK_LESS_RETURN(out_tensors_.size(), 1);
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}
int FusedBatchnormCPUKernel::ReSize() {
FillParam();
FreeMeanAndVariance();
FreeScaleAndOffset();
FillParam();
return InitConstTensor();
}
@ -41,29 +48,97 @@ void FusedBatchnormCPUKernel::FreeScaleAndOffset() {
free(offset_);
offset_ = nullptr;
}
if (scale_param_ != nullptr) {
free(scale_param_);
scale_param_ = nullptr;
}
}
int FusedBatchnormCPUKernel::InitScaleParam() {
scale_param_ = reinterpret_cast<ScaleParameter *>(malloc(sizeof(ScaleParameter)));
CHECK_NULL_RETURN(scale_param_);
scale_param_->op_parameter_.thread_num_ = ms_context_->thread_num_;
scale_param_->axis_ = kNHWC_C;
auto in_shape = in_tensors_[0]->shape();
CHECK_LESS_RETURN(in_shape.size(), DIMENSION_5D);
scale_param_->outer_size_ = 1;
for (auto i = 0; i < scale_param_->axis_; i++) {
scale_param_->outer_size_ *= in_shape[i];
}
scale_param_->axis_size_ = in_shape[DIMENSION_3D];
scale_param_->inner_size_ = 1;
return RET_OK;
}
// new scale: -scale / sqrt(variance + eps)
// new bias: -scale * mean / sqrt(variance + eps) + bias
int FusedBatchnormCPUKernel::Batchnorm2Scale(const void *scale_data, const void *bias_data, const void *mean_data,
const void *var_data, float eps, int kernel_num) {
auto ret = InitScaleParam();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init scale parameter when converting fused_batchnorm to scale.";
return RET_ERROR;
}
scale_ = malloc(in_tensors_.at(SECOND_INPUT)->Size());
CHECK_NULL_RETURN(scale_);
auto fp32_scale = reinterpret_cast<float *>(scale_);
for (int i = 0; i < kernel_num; i++) {
fp32_scale[i] =
(reinterpret_cast<const float *>(scale_data))[i] / sqrtf((reinterpret_cast<const float *>(var_data))[i] + eps);
}
offset_ = malloc(in_tensors_.at(THIRD_INPUT)->Size());
CHECK_NULL_RETURN(offset_);
auto fp32_offset = reinterpret_cast<float *>(offset_);
for (int i = 0; i < kernel_num; i++) {
fp32_offset[i] =
(reinterpret_cast<const float *>(bias_data))[i] - (reinterpret_cast<const float *>(mean_data))[i] * fp32_scale[i];
}
is_scale_ = true;
return RET_OK;
}
int FusedBatchnormCPUKernel::InitConstTensor() {
auto scale = in_tensors_.at(SECOND_INPUT);
auto offset = in_tensors_.at(THIRD_INPUT);
auto mean = in_tensors_.at(FOURTH_INPUT);
auto variance = in_tensors_.at(FIFTH_INPUT);
CHECK_NULL_RETURN(scale);
CHECK_NULL_RETURN(scale->data());
scale_ = malloc(scale->Size());
offset_ = malloc(offset->Size());
mean_ = malloc(mean->Size());
variance_ = malloc(variance->Size());
if (scale_ == nullptr || offset_ == nullptr || mean_ == nullptr || variance_ == nullptr) {
FreeMeanAndVariance();
FreeScaleAndOffset();
MS_LOG(ERROR) << "Memory allocation failed";
return RET_ERROR;
auto offset = in_tensors_.at(THIRD_INPUT);
CHECK_NULL_RETURN(offset);
CHECK_NULL_RETURN(offset->data());
auto mean = in_tensors_.at(FOURTH_INPUT);
CHECK_NULL_RETURN(mean);
CHECK_NULL_RETURN(mean->data());
auto variance = in_tensors_.at(FIFTH_INPUT);
CHECK_NULL_RETURN(variance);
CHECK_NULL_RETURN(variance->data());
auto param = reinterpret_cast<BatchNormParameter *>(op_parameter_);
CHECK_NULL_RETURN(param);
if (!op_parameter_->is_train_session_) {
auto ret = Batchnorm2Scale(reinterpret_cast<float *>(scale->data()), reinterpret_cast<float *>(offset->data()),
reinterpret_cast<float *>(mean->data()), reinterpret_cast<float *>(variance->data()),
param->epsilon_, scale->ElementsNum());
if (ret == RET_OK) {
return RET_OK;
} else {
FreeScaleAndOffset();
}
}
CHECK_NULL_RETURN(scale->data());
CHECK_NULL_RETURN(offset->data());
CHECK_NULL_RETURN(mean->data());
CHECK_NULL_RETURN(variance->data());
scale_ = malloc(in_tensors_.at(SECOND_INPUT)->Size());
CHECK_NULL_RETURN(scale_);
offset_ = malloc(in_tensors_.at(THIRD_INPUT)->Size());
CHECK_NULL_RETURN(offset_);
mean_ = malloc(in_tensors_.at(FOURTH_INPUT)->Size());
CHECK_NULL_RETURN(mean_);
variance_ = malloc(in_tensors_.at(FIFTH_INPUT)->Size());
CHECK_NULL_RETURN(variance_);
memcpy(scale_, scale->data(), scale->Size());
memcpy(offset_, offset->data(), offset->Size());
memcpy(mean_, mean->data(), mean->Size());
@ -141,8 +216,14 @@ int FusedBatchnormCPUKernel::DoExecute(int task_id) {
auto out_data = reinterpret_cast<float *>(out_tensors_.at(FIRST_INPUT)->data());
CHECK_NULL_RETURN(in_data);
CHECK_NULL_RETURN(out_data);
FusedBatchNormFp32(in_data, reinterpret_cast<float *>(scale_), reinterpret_cast<float *>(offset_),
reinterpret_cast<float *>(mean_), reinterpret_cast<float *>(variance_), param, task_id, out_data);
if (is_scale_) {
DoScale(in_data, out_data, reinterpret_cast<float *>(scale_), reinterpret_cast<float *>(offset_), task_id,
scale_param_);
} else {
FusedBatchNormFp32(in_data, reinterpret_cast<float *>(scale_), reinterpret_cast<float *>(offset_),
reinterpret_cast<float *>(mean_), reinterpret_cast<float *>(variance_), param, task_id,
out_data);
}
return RET_OK;
}

View File

@ -19,6 +19,7 @@
#include <vector>
#include "src/runtime/kernel/arm/fp32/batchnorm_fp32.h"
#include "nnacl/fp32/scale_fp32.h"
namespace mindspore::kernel {
class FusedBatchnormCPUKernel : public BatchnormCPUKernel {
@ -29,16 +30,22 @@ class FusedBatchnormCPUKernel : public BatchnormCPUKernel {
~FusedBatchnormCPUKernel() { FreeScaleAndOffset(); }
int Eval() override;
int Prepare() override;
int ReSize() override;
int Run() override;
int InitConstTensor() override;
int DoExecute(int task_id) override;
int Batchnorm2Scale(const void *scale_data, const void *bias_data, const void *mean_data, const void *var_data,
float eps, int kernel_num) override;
protected:
void FreeScaleAndOffset();
int InitScaleParam();
void *scale_ = nullptr;
void *offset_ = nullptr;
bool trained_ = false;
bool is_scale_ = false;
ScaleParameter *scale_param_ = nullptr;
};
} // namespace mindspore::kernel