!9445 [MS][LITE][CPU] support multi-batch for group convs

From: @fuzhiye
commit f85fbc4dd2
mindspore-ci-bot, 2020-12-07 09:48:43 +08:00 (committed by Gitee)
6 changed files with 56 additions and 81 deletions

File: ConvolutionFP16CPUKernel (fp16 convolution)

@@ -40,15 +40,13 @@ using mindspore::schema::Format::Format_NHWC;
 namespace mindspore::kernel {
 int ConvolutionFP16CPUKernel::InitWeightBias() {
   auto filter_tensor = in_tensors_.at(kWeightIndex);
-  int kernel_h = filter_tensor->Height();
-  int kernel_w = filter_tensor->Width();
   int in_channel = filter_tensor->Channel();
   int out_channel = filter_tensor->Batch();
   conv_param_->input_channel_ = in_channel;
   conv_param_->output_channel_ = out_channel;
-  int oc8 = UP_DIV(out_channel, C8NUM);
-  int kernel_plane = kernel_h * kernel_w;
-  int pack_weight_size = oc8 * C8NUM * in_channel * kernel_plane;
+  int oc8 = UP_ROUND(out_channel, C8NUM);
+  int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
+  int pack_weight_size = oc8 * in_channel * kernel_plane;
   // init weight
   auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
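
A note on the simplification above: UP_ROUND(x, C8NUM) already yields the channel count rounded up to a multiple of 8, so the separate UP_DIV block count and the extra `* C8NUM` factor collapse into one term. A minimal sketch, assuming the usual nnacl-style macro definitions (not shown in this diff):

#include <cassert>

// Assumed definitions, in the style of nnacl's op_base.h (not part of this diff).
#define C8NUM 8
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define UP_ROUND(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main() {
  for (int out_channel = 1; out_channel <= 64; ++out_channel) {
    // Old sizing: number of 8-channel blocks, times 8.
    int old_size = UP_DIV(out_channel, C8NUM) * C8NUM;
    // New sizing: round up to a multiple of 8 in one step.
    int new_size = UP_ROUND(out_channel, C8NUM);
    assert(old_size == new_size);
  }
  return 0;
}
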
@@ -69,15 +67,15 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
   }
   // init bias
-  bias_data_ = malloc(oc8 * C8NUM * sizeof(float16_t));
+  bias_data_ = malloc(oc8 * sizeof(float16_t));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias_data_ failed.";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, oc8 * C8NUM * sizeof(float16_t));
+  memset(bias_data_, 0, oc8 * sizeof(float16_t));
   auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
+    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
     for (int i = 0; i < out_channel; ++i) {
       fp16_bias_data[i] = (float16_t)ori_bias[i];
     }
@@ -89,9 +87,8 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
 int ConvolutionFP16CPUKernel::InitTmpBuffer() {
   const int cal_num = 16;
-  int in_channel = conv_param_->input_channel_;
-  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
-  int unit_size = kernel_plane * in_channel * cal_num * thread_count_;
+  int unit_size =
+      conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * cal_num * thread_count_;
   packed_input_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(unit_size * sizeof(float16_t)));
   if (packed_input_ == nullptr) {
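
To make the buffer sizing concrete: for a 3x3 kernel with 16 input channels, the 16-point tile (cal_num) and two threads, unit_size = 3 * 3 * 16 * 16 * 2 = 4608 float16 elements, i.e. 9 KiB of packed-input scratch space.
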
@@ -205,19 +202,13 @@ kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &i
 void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
                     const std::vector<lite::Tensor *> &new_outputs) {
   for (auto sub_conv : group_convs) {
-    if (sub_conv != nullptr) {
-      delete sub_conv;
-    }
+    delete sub_conv;
   }
   for (auto in_tensor : new_inputs) {
-    if (in_tensor != nullptr) {
-      delete in_tensor;
-    }
+    delete in_tensor;
   }
   for (auto out_tensor : new_outputs) {
-    if (out_tensor != nullptr) {
-      delete out_tensor;
-    }
+    delete out_tensor;
   }
 }
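
Dropping the null checks before delete is behavior-preserving: the C++ standard guarantees that delete on a null pointer does nothing. A minimal illustration:

#include <vector>

struct SubKernel {
  // Virtual destructor so deleting through a base pointer is well-defined.
  virtual ~SubKernel() = default;
};

int main() {
  std::vector<SubKernel *> kernels = {new SubKernel, nullptr, new SubKernel};
  for (auto *kernel : kernels) {
    delete kernel;  // deleting a null pointer is a guaranteed no-op
  }
  return 0;
}
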
@@ -332,8 +323,10 @@ kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor
   std::vector<int> in_shape;
   std::vector<int> out_shape;
+  int batch = inputs.front()->Batch();
+  conv_param->input_batch_ = batch;
+  conv_param->output_batch_ = batch;
   if (infered_flag) {
-    int batch = inputs.front()->Batch();
     conv_param->input_channel_ = new_in_channel;
     conv_param->output_channel_ = new_out_channel;
     CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
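
Hoisting batch out of the if (infered_flag) block is the creator-side half of the multi-batch fix: input_batch_ and output_batch_ on the sub-kernels' ConvParameter are now filled in even when infer shape has not run yet, and the batch-aware SeparateInput/PostConcat changes below depend on these fields.
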

File: GroupConvolutionFP16CPUKernel (fp16 group convolution)

@@ -77,11 +77,6 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
       return ret;
     }
     (const_cast<mindspore::lite::PrimitiveC *>(primitive_))->set_infer_flag(true);
-    ret = ReSize();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "ReSize fail!ret: " << ret;
-      return ret;
-    }
     // if infershape func is called in runtime stage, we should malloc memory and set shape info for outputs of sub
     // kernels here.
@@ -119,6 +114,11 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
         }
       }
     }
+    ret = ReSize();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "ReSize fail!ret: " << ret;
+      return ret;
+    }
   }
   auto outputs = this->out_tensors();
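
Note the reordering in the two hunks above: ReSize() now runs after the sub-kernels' output tensors have been allocated and given shape info on the runtime infer-shape path, rather than immediately after set_infer_flag(true), presumably so that resizing sees fully prepared sub-kernel outputs.
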
@@ -136,9 +136,7 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
 int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {
   // input may either be float32 or float16
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_plane = in_h * in_w;
+  int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
   int sub_in_channel = conv_param_->input_channel_;
   int ori_in_channel = sub_in_channel * group_num_;
   auto sub_in_data = group_convs_.at(group_id)->in_tensors().front()->data_c();
@@ -178,9 +176,7 @@ int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {
 void GroupConvolutionFP16CPUKernel::PostConcat(int group_id) {
   // output is must float16 data type
-  int out_h = conv_param_->output_h_;
-  int out_w = conv_param_->output_w_;
-  int out_plane = out_h * out_w;
+  int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   int sub_out_channel = conv_param_->output_channel_;
   int ori_out_channel = sub_out_channel * group_num_;
   auto sub_out_data = reinterpret_cast<float16_t *>(group_convs_.at(group_id)->out_tensors().front()->data_c());
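
These two one-line changes are what actually enable multi-batch group convolution: in NHWC layout the channel split is identical at every (batch, h, w) position, so growing the plane count from h * w to batch * h * w lets the existing per-plane copy loop cover all batches. A hedged sketch of the slicing logic with hypothetical standalone names, assuming NHWC float data:

#include <cstring>

// Hypothetical standalone version of the SeparateInput copy loop.
// ori_in_data: NHWC input with ori_in_channel channels (all groups).
// sub_in_data: NHWC input for one group with sub_in_channel channels.
// in_plane: batch * height * width after this commit (height * width before).
void SliceGroupInput(const float *ori_in_data, float *sub_in_data, int in_plane,
                     int ori_in_channel, int sub_in_channel, int group_id) {
  const float *src = ori_in_data + group_id * sub_in_channel;
  float *dst = sub_in_data;
  for (int i = 0; i < in_plane; ++i) {
    // Copy this group's contiguous channel slice at one NHWC position.
    memcpy(dst, src, sub_in_channel * sizeof(float));
    src += ori_in_channel;
    dst += sub_in_channel;
  }
}
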

File: ConvolutionCPUKernel (fp32 convolution)

@@ -37,18 +37,15 @@ using mindspore::schema::Format::Format_NHWC;
 namespace mindspore::kernel {
 int ConvolutionCPUKernel::InitWeightBias() {
   auto filter_tensor = in_tensors_.at(kWeightIndex);
-  int kernel_h = filter_tensor->Height();
-  int kernel_w = filter_tensor->Width();
   int in_channel = filter_tensor->Channel();
   int out_channel = filter_tensor->Batch();
   conv_param_->input_channel_ = in_channel;
   conv_param_->output_channel_ = out_channel;
-  int kernel_plane = kernel_h * kernel_w;
-  const int oc_block = C8NUM;
-  int oc_block_num = UP_DIV(out_channel, C8NUM);
-  int pack_weight_size = oc_block_num * oc_block * in_channel * kernel_plane;
+  int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
+  int oc_block_num = UP_ROUND(out_channel, C8NUM);
+  int pack_weight_size = oc_block_num * in_channel * kernel_plane;
-  auto origin_weight = reinterpret_cast<float *>(filter_tensor->MutableData());
+  auto origin_weight = reinterpret_cast<float *>(filter_tensor->data_c());
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed weight failed.";
@@ -57,15 +54,15 @@ int ConvolutionCPUKernel::InitWeightBias() {
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
   RowMajor2Col8Major(origin_weight, packed_weight_, out_channel, in_channel * kernel_plane);
-  bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));
+  bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias failed.";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float));
+  memset(bias_data_, 0, oc_block_num * sizeof(float));
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
+    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
     memcpy(bias_data_, ori_bias, out_channel * sizeof(float));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
@@ -74,13 +71,12 @@ int ConvolutionCPUKernel::InitWeightBias() {
 }
 int ConvolutionCPUKernel::InitTmpBuffer() {
-  int in_channel = conv_param_->input_channel_;
   MS_ASSERT(ctx_->allocator != nullptr);
 #ifdef ENABLE_ARM32
-  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * in_channel * C4NUM * thread_count_;
+  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * C4NUM * thread_count_;
 #else
-  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * in_channel * C12NUM * thread_count_;
+  int unit_size =
+      conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * C12NUM * thread_count_;
 #endif
   packed_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (packed_input_ == nullptr) {
@@ -124,9 +120,8 @@ int ConvolutionCPUKernel::ReSize() {
 }
 int ConvolutionCPUKernel::RunImpl(int task_id) {
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
-  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
+  auto ori_input_data = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->data_c());
+  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->data_c());
   ConvFp32(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), col_major_input_,
            output_addr, task_id, conv_param_);
   return RET_OK;
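
The MutableData() → data_c() substitution recurs throughout this commit. In the lite::Tensor API, data_c() simply returns the underlying data pointer, whereas MutableData() may also allocate the buffer on first access; by the time these kernels run, the buffers are already prepared, so the cheaper accessor suffices. (This reading of the API is inferred from the usage, not stated in the diff.)
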
@@ -171,19 +166,13 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
 void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
                 const std::vector<lite::Tensor *> &new_outputs) {
   for (auto sub_conv : group_convs) {
-    if (sub_conv != nullptr) {
-      delete sub_conv;
-    }
+    delete sub_conv;
   }
   for (auto in_tensor : new_inputs) {
-    if (in_tensor != nullptr) {
-      delete in_tensor;
-    }
+    delete in_tensor;
   }
   for (auto out_tensor : new_outputs) {
-    if (out_tensor != nullptr) {
-      delete out_tensor;
-    }
+    delete out_tensor;
   }
 }
@@ -304,8 +293,10 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
   } else {
     new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
   }
+  int batch = inputs.front()->Batch();
+  conv_param->input_batch_ = batch;
+  conv_param->output_batch_ = batch;
   if (infered_flag) {
-    int batch = inputs.front()->Batch();
     int in_h = inputs.front()->Height();
     int in_w = inputs.front()->Width();
     conv_param->input_channel_ = new_in_channel;

File: GroupConvolutionCPUKernel (fp32 group convolution)

@@ -82,11 +82,6 @@ int GroupConvolutionCPUKernel::PreProcess() {
       return ret;
     }
     (const_cast<mindspore::lite::PrimitiveC *>(primitive_))->set_infer_flag(true);
-    ret = ReSize();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "ReSize fail!ret: " << ret;
-      return ret;
-    }
     // if infershape func is called in runtime stage, we should malloc memory and set shape info for outputs of sub
     // kernels here.
@@ -124,6 +119,11 @@ int GroupConvolutionCPUKernel::PreProcess() {
         }
       }
     }
+    ret = ReSize();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "ReSize fail!ret: " << ret;
+      return ret;
+    }
   }
   auto outputs = this->out_tensors();
@@ -140,9 +140,7 @@ int GroupConvolutionCPUKernel::PreProcess() {
 }
 void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_plane = in_h * in_w;
+  int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
   int sub_in_channel = conv_param_->input_channel_;
   int ori_in_channel = sub_in_channel * group_num_;
   auto sub_in_data = reinterpret_cast<float *>(group_convs_.at(group_id)->in_tensors().front()->data_c());
@@ -156,9 +154,7 @@ void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
 }
 void GroupConvolutionCPUKernel::PostConcat(int group_id) {
-  int out_h = conv_param_->output_h_;
-  int out_w = conv_param_->output_w_;
-  int out_plane = out_h * out_w;
+  int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   int sub_out_channel = conv_param_->output_channel_;
   int ori_out_channel = sub_out_channel * group_num_;
   auto sub_out_data = reinterpret_cast<float *>(group_convs_.at(group_id)->out_tensors().front()->data_c());

File: ConvolutionInt8CPUKernel (int8 convolution)

@@ -60,9 +60,7 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
   auto filter_tensor = in_tensors_.at(kWeightIndex);
   auto input_channel = filter_tensor->Channel();
   auto output_channel = filter_tensor->Batch();
-  int kernel_h = filter_tensor->Height();
-  int kernel_w = filter_tensor->Width();
-  int kernel_plane = kernel_h * kernel_w;
+  int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
   conv_param_->input_channel_ = input_channel;
   conv_param_->output_channel_ = output_channel;
   int up_round_deep;
@@ -84,7 +82,7 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
   int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
   // init weight
-  auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->MutableData());
+  auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->data_c());
   packed_weight_ = reinterpret_cast<int8_t *>(malloc(pack_weight_size));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_weight_ failed.";
@@ -109,7 +107,7 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, bias_size);
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData());
+    auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->data_c());
     memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
@@ -210,9 +208,8 @@ int ConvolutionInt8CPUKernel::ReSize() {
 }
 int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = reinterpret_cast<int8_t *>(input_tensor->MutableData());
-  auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->MutableData());
+  auto ori_input_data = reinterpret_cast<int8_t *>(in_tensors_.at(kInputIndex)->data_c());
+  auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->data_c());
   ConvInt8(ori_input_data, packed_input_, matmul_packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_),
            output_addr, filter_zp_ptr_, input_sum_, task_id, conv_param_, matmul_func_, support_optimize_);
   return RET_OK;
@@ -325,9 +322,11 @@ kernel::LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor
   } else {
     new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
   }
+  int batch = inputs.front()->Batch();
+  conv_param->input_batch_ = batch;
+  conv_param->output_batch_ = batch;
   bool infered_flag = primitive != nullptr && primitive->infer_flag();
   if (infered_flag) {
-    int batch = inputs.front()->Batch();
     int in_h = inputs.front()->Height();
     int in_w = inputs.front()->Width();
     conv_param->input_channel_ = new_in_channel;

File: GroupConvolutionInt8CPUKernel (int8 group convolution)

@@ -27,7 +27,7 @@ using mindspore::schema::PrimitiveType_Conv2D;
 namespace mindspore::kernel {
 void GroupConvolutionInt8CPUKernel::SeparateInput(int group_id) {
-  int in_plane = conv_param_->input_h_ * conv_param_->input_w_;
+  int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
   int sub_in_channel = conv_param_->input_channel_;
   int ori_in_channel = sub_in_channel * group_num_;
   auto sub_in_data = reinterpret_cast<int8_t *>(group_convs_.at(group_id)->in_tensors().front()->data_c());
@@ -41,7 +41,7 @@ void GroupConvolutionInt8CPUKernel::SeparateInput(int group_id) {
 }
 void GroupConvolutionInt8CPUKernel::PostConcat(int group_id) {
-  int out_plane = conv_param_->output_h_ * conv_param_->output_w_;
+  int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   int sub_out_channel = conv_param_->output_channel_;
   int ori_out_channel = sub_out_channel * group_num_;
   auto sub_out_data = reinterpret_cast<int8_t *>(group_convs_.at(group_id)->out_tensors().front()->data_c());
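
Taken together, the fp16/fp32/int8 kernels all follow the same run pattern; a simplified sketch of the group-convolution flow these changes feed into (structure condensed from the kernels above, not a verbatim copy):

#include <vector>

struct SubConvKernel {
  int Run() { return 0; }  // stand-in for the per-group convolution kernel
};

struct GroupConvSketch {
  int group_num_ = 0;
  std::vector<SubConvKernel *> group_convs_;

  // With in_plane/out_plane including the batch dimension, one
  // SeparateInput/PostConcat pair per group now covers every batch.
  void SeparateInput(int group_id) { /* slice this group's input channels */ }
  void PostConcat(int group_id) { /* merge this group's output channels */ }

  int Run() {
    for (int i = 0; i < group_num_; ++i) {
      SeparateInput(i);           // gather group i's channels, all batches
      group_convs_.at(i)->Run();  // run the sub convolution
      PostConcat(i);              // scatter group i's outputs, all batches
    }
    return 0;
  }
};
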