forked from mindspore-Ecosystem/mindspore
!9445 [MS][LITE][CPU] support multi-batch for group convs
From: @fuzhiye
Commit: f85fbc4dd2

@@ -40,15 +40,13 @@ using mindspore::schema::Format::Format_NHWC;
 namespace mindspore::kernel {
 int ConvolutionFP16CPUKernel::InitWeightBias() {
   auto filter_tensor = in_tensors_.at(kWeightIndex);
-  int kernel_h = filter_tensor->Height();
-  int kernel_w = filter_tensor->Width();
   int in_channel = filter_tensor->Channel();
   int out_channel = filter_tensor->Batch();
   conv_param_->input_channel_ = in_channel;
   conv_param_->output_channel_ = out_channel;
-  int oc8 = UP_DIV(out_channel, C8NUM);
-  int kernel_plane = kernel_h * kernel_w;
-  int pack_weight_size = oc8 * C8NUM * in_channel * kernel_plane;
+  int oc8 = UP_ROUND(out_channel, C8NUM);
+  int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
+  int pack_weight_size = oc8 * in_channel * kernel_plane;

   // init weight
   auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();

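The UP_DIV to UP_ROUND switch is a pure simplification: UP_ROUND(n, C8NUM) equals UP_DIV(n, C8NUM) * C8NUM, so the new `oc8 * in_channel * kernel_plane` allocates exactly the same padded buffer as the old `oc8 * C8NUM * in_channel * kernel_plane`. A minimal sketch, assuming the usual nnacl-style macro definitions:

// Assumed definitions (nnacl's op_base.h defines them along these lines):
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))            // ceil(x / y)
#define UP_ROUND(x, y) ((((x) + (y) - 1) / (y)) * (y))  // ceil(x / y) * y
// With out_channel = 20 and C8NUM = 8:
//   old: UP_DIV(20, 8) = 3 blocks   -> 3 * 8 * in_channel * kernel_plane elements
//   new: UP_ROUND(20, 8) = 24 lanes -> 24 * in_channel * kernel_plane elements
// Both size the packed weight buffer identically; the padding stays because the
// 8-lane kernels read whole channel blocks.
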
@@ -69,15 +67,15 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
   }

   // init bias
-  bias_data_ = malloc(oc8 * C8NUM * sizeof(float16_t));
+  bias_data_ = malloc(oc8 * sizeof(float16_t));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias_data_ failed.";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, oc8 * C8NUM * sizeof(float16_t));
+  memset(bias_data_, 0, oc8 * sizeof(float16_t));
   auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
+    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
     for (int i = 0; i < out_channel; ++i) {
       fp16_bias_data[i] = (float16_t)ori_bias[i];
     }

@@ -89,9 +87,8 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {

 int ConvolutionFP16CPUKernel::InitTmpBuffer() {
   const int cal_num = 16;
-  int in_channel = conv_param_->input_channel_;
-  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
-  int unit_size = kernel_plane * in_channel * cal_num * thread_count_;
+  int unit_size =
+    conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * cal_num * thread_count_;

   packed_input_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(unit_size * sizeof(float16_t)));
   if (packed_input_ == nullptr) {

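`unit_size` here is per-thread im2col scratch: each of the `thread_count_` workers materializes patches for `cal_num` output points at a time, and each point needs a `kernel_h_ * kernel_w_ * input_channel_` patch. A worked example with assumed sizes:

// Hypothetical sizes, for illustration only: 3x3 kernel, 16 input channels,
// cal_num = 16 output points per tile, thread_count_ = 2.
int unit_size = 3 * 3 * 16 * 16 * 2;           // 4608 elements
size_t bytes = unit_size * sizeof(float16_t);  // 9216 bytes of scratch
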
@@ -205,19 +202,13 @@ kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &i
 void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
                     const std::vector<lite::Tensor *> &new_outputs) {
   for (auto sub_conv : group_convs) {
-    if (sub_conv != nullptr) {
-      delete sub_conv;
-    }
+    delete sub_conv;
   }
   for (auto in_tensor : new_inputs) {
-    if (in_tensor != nullptr) {
-      delete in_tensor;
-    }
+    delete in_tensor;
   }
   for (auto out_tensor : new_outputs) {
-    if (out_tensor != nullptr) {
-      delete out_tensor;
-    }
+    delete out_tensor;
   }
 }

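Dropping the guards is safe because `delete` on a null pointer is a no-op by definition in C++; the checks only added nesting:

kernel::LiteKernel *sub_conv = nullptr;
delete sub_conv;  // well-defined: deleting a null pointer does nothing
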
@@ -332,8 +323,10 @@ kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor

   std::vector<int> in_shape;
   std::vector<int> out_shape;
+  int batch = inputs.front()->Batch();
+  conv_param->input_batch_ = batch;
+  conv_param->output_batch_ = batch;
   if (infered_flag) {
-    int batch = inputs.front()->Batch();
     conv_param->input_channel_ = new_in_channel;
     conv_param->output_channel_ = new_out_channel;
     CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);

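This hunk carries the creator-side half of the multi-batch fix: `input_batch_` and `output_batch_` are now written unconditionally, before the `infered_flag` branch, so every sub-kernel's ConvParameter sees the real batch even when shape inference has not yet run. Previously `batch` was only read inside the inferred branch, leaving the parameter at its single-batch default otherwise.
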
@@ -77,11 +77,6 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
       return ret;
     }
     (const_cast<mindspore::lite::PrimitiveC *>(primitive_))->set_infer_flag(true);
-    ret = ReSize();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "ReSize fail!ret: " << ret;
-      return ret;
-    }

     // if infershape func is called in runtime stage, we should malloc memory and set shape info for outputs of sub
     // kernels here.

@@ -119,6 +114,11 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
         }
       }
     }
+    ret = ReSize();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "ReSize fail!ret: " << ret;
+      return ret;
+    }
   }

   auto outputs = this->out_tensors();

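Together with the hunk above, this moves `ReSize()` from right after `set_infer_flag(true)` to after the runtime-stage block that mallocs and sets shape info for the sub-kernels' outputs; resizing the group kernel only makes sense once those output shapes exist.
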
@@ -136,9 +136,7 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {

 int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {
   // input may either be float32 or float16
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_plane = in_h * in_w;
+  int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
   int sub_in_channel = conv_param_->input_channel_;
   int ori_in_channel = sub_in_channel * group_num_;
   auto sub_in_data = group_convs_.at(group_id)->in_tensors().front()->data_c();

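This is the core of the multi-batch support. In NHWC layout the channel dimension is innermost, so a tensor is simply `batch * height * width` consecutive pixels of `ori_in_channel` values each; folding `input_batch_` into `in_plane` lets the existing per-pixel copy loop slice out one group's channels across all batches unchanged. A minimal standalone sketch of the pattern (names and the surrounding copy loop are illustrative, not the kernel's exact code):

#include <cstring>

// Copy group `group_id`'s channel slice out of an NHWC tensor, all batches at once.
void SeparateInputSketch(const float *ori_in, float *sub_in, int batch, int h, int w,
                         int sub_in_channel, int group_num, int group_id) {
  int in_plane = h * w * batch;                           // every pixel in every batch
  int ori_in_channel = sub_in_channel * group_num;        // full channel count
  const float *src = ori_in + group_id * sub_in_channel;  // this group's channel offset
  for (int i = 0; i < in_plane; ++i) {
    std::memcpy(sub_in, src, sub_in_channel * sizeof(float));
    src += ori_in_channel;     // advance one pixel in the full tensor
    sub_in += sub_in_channel;  // advance one pixel in the sub tensor
  }
}

Because only the trip count changes, the same body serves the fp16 and int8 kernels with the element type swapped.
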
@@ -178,9 +176,7 @@ int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {

 void GroupConvolutionFP16CPUKernel::PostConcat(int group_id) {
   // output must be float16 data type
-  int out_h = conv_param_->output_h_;
-  int out_w = conv_param_->output_w_;
-  int out_plane = out_h * out_w;
+  int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   int sub_out_channel = conv_param_->output_channel_;
   int ori_out_channel = sub_out_channel * group_num_;
   auto sub_out_data = reinterpret_cast<float16_t *>(group_convs_.at(group_id)->out_tensors().front()->data_c());

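`PostConcat` is the mirror of `SeparateInput`: it gathers each sub-kernel's `sub_out_channel`-wide pixels back into this group's slice of the `ori_out_channel`-wide output, and folding `output_batch_` into `out_plane` extends the gather across all batches in the same way.
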
@@ -37,18 +37,15 @@ using mindspore::schema::Format::Format_NHWC;
 namespace mindspore::kernel {
 int ConvolutionCPUKernel::InitWeightBias() {
   auto filter_tensor = in_tensors_.at(kWeightIndex);
-  int kernel_h = filter_tensor->Height();
-  int kernel_w = filter_tensor->Width();
   int in_channel = filter_tensor->Channel();
   int out_channel = filter_tensor->Batch();
   conv_param_->input_channel_ = in_channel;
   conv_param_->output_channel_ = out_channel;
-  int kernel_plane = kernel_h * kernel_w;
-  const int oc_block = C8NUM;
-  int oc_block_num = UP_DIV(out_channel, C8NUM);
-  int pack_weight_size = oc_block_num * oc_block * in_channel * kernel_plane;
+  int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
+  int oc_block_num = UP_ROUND(out_channel, C8NUM);
+  int pack_weight_size = oc_block_num * in_channel * kernel_plane;

-  auto origin_weight = reinterpret_cast<float *>(filter_tensor->MutableData());
+  auto origin_weight = reinterpret_cast<float *>(filter_tensor->data_c());
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed weight failed.";

@@ -57,15 +54,15 @@ int ConvolutionCPUKernel::InitWeightBias() {
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
   RowMajor2Col8Major(origin_weight, packed_weight_, out_channel, in_channel * kernel_plane);

-  bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));
+  bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias failed.";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float));
+  memset(bias_data_, 0, oc_block_num * sizeof(float));

   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
+    auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
     memcpy(bias_data_, ori_bias, out_channel * sizeof(float));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);

@@ -74,13 +71,12 @@ int ConvolutionCPUKernel::InitWeightBias() {
 }

 int ConvolutionCPUKernel::InitTmpBuffer() {
-  int in_channel = conv_param_->input_channel_;
   MS_ASSERT(ctx_->allocator != nullptr);

 #ifdef ENABLE_ARM32
-  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * in_channel * C4NUM * thread_count_;
+  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * C4NUM * thread_count_;
 #else
-  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * in_channel * C12NUM * thread_count_;
+  int unit_size =
+    conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * C12NUM * thread_count_;
 #endif
   packed_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (packed_input_ == nullptr) {

@@ -124,9 +120,8 @@ int ConvolutionCPUKernel::ReSize() {
 }

 int ConvolutionCPUKernel::RunImpl(int task_id) {
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
-  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
+  auto ori_input_data = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->data_c());
+  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->data_c());
   ConvFp32(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), col_major_input_,
            output_addr, task_id, conv_param_);
   return RET_OK;

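The recurring `MutableData()` to `data_c()` swap in this commit matches the two accessors' roles as I read the lite::Tensor API (an assumption worth verifying): `MutableData()` may allocate the backing buffer on first use, while `data_c()` simply returns the current pointer. By the time `RunImpl` executes, the buffers are already allocated, so the cheaper accessor suffices:

// Assumed semantics, illustrative only:
//   void *MutableData();  // allocates backing memory if absent, then returns it
//   void *data_c();       // returns the existing data pointer as-is
auto ori_input_data = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->data_c());
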
@@ -171,19 +166,13 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
 void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
                 const std::vector<lite::Tensor *> &new_outputs) {
   for (auto sub_conv : group_convs) {
-    if (sub_conv != nullptr) {
-      delete sub_conv;
-    }
+    delete sub_conv;
   }
   for (auto in_tensor : new_inputs) {
-    if (in_tensor != nullptr) {
-      delete in_tensor;
-    }
+    delete in_tensor;
   }
   for (auto out_tensor : new_outputs) {
-    if (out_tensor != nullptr) {
-      delete out_tensor;
-    }
+    delete out_tensor;
   }
 }

@@ -304,8 +293,10 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
   } else {
     new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
   }
+  int batch = inputs.front()->Batch();
+  conv_param->input_batch_ = batch;
+  conv_param->output_batch_ = batch;
   if (infered_flag) {
-    int batch = inputs.front()->Batch();
     int in_h = inputs.front()->Height();
     int in_w = inputs.front()->Width();
     conv_param->input_channel_ = new_in_channel;

@@ -82,11 +82,6 @@ int GroupConvolutionCPUKernel::PreProcess() {
       return ret;
     }
     (const_cast<mindspore::lite::PrimitiveC *>(primitive_))->set_infer_flag(true);
-    ret = ReSize();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "ReSize fail!ret: " << ret;
-      return ret;
-    }

     // if infershape func is called in runtime stage, we should malloc memory and set shape info for outputs of sub
     // kernels here.

@@ -124,6 +119,11 @@ int GroupConvolutionCPUKernel::PreProcess() {
         }
       }
     }
+    ret = ReSize();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "ReSize fail!ret: " << ret;
+      return ret;
+    }
   }

   auto outputs = this->out_tensors();

@@ -140,9 +140,7 @@ int GroupConvolutionCPUKernel::PreProcess() {
 }

 void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
-  int in_h = conv_param_->input_h_;
-  int in_w = conv_param_->input_w_;
-  int in_plane = in_h * in_w;
+  int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
   int sub_in_channel = conv_param_->input_channel_;
   int ori_in_channel = sub_in_channel * group_num_;
   auto sub_in_data = reinterpret_cast<float *>(group_convs_.at(group_id)->in_tensors().front()->data_c());

@@ -156,9 +154,7 @@ void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
 }

 void GroupConvolutionCPUKernel::PostConcat(int group_id) {
-  int out_h = conv_param_->output_h_;
-  int out_w = conv_param_->output_w_;
-  int out_plane = out_h * out_w;
+  int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   int sub_out_channel = conv_param_->output_channel_;
   int ori_out_channel = sub_out_channel * group_num_;
   auto sub_out_data = reinterpret_cast<float *>(group_convs_.at(group_id)->out_tensors().front()->data_c());

@@ -60,9 +60,7 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
   auto filter_tensor = in_tensors_.at(kWeightIndex);
   auto input_channel = filter_tensor->Channel();
   auto output_channel = filter_tensor->Batch();
-  int kernel_h = filter_tensor->Height();
-  int kernel_w = filter_tensor->Width();
-  int kernel_plane = kernel_h * kernel_w;
+  int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
   conv_param_->input_channel_ = input_channel;
   conv_param_->output_channel_ = output_channel;
   int up_round_deep;

@@ -84,7 +82,7 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
   int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;

   // init weight
-  auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->MutableData());
+  auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->data_c());
   packed_weight_ = reinterpret_cast<int8_t *>(malloc(pack_weight_size));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_weight_ failed.";

@@ -109,7 +107,7 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, bias_size);
   if (in_tensors_.size() == kInputSize2) {
-    auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData());
+    auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->data_c());
     memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t));
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);

@@ -210,9 +208,8 @@ int ConvolutionInt8CPUKernel::ReSize() {
 }

 int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = reinterpret_cast<int8_t *>(input_tensor->MutableData());
-  auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->MutableData());
+  auto ori_input_data = reinterpret_cast<int8_t *>(in_tensors_.at(kInputIndex)->data_c());
+  auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->data_c());
   ConvInt8(ori_input_data, packed_input_, matmul_packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_),
            output_addr, filter_zp_ptr_, input_sum_, task_id, conv_param_, matmul_func_, support_optimize_);
   return RET_OK;

@@ -325,9 +322,11 @@ kernel::LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor
   } else {
     new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
   }
+  int batch = inputs.front()->Batch();
+  conv_param->input_batch_ = batch;
+  conv_param->output_batch_ = batch;
   bool infered_flag = primitive != nullptr && primitive->infer_flag();
   if (infered_flag) {
-    int batch = inputs.front()->Batch();
     int in_h = inputs.front()->Height();
     int in_w = inputs.front()->Width();
     conv_param->input_channel_ = new_in_channel;

@@ -27,7 +27,7 @@ using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
 void GroupConvolutionInt8CPUKernel::SeparateInput(int group_id) {
-  int in_plane = conv_param_->input_h_ * conv_param_->input_w_;
+  int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
   int sub_in_channel = conv_param_->input_channel_;
   int ori_in_channel = sub_in_channel * group_num_;
   auto sub_in_data = reinterpret_cast<int8_t *>(group_convs_.at(group_id)->in_tensors().front()->data_c());

@@ -41,7 +41,7 @@ void GroupConvolutionInt8CPUKernel::SeparateInput(int group_id) {
 }

 void GroupConvolutionInt8CPUKernel::PostConcat(int group_id) {
-  int out_plane = conv_param_->output_h_ * conv_param_->output_w_;
+  int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
   int sub_out_channel = conv_param_->output_channel_;
   int ori_out_channel = sub_out_channel * group_num_;
   auto sub_out_data = reinterpret_cast<int8_t *>(group_convs_.at(group_id)->out_tensors().front()->data_c());