forked from mindspore-Ecosystem/mindspore
optimize convolution and deconvolution init performance
This commit is contained in:
parent ad37b6845f
commit 18a972399d
|
@ -92,6 +92,7 @@ void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const floa
|
||||||
}
|
}
|
||||||
|
|
||||||
void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
|
void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
|
||||||
|
size_t row_up_16 = UP_ROUND(row, C16NUM);
|
||||||
size_t row16 = row / C16NUM * C16NUM;
|
size_t row16 = row / C16NUM * C16NUM;
|
||||||
size_t col8 = col / C8NUM * C8NUM;
|
size_t col8 = col / C8NUM * C8NUM;
|
||||||
float16_t *src_r = src_ptr;
|
float16_t *src_r = src_ptr;
|
||||||
|
@ -236,6 +237,12 @@ void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row,
|
||||||
src_r += col;
|
src_r += col;
|
||||||
dst_r += 1;
|
dst_r += 1;
|
||||||
}
|
}
|
||||||
|
for (; ri < row_up_16; ri++) {
|
||||||
|
for (size_t i = 0; i < col; i++) {
|
||||||
|
dst_r[i * C16NUM] = 0;
|
||||||
|
}
|
||||||
|
dst_r += 1;
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -62,14 +62,6 @@ int Convolution1x1FP16CPUKernel::InitConv1x1Param() {
|
||||||
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
|
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
|
||||||
thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM;
|
thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM;
|
||||||
|
|
||||||
pack_input_ =
|
|
||||||
reinterpret_cast<float16_t *>(malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t)));
|
|
||||||
if (pack_input_ == nullptr) {
|
|
||||||
MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
|
|
||||||
return RET_MEMORY_FAILED;
|
|
||||||
}
|
|
||||||
memset(pack_input_, 0, matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t));
|
|
||||||
|
|
||||||
if (pre_trans_input_) {
|
if (pre_trans_input_) {
|
||||||
input_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t)));
|
input_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t)));
|
||||||
if (input_ptr_ == nullptr) {
|
if (input_ptr_ == nullptr) {
|
||||||
|
@ -133,10 +125,6 @@ int Convolution1x1FP16CPUKernel::Init() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {
|
void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {
|
||||||
if (pack_input_ != nullptr) {
|
|
||||||
free(pack_input_);
|
|
||||||
pack_input_ = nullptr;
|
|
||||||
}
|
|
||||||
if (pre_trans_input_ && input_ptr_ != nullptr) {
|
if (pre_trans_input_ && input_ptr_ != nullptr) {
|
||||||
free(input_ptr_);
|
free(input_ptr_);
|
||||||
input_ptr_ = nullptr;
|
input_ptr_ = nullptr;
|
||||||
|
@ -216,6 +204,13 @@ int Convolution1x1FP16CPUKernel::Run() {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pack_input_ = reinterpret_cast<float16_t *>(
|
||||||
|
ctx_->allocator->Malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t)));
|
||||||
|
if (pack_input_ == nullptr) {
|
||||||
|
MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
|
||||||
|
return RET_MEMORY_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
|
for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
|
||||||
Pre1x1Trans(
|
Pre1x1Trans(
|
||||||
execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_,
|
execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_,
|
||||||
|
@ -231,6 +226,10 @@ int Convolution1x1FP16CPUKernel::Run() {
|
||||||
ConvolutionBaseFP16CPUKernel::IfCastOutput();
|
ConvolutionBaseFP16CPUKernel::IfCastOutput();
|
||||||
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
|
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
|
||||||
|
|
||||||
|
if (pack_input_ != nullptr) {
|
||||||
|
ctx_->allocator->Free(pack_input_);
|
||||||
|
pack_input_ = nullptr;
|
||||||
|
}
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
} // namespace mindspore::kernel
|
} // namespace mindspore::kernel
|
||||||
|
|
|
@ -26,7 +26,6 @@ using mindspore::schema::PrimitiveType_DeConv2D;
|
||||||
|
|
||||||
namespace mindspore::kernel {
|
namespace mindspore::kernel {
|
||||||
DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() {
|
DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() {
|
||||||
FreeParam();
|
|
||||||
if (matmul_param_ != nullptr) {
|
if (matmul_param_ != nullptr) {
|
||||||
delete matmul_param_;
|
delete matmul_param_;
|
||||||
matmul_param_ = nullptr;
|
matmul_param_ = nullptr;
|
||||||
|
@ -35,7 +34,6 @@ DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() {
|
||||||
}
|
}
|
||||||
|
|
||||||
int DeConvolutionFp16CPUKernel::ReSize() {
|
int DeConvolutionFp16CPUKernel::ReSize() {
|
||||||
FreeParam();
|
|
||||||
ConvolutionBaseCPUKernel::Init();
|
ConvolutionBaseCPUKernel::Init();
|
||||||
|
|
||||||
int error_code = InitParam();
|
int error_code = InitParam();
|
||||||
|
@ -43,45 +41,36 @@ int DeConvolutionFp16CPUKernel::ReSize() {
|
||||||
MS_LOG(ERROR) << "deconv InitParam error!";
|
MS_LOG(ERROR) << "deconv InitParam error!";
|
||||||
return error_code;
|
return error_code;
|
||||||
}
|
}
|
||||||
|
|
||||||
error_code = InitWeightBias();
|
|
||||||
if (error_code != RET_OK) {
|
|
||||||
MS_LOG(ERROR) << "deconv InitWeightBias error!";
|
|
||||||
return error_code;
|
|
||||||
}
|
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
void DeConvolutionFp16CPUKernel::FreeParam() {
|
|
||||||
if (pack_input_ != nullptr) {
|
|
||||||
free(pack_input_);
|
|
||||||
pack_input_ = nullptr;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int DeConvolutionFp16CPUKernel::InitWeightBias() {
|
int DeConvolutionFp16CPUKernel::InitWeightBias() {
|
||||||
bias_data_ = malloc(UP_ROUND(conv_param_->output_channel_, C4NUM) * sizeof(float16_t));
|
auto weight_tensor = in_tensors_.at(kWeightIndex);
|
||||||
|
auto input_channel = weight_tensor->Batch();
|
||||||
|
auto output_channel = weight_tensor->Channel();
|
||||||
|
auto kernel_h = weight_tensor->Height();
|
||||||
|
auto kernel_w = weight_tensor->Width();
|
||||||
|
|
||||||
|
bias_data_ = malloc(UP_ROUND(output_channel, C4NUM) * sizeof(float16_t));
|
||||||
if (bias_data_ == nullptr) {
|
if (bias_data_ == nullptr) {
|
||||||
MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
|
MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
|
||||||
return RET_ERROR;
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
memset(bias_data_, 0, UP_ROUND(conv_param_->output_channel_, C4NUM) * sizeof(float16_t));
|
memset(bias_data_, 0, UP_ROUND(output_channel, C4NUM) * sizeof(float16_t));
|
||||||
if (in_tensors_.size() == 3) {
|
if (in_tensors_.size() == 3) {
|
||||||
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->MutableData()),
|
Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->MutableData()),
|
||||||
reinterpret_cast<float16_t *>(bias_data_), conv_param_->output_channel_);
|
reinterpret_cast<float16_t *>(bias_data_), conv_param_->output_channel_);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t weight_pack_size = conv_param_->input_channel_ * conv_param_->kernel_w_ * conv_param_->kernel_h_ *
|
size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
|
||||||
UP_ROUND(conv_param_->output_channel_, C8NUM) * sizeof(float16_t);
|
|
||||||
execute_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size));
|
execute_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size));
|
||||||
if (execute_weight_ == nullptr) {
|
if (execute_weight_ == nullptr) {
|
||||||
MS_LOG(ERROR) << "deconv malloc execute_weight_ error!";
|
MS_LOG(ERROR) << "deconv malloc execute_weight_ error!";
|
||||||
return RET_ERROR;
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
memset(execute_weight_, 0, weight_pack_size);
|
memset(execute_weight_, 0, weight_pack_size);
|
||||||
PackNHWCFp32ToC8HWN8Fp16(reinterpret_cast<float *>(in_tensors_[1]->MutableData()), execute_weight_,
|
PackNHWCFp32ToC8HWN8Fp16(reinterpret_cast<float *>(in_tensors_[1]->MutableData()), execute_weight_, input_channel,
|
||||||
conv_param_->input_channel_, kernel_plane_, conv_param_->output_channel_);
|
kernel_w * kernel_h, output_channel);
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,14 +87,6 @@ int DeConvolutionFp16CPUKernel::InitParam() {
|
||||||
|
|
||||||
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM));
|
thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM));
|
||||||
thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_);
|
thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_);
|
||||||
|
|
||||||
size_t size = row16_ * matmul_param_->deep_ * sizeof(float16_t);
|
|
||||||
pack_input_ = reinterpret_cast<float16_t *>(malloc(size));
|
|
||||||
if (pack_input_ == nullptr) {
|
|
||||||
MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
|
|
||||||
return RET_ERROR;
|
|
||||||
}
|
|
||||||
memset(pack_input_, 0, size);
|
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,6 +103,12 @@ int DeConvolutionFp16CPUKernel::InitRunBuf() {
|
||||||
MS_LOG(ERROR) << "deconv Malloc tmp_buffer_ error!";
|
MS_LOG(ERROR) << "deconv Malloc tmp_buffer_ error!";
|
||||||
return RET_ERROR;
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pack_input_ = reinterpret_cast<float16_t *>(malloc(row16_ * matmul_param_->deep_ * sizeof(float16_t)));
|
||||||
|
if (pack_input_ == nullptr) {
|
||||||
|
MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
|
||||||
|
return RET_ERROR;
|
||||||
|
}
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -134,6 +121,10 @@ void DeConvolutionFp16CPUKernel::FreeRunBuf() {
|
||||||
ctx_->allocator->Free(pack_output_);
|
ctx_->allocator->Free(pack_output_);
|
||||||
pack_output_ = nullptr;
|
pack_output_ = nullptr;
|
||||||
}
|
}
|
||||||
|
if (pack_input_ != nullptr) {
|
||||||
|
ctx_->allocator->Free(pack_input_);
|
||||||
|
pack_input_ = nullptr;
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -167,6 +158,11 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int DeConvolutionFp16CPUKernel::Init() {
|
int DeConvolutionFp16CPUKernel::Init() {
|
||||||
|
int ret = InitWeightBias();
|
||||||
|
if (ret != RET_OK) {
|
||||||
|
MS_LOG(ERROR) << "deconv InitWeightBias error!";
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
if (!InferShapeDone()) {
|
if (!InferShapeDone()) {
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,7 +52,6 @@ class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
|
||||||
private:
|
private:
|
||||||
int InitRunBuf();
|
int InitRunBuf();
|
||||||
void FreeRunBuf();
|
void FreeRunBuf();
|
||||||
void FreeParam();
|
|
||||||
int InitParam();
|
int InitParam();
|
||||||
int InitWeightBias();
|
int InitWeightBias();
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue