!4804 [MS][LITE][BUG]Fix int8 conv per channel bug && Remove useless comment
Merge pull request !4804 from fuzhiye/tmp
Commit e5b09c8e0f
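
Summary of the diff below: allocation results (malloc and new) in the convolution kernels are now checked for nullptr before use; WinogradFilterTransform and WinogradFilterTransformFp16 return a status code instead of void so callers can propagate failures; decorative banner comments are removed; buffers allocated with malloc are released with free instead of delete; the int8 depthwise and deconvolution destructors additionally free their quant parameters; and the int8 per-channel input_sum_ buffer is allocated with plain malloc and zeroed over its full size.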
@@ -20,8 +20,12 @@
 namespace mindspore::kernel {
 Matrix *TransformMatrixGenerator(int m, int k) {
   auto matrix = new Matrix;
-  auto aa = malloc(m * k * sizeof(float));
-  matrix->SetData(aa);
+  auto data = malloc(m * k * sizeof(float));
+  if (data == nullptr) {
+    MS_LOG(ERROR) << "Malloc matrix data failed.";
+    return nullptr;
+  }
+  matrix->SetData(data);
   matrix->SetNum(m, k);
   return matrix;
 }
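
Note: TransformMatrixGenerator can now return nullptr, so every caller must check the result before dereferencing it. A minimal caller-side sketch of the new contract (condensed from the Winograd hunks further down):

  auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit);
  if (matrix_g == nullptr) {
    MS_LOG(ERROR) << "matrix_g is null.";
    return RET_ERROR;
  }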
@@ -57,7 +57,7 @@ class Matrix {
   int GetK() { return this->k_; }

  protected:
-  void *data_;
+  void *data_ = nullptr;
   std::vector<int> shape_;
   std::vector<int> stride_;
   int m_;
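
Note: defaulting data_ to nullptr keeps the pointer in a defined state even when SetData is never called. A simplified sketch of the failure mode this prevents (stand-in type, not the actual Matrix class):

  #include <cstdlib>

  struct Holder {
    void *data_ = nullptr;           // without the initializer, data_ is indeterminate
    ~Holder() { std::free(data_); }  // free(nullptr) is a well-defined no-op
  };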
@@ -57,7 +57,7 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() {
   conv_param_->output_channel_ = output_channel;
   int iC8 = UP_DIV(input_channel, C8NUM);
   int oC8 = UP_DIV(output_channel, C8NUM);
-  // ===========================init weight========================== //
   size_t transformed_size = iC8 * C8NUM * oC8 * C8NUM * 36 * sizeof(float16_t);
   transformed_filter_addr_ = reinterpret_cast<float16_t *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {
@@ -72,7 +72,6 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() {
   }
   ProcessFilterFp16(execute_weight_, transformed_filter_addr_, conv_param_);

-  // =============================init bias========================= //
   size_t new_bias_size = oC8 * C8NUM * sizeof(float16_t);
   bias_data_ = malloc(new_bias_size);
   if (bias_data_ == nullptr) {
@@ -97,7 +96,7 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   const int k_plane = 36;
   int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
   MS_ASSERT(ctx_->allocator != nullptr);
-  /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * k_plane * C8NUM * sizeof(float16_t);
   block_unit_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
@@ -105,7 +104,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * tile_num * k_plane * oC8 * C8NUM * sizeof(float16_t);
   tmp_dst_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
@@ -113,7 +111,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_out_============================*/
   int new_out_plane = UP_DIV(conv_param_->output_h_, C4NUM) * UP_DIV(conv_param_->output_w_, C4NUM) * C4NUM * C4NUM;
   size_t tmp_out_size = oC8 * C8NUM * conv_param_->output_batch_ * new_out_plane * sizeof(float16_t);
   tmp_out_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tmp_out_size));
@@ -155,7 +152,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (tile_buffer_ != nullptr) {
     free(tile_buffer_);
     tile_buffer_ = nullptr;
@@ -174,7 +170,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
   const int k_plane = 36;
   int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM);

-  /*=============================nhwc4_input_============================*/
   size_t nhwc8_input_size =
     iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
   nhwc4_input_ = malloc(nhwc8_input_size);
@@ -184,7 +179,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
   }
   memset(nhwc4_input_, 0, nhwc8_input_size);

-  /*=============================tile_buffer_============================*/
   size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t);
   tile_buffer_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {
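
Note: the buffer-size arithmetic in these hunks pads channel counts to whole 4- or 8-lane blocks via round-up division. A tiny standalone illustration (UP_DIV shown with its usual definition):

  #include <cstdio>

  #define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // round-up integer division
  #define C8NUM 8

  int main() {
    int input_channel = 21;
    int iC8 = UP_DIV(input_channel, C8NUM);  // 3 blocks of 8 lanes
    std::printf("%d -> %d blocks -> %d padded channels\n", input_channel, iC8, iC8 * C8NUM);  // 21 -> 3 -> 24
    return 0;
  }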
@@ -96,7 +96,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
   int unit_size = kernel_plane * channel_block * C4NUM;
   int packed_input_size = output_tile_count * cal_num * unit_size;

-  /*=============================packed_input_============================*/
   packed_input_ = reinterpret_cast<float16_t *>(malloc(in_batch * packed_input_size * sizeof(float16_t)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_input_ failed.";
@@ -104,7 +103,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
   }
   memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t));

-  /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
   nhwc4_input_ = malloc(nhwc4_input_size);
@@ -114,7 +112,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);

-  /*=============================tmp_output_block_============================*/
   tmp_output_block_ = reinterpret_cast<float16_t *>(malloc(cal_num * out_channel * sizeof(float16_t)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_output_block_ failed.";
@@ -71,7 +71,6 @@ int ConvolutionSWFP16CPUKernel::InitWeightBias() {
   int kernel_plane = kernel_h * kernel_w;
   int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;

-  // ========================init weight==================== //
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_weight_ failed.";
@@ -84,7 +83,6 @@ int ConvolutionSWFP16CPUKernel::InitWeightBias() {
     return ret;
   }

-  // =======================init bias====================== //
   bias_data_ = malloc(oc4 * C4NUM * sizeof(float16_t));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias_data_ failed.";
@@ -107,7 +105,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
   int out_channel = conv_param_->output_channel_;
   int oc4 = UP_DIV(out_channel, C4NUM);

-  /*=============================tmp_output_block_============================*/
   tmp_output_block_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(
     conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t)));
   if (tmp_output_block_ == nullptr) {
@@ -148,11 +145,14 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
+  if (slidingWindow_param_ != nullptr) {
+    delete slidingWindow_param_;
+    slidingWindow_param_ = nullptr;
+  }

   ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -160,10 +160,9 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
     return ret;
   }

-  /*=============================nhwc4_input_============================*/
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
-                            conv_param_->input_w_ * sizeof(float16_t);
+  size_t nhwc4_input_size =
+    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
   nhwc4_input_ = malloc(nhwc4_input_size);
   if (nhwc4_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";

@@ -37,6 +37,10 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
       free(packed_weight_);
       packed_weight_ = nullptr;
     }
+    if (slidingWindow_param_ != nullptr) {
+      delete slidingWindow_param_;
+      slidingWindow_param_ = nullptr;
+    }
   }

   int Init() override;
@@ -54,10 +58,6 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
       ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
-    if (slidingWindow_param_ != nullptr) {
-      delete slidingWindow_param_;
-      slidingWindow_param_ = nullptr;
-    }
   }
   float16_t *packed_weight_ = nullptr;
   float16_t *tmp_output_block_ = nullptr;
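
Note on the slidingWindow_param_ moves above: FreeTmpBuffer releases per-run scratch, while the sliding-window parameter lives until the next ReSize or until destruction, so its delete migrates out of FreeTmpBuffer and into ReSize and the destructor. A condensed sketch of the resulting ownership split (class body abbreviated, assuming this reading of the lifetimes):

  struct SlidingWindowParam {};  // stand-in for the real struct

  class KernelSketch {
   public:
    ~KernelSketch() {
      if (slidingWindow_param_ != nullptr) {  // long-lived: freed on destruction (and on ReSize)
        delete slidingWindow_param_;
        slidingWindow_param_ = nullptr;
      }
    }
    void FreeTmpBuffer() {
      // per-run allocator scratch only; slidingWindow_param_ is deliberately no longer freed here
    }

   private:
    SlidingWindowParam *slidingWindow_param_ = nullptr;
  };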
@@ -35,8 +35,8 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
-void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
+int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
                                  ConvParameter *conv_param, int oc_block) {
   // original weight format : ohwi
   auto channel_in = conv_param->input_channel_;
   auto channel_out = conv_param->output_channel_;
@@ -44,7 +44,18 @@ void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_wei

   // generate matrix_G && matrix_GT
   auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit);
+  if (matrix_g == nullptr) {
+    MS_LOG(ERROR) << "matrix_g is null.";
+    delete matrix_g;
+    return RET_ERROR;
+  }
   auto matrix_gt = TransformMatrixGenerator(kernel_unit, input_unit);
+  if (matrix_gt == nullptr) {
+    MS_LOG(ERROR) << "matrix_gt is null.";
+    delete matrix_g;
+    delete matrix_gt;
+    return RET_ERROR;
+  }
   ChooseMatrixG(matrix_g, matrix_gt);
   auto matrix_g_data = reinterpret_cast<float *>(matrix_g->GetData());
   auto matrix_gt_data = reinterpret_cast<float *>(matrix_gt->GetData());
@@ -72,7 +83,7 @@ void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_wei
     free(matrix_gt_data_fp16);
     delete matrix_g;
     delete matrix_gt;
-    return;
+    return RET_ERROR;
   }
   for (int i = 0; i < channel_out; i++) {
     int out_c_block = i / oc_block;
@@ -107,6 +118,7 @@ void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_wei
   free(matrix_gt_data_fp16);
   delete matrix_g;
   delete matrix_gt;
+  return RET_OK;
 }

 int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
@@ -132,7 +144,12 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
     MS_LOG(ERROR) << "Get Execute filter failed.";
     return ret;
   }
-  WinogradFilterTransformFp16(execute_weight_, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
+  ret = WinogradFilterTransformFp16(execute_weight_, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "winograd filter transform failed.";
+    return ret;
+  }

   // init bias
   bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t));
@@ -203,7 +220,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   int output_w = conv_param_->output_w_;
   int oc8 = UP_DIV(channel_out, C8NUM);

-  /*=============================gemm_out_============================*/
   gemm_out_ = reinterpret_cast<float16_t *>(
     ctx_->allocator->Malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t)));
   if (gemm_out_ == nullptr) {
@@ -211,7 +227,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_out_data_============================*/
   int out_w_block = UP_DIV(output_w, output_unit_);
   int out_h_block = UP_DIV(output_h, output_unit_);
   tmp_out_data_ = reinterpret_cast<float16_t *>(
@@ -222,7 +237,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_data_============================*/
   tmp_data_ = reinterpret_cast<float16_t *>(
     ctx_->allocator->Malloc(thread_count_ * C8NUM * input_unit_ * input_unit_ * sizeof(float16_t)));
   if (tmp_data_ == nullptr) {
@@ -279,7 +293,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
@@ -302,7 +315,7 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
   int cal_num = 16;
   int channel_in = conv_param_->input_channel_;
   int ic8 = UP_DIV(channel_in, C8NUM);
-  /*=============================nhwc4_input_============================*/
   size_t nhwc8_input_size =
     ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
   nhwc4_input_ = malloc(nhwc8_input_size);
@@ -312,7 +325,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
   }
   memset(nhwc4_input_, 0, nhwc8_input_size);

-  /*=============================trans_input_============================*/
   size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t);
   trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
   if (trans_input_ == nullptr) {

@@ -84,7 +84,7 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   OutputTransformUnitFp16Func output_trans_func_;
   TmpBufferAddressFp16 tmp_buffer_address_list_[4];
 };
-void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
+int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
                                  ConvParameter *conv_param, int oc_block);
 }  // namespace mindspore::kernel
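
Note: the void-to-int signature changes above adopt the RET_OK / RET_ERROR status-code convention used throughout these kernels. A stripped-down sketch of how failures now propagate (FallibleStep is a hypothetical placeholder for WinogradFilterTransformFp16; constant values assumed to match lite::RET_OK / lite::RET_ERROR):

  constexpr int RET_OK = 0;
  constexpr int RET_ERROR = -1;

  int FallibleStep();  // returns RET_OK or an error code

  int InitWeightBias() {
    auto ret = FallibleStep();
    if (ret != RET_OK) {
      return ret;  // hand the code upward instead of swallowing the failure
    }
    return RET_OK;
  }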
@@ -54,7 +54,6 @@ int ConvolutionCPUKernel::InitWeightBias() {
   // #endif
   int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane;

-  // =====================init weight==========================//
   auto origin_weight = reinterpret_cast<float *>(filter_tensor->Data());
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
@@ -64,7 +63,6 @@ int ConvolutionCPUKernel::InitWeightBias() {
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
   PackWeightFp32(origin_weight, conv_param_, packed_weight_, oc_block, oc_block_num);

-  // =======================init bias==========================//
   bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias failed.";
@@ -84,7 +82,6 @@ int ConvolutionCPUKernel::InitTmpBuffer() {
   int out_channel = conv_param_->output_channel_;
   MS_ASSERT(ctx_->allocator != nullptr);

-  /*=============================tmp_output_block_============================*/
   tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(TILE_NUM * out_channel * sizeof(float)));
   if (tmp_output_block_ == nullptr) {
     MS_LOG(ERROR) << "malloc tmp output block failed.";
@@ -125,7 +122,6 @@ int ConvolutionCPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
@@ -140,7 +136,6 @@ int ConvolutionCPUKernel::ReSize() {
     return RET_ERROR;
   }

-  /*=============================nhwc4_input_============================*/
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   size_t nhwc4_input_size =
     ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
@@ -151,7 +146,6 @@ int ConvolutionCPUKernel::ReSize() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);

-  /*=============================packed_input============================*/
   int output_count = conv_param_->output_h_ * conv_param_->output_w_;
   int output_tile_count = UP_DIV(output_count, TILE_NUM);
   int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM;
@@ -192,7 +186,7 @@ int ConvolutionCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
     return prepare_ret;
   }
-  // ============Init buffer using memory pool allocator=============//
   auto ret = InitTmpBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init tmp buffer failed.";
@@ -264,8 +258,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
     kernel =
       new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
   } else if (use_sw) {
-    // kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
-    kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
+    kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
   } else {
     kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
   }
@@ -98,7 +98,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   const int k_plane = 16;
   MS_ASSERT(ctx_->allocator != nullptr);

-  /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * k_plane * C4NUM * sizeof(float);
   block_unit_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
@@ -106,7 +105,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float);
   tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
@@ -114,7 +112,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================nc4hw4_out_============================*/
   size_t nc4hw4_out_size =
     oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float);
   nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size));
@@ -160,7 +157,6 @@ int Convolution3x3CPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
@@ -177,7 +173,6 @@ int Convolution3x3CPUKernel::ReSize() {
   }

   int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size =
     iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
   nhwc4_input_ = malloc(nhwc4_input_size);
@@ -187,7 +182,6 @@ int Convolution3x3CPUKernel::ReSize() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);

-  /*=============================tile_buffer_============================*/
   size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {
@@ -123,7 +123,11 @@ int ConvolutionDepthwiseCPUKernel::ReSize() {
   ConvolutionBaseCPUKernel::Init();

   // init sliding window param
-  sliding_ = new SlidingWindowParam;
+  sliding_ = new (std::nothrow) SlidingWindowParam;
+  if (sliding_ == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param failed.";
+    return RET_ERROR;
+  }
   InitSlidingParamConvDw(sliding_, conv_param_, C4NUM);

   auto ret = InitWeightBias();
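
Note: a plain new never returns nullptr on failure (it throws std::bad_alloc), so a null check after it would be dead code; new (std::nothrow) does return nullptr, which is what makes the added check meaningful. Minimal illustration:

  #include <new>

  struct SlidingWindowParam { int dummy_; };  // stand-in for the real struct

  int main() {
    auto *sliding = new (std::nothrow) SlidingWindowParam;
    if (sliding == nullptr) {  // reachable only with the nothrow form
      return -1;
    }
    delete sliding;
    return 0;
  }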
@@ -43,7 +43,6 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
   int oc_block_num = UP_DIV(output_channel, C4NUM);
   int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane;

-  // ==================================init weight======================================//
   auto origin_weight = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->Data());
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
   if (packed_weight_ == nullptr) {
@@ -61,7 +60,6 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
     }
   }

-  // ====================================init bias====================================== //
   bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "malloc bias failed.";
@@ -82,7 +80,6 @@ int ConvolutionSWCPUKernel::InitTmpBuffer() {
   int oc4 = UP_DIV(out_channel, C4NUM);
   MS_ASSERT(ctx_->allocator != nullptr);

-  /*=============================tmp_output_block_============================*/
   tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(
     conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float)));
   if (tmp_output_block_ == nullptr) {
@@ -119,18 +116,21 @@ int ConvolutionSWCPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
   }
+  if (slidingWindow_param_ != nullptr) {
+    delete slidingWindow_param_;
+    slidingWindow_param_ = nullptr;
+  }

   ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionBase init failed.";
     return RET_ERROR;
   }
-  /*=============================nhwc4_input_============================*/
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   size_t nhwc4_input_size =
     ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);

@@ -37,6 +37,10 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {
       free(packed_weight_);
       packed_weight_ = nullptr;
     }
+    if (slidingWindow_param_ != nullptr) {
+      delete slidingWindow_param_;
+      slidingWindow_param_ = nullptr;
+    }
   }

   int Init() override;
@@ -53,10 +57,6 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {
       ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
-    if (slidingWindow_param_ != nullptr) {
-      delete slidingWindow_param_;
-      slidingWindow_param_ = nullptr;
-    }
   }
   float *packed_weight_ = nullptr;
   float *tmp_output_block_ = nullptr;
@@ -28,16 +28,27 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
-void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
+int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
                              ConvParameter *conv_param, int oc_block) {
-  // =============original weight format : ohwi===============//
+  // original weight format : ohwi
   auto channel_in = conv_param->input_channel_;
   auto channel_out = conv_param->output_channel_;
   int input_unit_square = input_unit * input_unit;

-  // =============generate matrix_G && matrix_GT===============//
+  // generate matrix_G && matrix_GT
   auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit);
+  if (matrix_g == nullptr) {
+    MS_LOG(ERROR) << "matrix_g is null.";
+    delete matrix_g;
+    return RET_ERROR;
+  }
   auto matrix_gt = TransformMatrixGenerator(kernel_unit, input_unit);
+  if (matrix_gt == nullptr) {
+    MS_LOG(ERROR) << "matrix_gt is null.";
+    delete matrix_g;
+    delete matrix_gt;
+    return RET_ERROR;
+  }
   ChooseMatrixG(matrix_g, matrix_gt);
   auto matrix_g_data = reinterpret_cast<float *>(matrix_g->GetData());
   auto matrix_gt_data = reinterpret_cast<float *>(matrix_gt->GetData());
@@ -59,7 +70,7 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
     free(trans_out_data);
     delete matrix_g;
     delete matrix_gt;
-    return;
+    return RET_ERROR;
   }
   for (int i = 0; i < channel_out; i++) {
     int out_c_block = i / oc_block;
@@ -92,6 +103,7 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
   free(trans_out_data);
   delete matrix_g;
   delete matrix_gt;
+  return RET_OK;
 }

 int ConvolutionWinogradCPUKernel::InitWeightBias() {
@@ -118,7 +130,11 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   auto weight_data = reinterpret_cast<float *>(filter_tensor->Data());
-  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
+  ret = WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "winograd filter transform failed.";
+    return ret;
+  }

   // init bias
   size_t new_bias_size = oc4 * C4NUM * sizeof(float);
@@ -182,7 +198,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
   int oc4 = UP_DIV(channel_out, C4NUM);
   MS_ASSERT(ctx_->allocator != nullptr);

-  /*=============================gemm_out_============================*/
   gemm_out_ = reinterpret_cast<float *>(
     ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float)));
   if (gemm_out_ == nullptr) {
@@ -190,7 +205,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_out_data_============================*/
   int out_w_block = UP_DIV(output_w, output_unit_);
   int out_h_block = UP_DIV(output_h, output_unit_);
   tmp_out_data_ =
@@ -201,7 +215,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_data_============================*/
   tmp_data_ = reinterpret_cast<float *>(
     ctx_->allocator->Malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float)));
   if (tmp_data_ == nullptr) {
@@ -263,7 +276,6 @@ int ConvolutionWinogradCPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
@@ -284,7 +296,6 @@ int ConvolutionWinogradCPUKernel::ReSize() {
   conv_param_->input_unit_ = input_unit_;
   conv_param_->output_unit_ = output_unit_;

-  /*=============================nhwc4_input_============================*/
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   size_t nhwc4_input_size =
     ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
@@ -295,7 +306,6 @@ int ConvolutionWinogradCPUKernel::ReSize() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);

-  /*=============================trans_input_============================*/
   size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
   trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (trans_input_ == nullptr) {

@@ -80,7 +80,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   TmpBufferAddress tmp_buffer_address_list_[5];
   GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
-void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
+int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
                              ConvParameter *conv_param, int oc_block);
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_WINOGRAD_H_
@@ -61,6 +61,8 @@ int SoftmaxCPUKernel::ReSize() {
   for (int i = axis + 1; i < n_dim; i++) {
     in_plane_size *= in_shape[i];
   }
+  in_plane_size_ = in_plane_size;
+  out_plane_size_ = out_plane_size;
   if (sum_data_ != nullptr) {
     free(sum_data_);
   }
@@ -69,7 +71,6 @@ int SoftmaxCPUKernel::ReSize() {
     MS_LOG(ERROR) << "malloc data for softmax fail!";
     return RET_ERROR;
   }
-  memset(sum_data_, 0, out_plane_size * in_plane_size * sizeof(float));
   return RET_OK;
 }

@@ -79,6 +80,7 @@ int SoftmaxCPUKernel::Run() {
     MS_LOG(ERROR) << "Prepare fail!ret: " << ret;
     return RET_ERROR;
   }
+  memset(sum_data_, 0, in_plane_size_ * out_plane_size_ * sizeof(float));
   auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->Data());
   auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
   Softmax(input_ptr, output_ptr, sum_data_, softmax_param_);

@@ -40,6 +40,8 @@ class SoftmaxCPUKernel : public SoftmaxBaseCPUKernel {

  private:
   float *sum_data_ = nullptr;
+  int in_plane_size_;
+  int out_plane_size_;
 };
 }  // namespace mindspore::kernel
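
Note on the Softmax hunks: sum_data_ is an accumulator for the per-position exponential sums, so it must start from zero on every Run, not only after a ReSize; the plane sizes are therefore cached in the new members and the memset moves into Run. A scalar sketch of why a stale accumulator corrupts the output (simplified model, not the optimized kernel):

  #include <cmath>

  // Accumulates into *sum without zeroing it first -- mirroring why the kernel
  // memsets sum_data_ before each call.
  void Softmax1D(const float *in, float *out, float *sum, int n) {
    for (int i = 0; i < n; ++i) {
      out[i] = std::exp(in[i]);
      *sum += out[i];  // assumes *sum == 0.0f on entry
    }
    for (int i = 0; i < n; ++i) {
      out[i] /= *sum;  // wrong denominator if *sum carried a previous run's total
    }
  }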
@@ -117,7 +117,6 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() {
   int output_h = conv_param_->output_h_;
   MS_ASSERT(ctx_->allocator != nullptr);

-  /*=============================block_unit_buffer_============================*/
   size_t block_unit_buffer_size = thread_count_ * 4 * 4 * C8NUM * sizeof(int16_t);
   block_unit_buffer_ = reinterpret_cast<int16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size));
   if (block_unit_buffer_ == nullptr) {
@@ -125,7 +124,6 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_dst_buffer_============================*/
   size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * 16 * oc4 * C4NUM * sizeof(int32_t);
   tmp_dst_buffer_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
   if (tmp_dst_buffer_ == nullptr) {
@@ -133,7 +131,6 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }

-  /*=============================tmp_out_============================*/
   size_t tmp_out_size = oc4 * C4NUM * output_batch * output_w * output_h * sizeof(uint8_t);
   tmp_out_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_out_size));
   if (tmp_out_ == nullptr) {
@@ -174,7 +171,6 @@ int Convolution3x3Int8CPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (input_data_ != nullptr) {
     free(input_data_);
     input_data_ = nullptr;
@@ -190,7 +186,6 @@ int Convolution3x3Int8CPUKernel::ReSize() {
     return RET_ERROR;
   }

-  /*=============================input_data_============================*/
   int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM);
   size_t c8_input_size =
     conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * ic8 * C8NUM * sizeof(int16_t);
@@ -201,7 +196,6 @@ int Convolution3x3Int8CPUKernel::ReSize() {
   }
   memset(input_data_, 0, c8_input_size);

-  /*=============================tile_buffer_============================*/
   size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t);
   tile_buffer_ = reinterpret_cast<int16_t *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {
@@ -35,22 +35,25 @@ void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
   }

   if (packed_weight_ != nullptr) {
-    delete packed_weight_;
+    free(packed_weight_);
     packed_weight_ = nullptr;
   }
   if (packed_input_ != nullptr) {
-    delete packed_input_;
+    free(packed_input_);
     packed_input_ = nullptr;
   }
   if (need_align_) {
     if (packed_output_ != nullptr) {
-      delete packed_output_;
+      free(packed_output_);
       packed_output_ = nullptr;
     }
   }
 }

-ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { FreeTmpBuffer(); }
+ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
+  FreeTmpBuffer();
+  FreeQuantParam();
+}

 int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   // init weight, int8 -> int16
@@ -118,7 +121,11 @@ int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
   ConvolutionBaseCPUKernel::Init();

   // init sliding window param
-  sliding = new SlidingWindowParam;
+  sliding = new (std::nothrow) SlidingWindowParam;
+  if (sliding == nullptr) {
+    MS_LOG(ERROR) << "new sliding window param failed.";
+    return RET_ERROR;
+  }
   InitSlidingParamConvDw(sliding, conv_param_, C4NUM);

   // init quant param
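
Note on the delete-to-free changes above: these buffers are allocated with malloc, and releasing malloc'd memory with delete mixes allocators, which is undefined behavior; free is the matching deallocation. Minimal illustration:

  #include <cstdint>
  #include <cstdlib>

  int main() {
    auto *buf = static_cast<int16_t *>(std::malloc(64 * sizeof(int16_t)));
    if (buf == nullptr) {
      return -1;
    }
    // delete buf;   // wrong: buf did not come from new -> undefined behavior
    std::free(buf);  // correct: matches the malloc that produced it
    return 0;
  }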
@ -113,25 +113,24 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
|
||||||
}
|
}
|
||||||
free(weight_sum);
|
free(weight_sum);
|
||||||
|
|
||||||
/*=============================input_sum_============================*/
|
|
||||||
size_t input_sum_size;
|
size_t input_sum_size;
|
||||||
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
|
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
|
||||||
input_sum_size = conv_param_->output_channel_ * tile_num_ * thread_count_ * sizeof(int32_t);
|
input_sum_size = conv_param_->output_channel_ * tile_num_ * thread_count_ * sizeof(int32_t);
|
||||||
} else {
|
} else {
|
||||||
input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t);
|
input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t);
|
||||||
}
|
}
|
||||||
input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size));
|
input_sum_ = reinterpret_cast<int32_t *>(malloc(input_sum_size));
|
||||||
if (input_sum_ == nullptr) {
|
if (input_sum_ == nullptr) {
|
||||||
MS_LOG(ERROR) << "malloc input_sum_ failed.";
|
MS_LOG(ERROR) << "malloc input_sum_ failed.";
|
||||||
return RET_ERROR;
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
memset(input_sum_, 0, tile_num_ * thread_count_ * sizeof(int32_t));
|
memset(input_sum_, 0, input_sum_size);
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ConvolutionInt8CPUKernel::InitTmpBuffer() {
|
int ConvolutionInt8CPUKernel::InitTmpBuffer() {
|
||||||
MS_ASSERT(ctx_->allocator != nullptr);
|
MS_ASSERT(ctx_->allocator != nullptr);
|
||||||
/*=============================tmp_dst_============================*/
|
|
||||||
size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
|
size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
|
||||||
tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
|
tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
|
||||||
if (tmp_dst_ == nullptr) {
|
if (tmp_dst_ == nullptr) {
|
||||||
|
@ -139,7 +138,6 @@ int ConvolutionInt8CPUKernel::InitTmpBuffer() {
|
||||||
return RET_ERROR;
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*=============================tmp_out_============================*/
|
|
||||||
tmp_out_ =
|
tmp_out_ =
|
||||||
reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
|
reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
|
||||||
if (tmp_out_ == nullptr) {
|
if (tmp_out_ == nullptr) {
|
||||||
|
@@ -202,7 +200,6 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
   }
   free(weight_sum);

-  /*=============================input_sum_============================*/
   size_t input_sum_size;
   if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
     input_sum_size = conv_param_->output_channel_ * tile_num_ * thread_count_ * sizeof(int32_t);
@@ -214,13 +211,13 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
     MS_LOG(ERROR) << "malloc input_sum_ failed.";
     return RET_ERROR;
   }
-  memset(input_sum_, 0, tile_num_ * thread_count_ * sizeof(int32_t));
+  memset(input_sum_, 0, input_sum_size);
   return RET_OK;
 }

 int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
   MS_ASSERT(ctx_->allocator != nullptr);
-  /*=============================tmp_dst_============================*/
   size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
   tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
   if (tmp_dst_ == nullptr) {
@@ -228,7 +225,6 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
     return RET_ERROR;
   }

-  /*=============================tmp_out_============================*/
   tmp_out_ =
     reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_));
   if (tmp_out_ == nullptr) {
@@ -287,7 +283,6 @@ int ConvolutionInt8CPUKernel::ReSize() {
     return ret;
   }

-  FreeTmpBuffer();
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
     nhwc4_input_ = nullptr;
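The surrounding context shows the guarded-free idiom the kernel relies on: free only when the pointer is set, then null it so a repeated ReSize (or the destructor) cannot double-free. A self-contained sketch of that idiom:

    #include <cstdlib>

    // Free a heap buffer at most once: skip when unset, null after freeing.
    static void ReleaseBuffer(void **buf) {
      if (*buf != nullptr) {
        free(*buf);
        *buf = nullptr;
      }
    }

    int main() {
      void *nhwc4_input = malloc(64);
      ReleaseBuffer(&nhwc4_input);
      ReleaseBuffer(&nhwc4_input);  // safe no-op: pointer already null
      return 0;
    }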
@@ -312,7 +307,6 @@ int ConvolutionInt8CPUKernel::ReSize() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);

-  /*=============================packed_input_============================*/
   int output_count = conv_param_->output_h_ * conv_param_->output_w_;
   int output_tile_count = UP_DIV(output_count, tile_num_);
   int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
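The sizing above rests on ceiling division: output pixels are processed in tiles of tile_num_, so the packed-input scratch must cover UP_DIV(output_count, tile_num_) full tiles even when the last tile is partial. A standalone rerun of that arithmetic with assumed dimensions (UP_DIV defined as the usual ceiling division):

    #include <cstdio>

    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // assumed to match the kernel's macro

    int main() {
      // Assumed dimensions, for illustration.
      const int output_h = 14, output_w = 14, tile_num = 8;
      const int kernel_plane = 3 * 3, ic4 = 2, c4num = 4;
      int output_count = output_h * output_w;                  // 196 output pixels
      int output_tile_count = UP_DIV(output_count, tile_num);  // 25 tiles, last one partial
      int unit_size = kernel_plane * ic4 * c4num;              // int8 bytes per output pixel
      int packed_input_size = output_tile_count * tile_num * unit_size;
      printf("tiles=%d, packed_input=%d bytes\n", output_tile_count, packed_input_size);
      return 0;
    }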
@@ -28,7 +28,10 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DeDepthwiseConv2D;

 namespace mindspore::kernel {
-DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() { FreeTmpBuffer(); }
+DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() {
+  FreeTmpBuffer();
+  FreeQuantParam();
+}

 void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() {
   if (sliding != nullptr) {
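The widened destructor body suggests the quant-parameter arrays were previously never released on this kernel's teardown path; pairing FreeTmpBuffer() with FreeQuantParam() closes that leak. A minimal sketch of the teardown pattern (member functions are stand-ins for the ones the real kernel inherits):

    class Int8KernelSketch {
     public:
      ~Int8KernelSketch() {
        FreeTmpBuffer();   // sliding-window / packing scratch memory
        FreeQuantParam();  // quant scale and zero-point arrays
      }

     private:
      void FreeTmpBuffer() { /* free buffers, reset pointers to null */ }
      void FreeQuantParam() { /* free per-channel quant args */ }
    };

    int main() {
      Int8KernelSketch kernel;  // both cleanups run when it goes out of scope
      return 0;
    }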
@@ -49,7 +49,7 @@ int PoolingInt8CPUKernel::ReSize() {
     MS_LOG(ERROR) << "PoolingBase Init failed.";
     return ret;
   }
-  SetQuantParam();
+  ret = SetQuantParam();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Set pooling quant param failed.";
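The old call discarded SetQuantParam()'s status, so a failed quant-param setup went undetected; the fix routes the status through the existing error path. A stubbed sketch of the control flow (the RET_* values and the stub are assumptions, not the kernel's actual definitions):

    #include <cstdio>

    constexpr int RET_OK = 0;
    constexpr int RET_ERROR = 1;

    int SetQuantParam() { return RET_OK; }  // stub; the real call can fail

    int ReSizeSketch() {
      int ret = SetQuantParam();
      if (ret != RET_OK) {
        printf("Set pooling quant param failed.\n");
        return ret;  // surface the failure instead of swallowing it
      }
      return RET_OK;
    }

    int main() { return ReSizeSketch(); }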
@@ -262,6 +262,12 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
   int packed_input_size = output_tile_count * tile_n * unit_size;
+  int input_sum_offset;
+  if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
+    input_sum_offset = tile_n * out_channel;
+  } else {
+    input_sum_offset = tile_n;
+  }

   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
@@ -270,7 +276,7 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
-      int32_t *tmp_input_sum = input_sum + task_id * tile_n;
+      int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
       int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
       // clear tmp buffer before compute
       memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
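This stride change is the heart of the per-channel fix: with FILTER_PER_CHANNEL quantization each of a tile's tile_n output pixels carries one weight-zero-point correction per output channel, so every thread owns tile_n * out_channel entries of input_sum, not tile_n. Indexing by task_id * tile_n, as before, made threads overlap in the per-channel case. An illustrative dump of the per-thread slice starts (sizes assumed); ConvInt8Opt below receives the identical change:

    #include <cstdio>

    int main() {
      const int tile_n = 4, out_channel = 8;  // assumed sizes
      for (int per_channel = 0; per_channel <= 1; ++per_channel) {
        int stride = per_channel ? tile_n * out_channel : tile_n;
        for (int task_id = 0; task_id < 3; ++task_id) {
          printf("per_channel=%d thread %d: input_sum slice starts at %d\n",
                 per_channel, task_id, task_id * stride);
        }
      }
      return 0;
    }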
@@ -317,6 +323,12 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
   int packed_input_size = output_tile_count * tile_n * unit_size;
+  int input_sum_offset;
+  if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
+    input_sum_offset = tile_n * out_channel;
+  } else {
+    input_sum_offset = tile_n;
+  }

   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
@@ -325,7 +337,7 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
-      int32_t *tmp_input_sum = input_sum + task_id * tile_n;
+      int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
       int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
       // clear tmp buffer before compute
       memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);