forked from mindspore-Ecosystem/mindspore
!11977 [MS][LITE][CPU]change the parallel strategy for fp16 winograd
From: @fuzhiye Reviewed-by: @zhang_xue_tong,@hangangqiang Signed-off-by: @zhang_xue_tong
This commit is contained in:
commit
ca3f916c1e
|
@ -160,7 +160,9 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
|
|||
int out_w_block = UP_DIV(conv_param->output_w_, conv_param->output_unit_);
|
||||
int out_h_block = UP_DIV(conv_param->output_h_, conv_param->output_unit_);
|
||||
int output_count = out_w_block * out_h_block;
|
||||
int output_tile_count = UP_DIV(output_count, tile_num);
|
||||
int per_thread_num = UP_DIV(output_count, conv_param->thread_num_);
|
||||
int real_tile = per_thread_num < tile_num ? per_thread_num : tile_num;
|
||||
int output_tile_count = UP_DIV(output_count, real_tile);
|
||||
int oc8 = UP_DIV(conv_param->output_channel_, C8NUM);
|
||||
int input_unit_square = conv_param->input_unit_ * conv_param->input_unit_;
|
||||
|
||||
|
@ -178,9 +180,12 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
|
|||
int in_batch_offset = b * in_channel * conv_param->input_h_ * conv_param->input_w_;
|
||||
int out_batch_offset = b * conv_param->output_channel_ * conv_param->output_h_ * conv_param->output_w_;
|
||||
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += conv_param->thread_num_) {
|
||||
int out_tile_index = thread_id * tile_num;
|
||||
int cal_num = output_count - thread_id * tile_num;
|
||||
cal_num = cal_num > tile_num ? tile_num : cal_num;
|
||||
int out_tile_index = thread_id * real_tile;
|
||||
int cal_num = output_count - thread_id * real_tile;
|
||||
cal_num = cal_num > real_tile ? real_tile : cal_num;
|
||||
if (cal_num <= 0) {
|
||||
return;
|
||||
}
|
||||
WinogradInputTransformFp16(input_data + in_batch_offset, trans_input + task_id * trans_input_offset,
|
||||
tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param,
|
||||
in_func);
|
||||
|
@ -189,7 +194,7 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
|
|||
float16_t *dst_ptr = gemm_out + task_id * gemm_out_offset;
|
||||
float16_t *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
|
||||
for (int i = 0; i < input_unit_square; ++i) {
|
||||
RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, tile_num, in_channel);
|
||||
RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, cal_num, in_channel);
|
||||
MatMulFp16(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
|
||||
cal_num, oc8 * C8NUM, input_unit_square, OutType_TileC8);
|
||||
}
|
||||
|
|
|
@ -16,12 +16,11 @@
|
|||
|
||||
#include "nnacl/fp16/matmul_fp16.h"
|
||||
|
||||
void ColMajor2Row8MajorFp16(const void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16) {
|
||||
static void Col2Row8SrcFromFp16(const void *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
|
||||
int row_c8 = row / C8NUM * C8NUM;
|
||||
int col_c8 = col / C8NUM * C8NUM;
|
||||
int ci = 0;
|
||||
if (src_float16) {
|
||||
const float16_t *src = (const float16_t *)src_ptr;
|
||||
int ci = 0;
|
||||
for (; ci < col_c8; ci += C8NUM) {
|
||||
int ri = 0;
|
||||
for (; ri < row_c8; ri += C8NUM) {
|
||||
|
@ -107,7 +106,12 @@ void ColMajor2Row8MajorFp16(const void *src_ptr, float16_t *dst_ptr, size_t row,
|
|||
dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = src[tc * row + r];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
}
|
||||
|
||||
static void Col2Row8SrcFromFp32(const void *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
|
||||
int row_c8 = row / C8NUM * C8NUM;
|
||||
int col_c8 = col / C8NUM * C8NUM;
|
||||
int ci = 0;
|
||||
const float *src = (const float *)src_ptr;
|
||||
for (; ci < col_c8; ci += C8NUM) {
|
||||
int ri = 0;
|
||||
|
@ -212,6 +216,13 @@ void ColMajor2Row8MajorFp16(const void *src_ptr, float16_t *dst_ptr, size_t row,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Public entry point for the column-major -> row-8-major fp16 packing.
// It does no packing itself: it only selects one of the two file-local helpers
// based on src_float16 and forwards all other arguments unchanged.
//   src_ptr     - untyped source buffer; its element type is selected by src_float16
//   dst_ptr     - destination buffer of float16_t
//   row, col    - dimensions passed straight through to the helper
//   src_float16 - true: use Col2Row8SrcFromFp16; false: use Col2Row8SrcFromFp32
void ColMajor2Row8MajorFp16(const void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16) {
|
||||
if (src_float16) {
|
||||
// presumably the helper reads src_ptr as float16_t - confirm against its body
Col2Row8SrcFromFp16(src_ptr, dst_ptr, row, col);
|
||||
} else {
|
||||
// the fp32 helper casts src_ptr to const float * before packing
Col2Row8SrcFromFp32(src_ptr, dst_ptr, row, col);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -274,21 +285,7 @@ void MatVecMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const f
|
|||
MatVecMulFp16Neon64(a, b, c, bias, (int)act_type, depth, col);
|
||||
}
|
||||
|
||||
void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
|
||||
size_t row_up_16 = UP_ROUND(row, C16NUM);
|
||||
size_t row16 = row / C16NUM * C16NUM;
|
||||
size_t col8 = col / C8NUM * C8NUM;
|
||||
const float16_t *src_r = src_ptr;
|
||||
float16_t *dst_r = dst_ptr;
|
||||
|
||||
size_t ri = 0;
|
||||
for (; ri < row16; ri += C16NUM) {
|
||||
size_t ci = 0;
|
||||
for (; ci < col8; ci += C8NUM) {
|
||||
const float16_t *src_c = src_r + ci;
|
||||
float16_t *dst_c = dst_r + ci * C16NUM;
|
||||
|
||||
#ifdef ENABLE_ARM64
|
||||
static void Row2Col16Block16(const float16_t *src_ptr, float16_t *dst_ptr, size_t col) {
|
||||
size_t stride = col * 2;
|
||||
asm volatile(
|
||||
"mov x10, %[src_c]\n"
|
||||
|
@ -390,10 +387,27 @@ void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si
|
|||
"st1 {v27.8h}, [x11], #16\n"
|
||||
"st1 {v31.8h}, [x11], #16\n"
|
||||
:
|
||||
: [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
|
||||
: [ dst_c ] "r"(dst_ptr), [ src_c ] "r"(src_ptr), [ stride ] "r"(stride)
|
||||
: "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
|
||||
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
|
||||
"v30", "v31");
|
||||
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||
"v31");
|
||||
}
|
||||
|
||||
void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
|
||||
size_t row_up_16 = UP_ROUND(row, C16NUM);
|
||||
size_t row16 = row / C16NUM * C16NUM;
|
||||
size_t col8 = col / C8NUM * C8NUM;
|
||||
const float16_t *src_r = src_ptr;
|
||||
float16_t *dst_r = dst_ptr;
|
||||
size_t ri = 0;
|
||||
// find 16 block unit
|
||||
for (; ri < row16; ri += C16NUM) {
|
||||
size_t ci = 0;
|
||||
for (; ci < col8; ci += C8NUM) {
|
||||
const float16_t *src_c = src_r + ci;
|
||||
float16_t *dst_c = dst_r + ci * C16NUM;
|
||||
#ifdef ENABLE_ARM64
|
||||
Row2Col16Block16(src_c, dst_c, col);
|
||||
#else
|
||||
for (int tr = 0; tr < C16NUM; tr++) {
|
||||
for (int tc = 0; tc < C8NUM; tc++) {
|
||||
|
@ -413,7 +427,7 @@ void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si
|
|||
dst_r += C16NUM * col;
|
||||
}
|
||||
for (; ri < row; ri++) {
|
||||
for (size_t i = 0; i < col; i++) {
|
||||
for (size_t i = 0; i < col; ++i) {
|
||||
dst_r[i * C16NUM] = src_r[i];
|
||||
}
|
||||
src_r += col;
|
||||
|
|
|
@ -40,6 +40,9 @@ void ConvFp32(const float *input_data, float *packed_input, const float *packed_
|
|||
for (int thread_id = task_id; thread_id < output_tile_count; thread_id += conv_param->thread_num_) {
|
||||
int start_index = thread_id * cal_num;
|
||||
int real_cal_num = (output_count - start_index) < cal_num ? (output_count - start_index) : cal_num;
|
||||
if (real_cal_num <= 0) {
|
||||
return;
|
||||
}
|
||||
float *gemm_input = packed_input + task_id * deep * cal_num;
|
||||
float *col_major_gemm_input = col_major_input + task_id * deep * cal_num;
|
||||
size_t packed_input_size = deep * cal_num * sizeof(float);
|
||||
|
|
|
@ -56,6 +56,9 @@ void ConvWinogardFp32(const float *input_data, const float *trans_weight, const
|
|||
int out_tile_index = thread_id * tile_num;
|
||||
int cal_num = output_count - out_tile_index;
|
||||
cal_num = cal_num > tile_num ? tile_num : cal_num;
|
||||
if (cal_num <= 0) {
|
||||
return;
|
||||
}
|
||||
WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset,
|
||||
tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param,
|
||||
in_func);
|
||||
|
|
|
@ -36,7 +36,6 @@ ConvolutionBaseCPUKernel::~ConvolutionBaseCPUKernel() {
|
|||
}
|
||||
|
||||
void ConvolutionBaseCPUKernel::FreeQuantParam() {
|
||||
ConvQuantArg *conv_quant_arg_ = &conv_param_->conv_quant_arg_;
|
||||
if (conv_quant_arg_ == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -44,7 +44,10 @@ class ConvolutionDelegateFP16CPUKernel : public LiteKernel {
|
|||
void FreeCopiedData();
|
||||
int Init() override;
|
||||
int ReSize() override;
|
||||
int Run() override { return fp16_conv_kernel_->Run(); }
|
||||
// Runs the wrapped fp16 convolution kernel and returns its status code.
// The wrapper's name_ is copied onto the wrapped kernel first on every call.
// NOTE(review): presumably so logs/profiling from the inner kernel carry this node's name - confirm.
int Run() override {
|
||||
fp16_conv_kernel_->set_name(name_);
|
||||
return fp16_conv_kernel_->Run();
|
||||
}
|
||||
|
||||
private:
|
||||
uint8_t need_free_ = 0b00;
|
||||
|
|
|
@ -102,6 +102,13 @@ int ConvolutionFP16CPUKernel::Init() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
// Recomputes thread_count_ from the first output tensor's spatial size and
// mirrors the result into conv_param_->thread_num_.
// The count is limited by both the context's thread_num_ and by
// UP_DIV(Height * Width, C16NUM).
// NOTE(review): UP_DIV is presumably a round-up division and MSMIN a minimum - confirm against the macro definitions.
void ConvolutionFP16CPUKernel::AdjustNumberOfThread() {
|
||||
auto out_tensor = out_tensors_.front();
|
||||
// number of output positions in one H x W plane
int out_plane = out_tensor->Height() * out_tensor->Width();
|
||||
thread_count_ = MSMIN(ctx_->thread_num_, UP_DIV(out_plane, C16NUM));
|
||||
// keep the parameter struct in sync with the value stored on the kernel
conv_param_->thread_num_ = thread_count_;
|
||||
}
|
||||
|
||||
int ConvolutionFP16CPUKernel::ReSize() {
|
||||
auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
|
||||
if (ret != RET_OK) {
|
||||
|
|
|
@ -44,6 +44,7 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
|
|||
int RunImpl(int task_id);
|
||||
int InitWeightBias();
|
||||
int InitTmpBuffer();
|
||||
void AdjustNumberOfThread();
|
||||
|
||||
private:
|
||||
void FreeTmpBuffer() {
|
||||
|
|
|
@ -108,7 +108,6 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
|
|||
int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
|
||||
const int cal_num = 16;
|
||||
int channel_out = conv_param_->output_channel_;
|
||||
int oc8 = UP_DIV(channel_out, C8NUM);
|
||||
|
||||
size_t tile_buffer_size =
|
||||
thread_count_ * cal_num * input_unit_ * input_unit_ * conv_param_->input_channel_ * sizeof(float16_t);
|
||||
|
@ -118,8 +117,8 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
|
|||
return RET_ERROR;
|
||||
}
|
||||
|
||||
gemm_out_ = reinterpret_cast<float16_t *>(
|
||||
ctx_->allocator->Malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t)));
|
||||
gemm_out_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(
|
||||
thread_count_ * cal_num * input_unit_ * input_unit_ * UP_ROUND(channel_out, C8NUM) * sizeof(float16_t)));
|
||||
if (gemm_out_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc gemm_out_ failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -174,6 +173,13 @@ int ConvolutionWinogradFP16CPUKernel::Init() {
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
// Recomputes thread_count_ for the winograd kernel and mirrors it into
// conv_param_->thread_num_.
// The work size is counted in output_unit_-sized blocks along Height and Width
// (not in raw output positions), then divided by C8NUM via UP_DIV; the result is
// limited by the context's thread_num_.
// NOTE(review): UP_DIV is presumably a round-up division and MSMIN a minimum - confirm against the macro definitions.
// NOTE(review): the divisor here is C8NUM while ConvolutionFP16CPUKernel::AdjustNumberOfThread uses C16NUM - confirm this is intended.
void ConvolutionWinogradFP16CPUKernel::AdjustNumberOfThread() {
|
||||
auto out_tensor = out_tensors_.front();
|
||||
// blocks along height multiplied by blocks along width
int cal_plane = UP_DIV(out_tensor->Height(), output_unit_) * UP_DIV(out_tensor->Width(), output_unit_);
|
||||
thread_count_ = MSMIN(ctx_->thread_num_, UP_DIV(cal_plane, C8NUM));
|
||||
// keep the parameter struct in sync with the value stored on the kernel
conv_param_->thread_num_ = thread_count_;
|
||||
}
|
||||
|
||||
int ConvolutionWinogradFP16CPUKernel::ReSize() {
|
||||
auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
|
||||
if (ret != RET_OK) {
|
||||
|
@ -190,6 +196,7 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
|
|||
MS_LOG(ERROR) << "ConfigInputOutput failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
AdjustNumberOfThread();
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -52,6 +52,7 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
|
|||
int InitTmpBuffer();
|
||||
int ConfigInputOutput();
|
||||
int WinogradFilterTransformFp16(const float16_t *weight_data, float *matrix_g, float *matrix_gt, int oc_block);
|
||||
void AdjustNumberOfThread();
|
||||
|
||||
private:
|
||||
void FreeTmpBuffer() {
|
||||
|
|
|
@ -48,16 +48,9 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
|
|||
conv_param_->input_channel_ = in_channel;
|
||||
conv_param_->output_channel_ = out_channel;
|
||||
|
||||
int oc4 = UP_DIV(out_channel, C4NUM);
|
||||
#ifdef ENABLE_AVX
|
||||
const int oc_block = C16NUM;
|
||||
#else
|
||||
const int oc_block = C8NUM;
|
||||
#endif
|
||||
int oc_block_num = UP_DIV(out_channel, oc_block);
|
||||
|
||||
// set data
|
||||
auto trans_matrix_data_size = input_unit_ * input_unit_ * in_channel * oc_block_num * oc_block * sizeof(float);
|
||||
auto trans_matrix_data_size =
|
||||
input_unit_ * input_unit_ * in_channel * UP_ROUND(out_channel, oc_block_) * sizeof(float);
|
||||
if (trans_weight_ == nullptr) {
|
||||
trans_weight_ = reinterpret_cast<float *>(malloc(trans_matrix_data_size));
|
||||
if (trans_weight_ == nullptr) {
|
||||
|
@ -83,14 +76,15 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
|
|||
MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
|
||||
return ret;
|
||||
}
|
||||
ret = WinogradFilterTransform(origin_weight_, matrix_g, matrix_gt, oc_block);
|
||||
ret = WinogradFilterTransform(origin_weight_, matrix_g, matrix_gt, oc_block_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "winograd filter transform failed.";
|
||||
return ret;
|
||||
}
|
||||
|
||||
// init bias
|
||||
size_t new_bias_size = oc4 * C4NUM * sizeof(float);
|
||||
size_t new_bias_size = UP_ROUND(out_channel, C4NUM) * sizeof(float);
|
||||
bias_data_ = malloc(new_bias_size);
|
||||
if (bias_data_ == nullptr) {
|
||||
bias_data_ = reinterpret_cast<float *>(malloc(new_bias_size));
|
||||
if (bias_data_ == nullptr) {
|
||||
|
@ -98,31 +92,30 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
|
|||
return RET_MEMORY_FAILED;
|
||||
}
|
||||
}
|
||||
memset(bias_data_, 0, new_bias_size);
|
||||
if (in_tensors_.size() == kInputSize2) {
|
||||
memcpy(bias_data_, origin_bias_, out_channel * sizeof(float));
|
||||
size_t origin_size = out_channel * sizeof(float);
|
||||
memcpy(bias_data_, origin_bias_, origin_size);
|
||||
memset(reinterpret_cast<float *>(bias_data_) + out_channel, 0, new_bias_size - origin_size);
|
||||
} else {
|
||||
MS_ASSERT(in_tensors_.size() == kInputSize1);
|
||||
memset(bias_data_, 0, new_bias_size);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
|
||||
int channel_out = conv_param_->output_channel_;
|
||||
int oc8 = UP_DIV(channel_out, C8NUM);
|
||||
int tile_num = C12NUM;
|
||||
MS_ASSERT(ctx_->allocator != nullptr);
|
||||
|
||||
size_t tile_buffer_size =
|
||||
thread_count_ * tile_num * input_unit_ * input_unit_ * conv_param_->input_channel_ * sizeof(float);
|
||||
thread_count_ * tile_num_ * input_unit_ * input_unit_ * conv_param_->input_channel_ * sizeof(float);
|
||||
trans_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
|
||||
if (trans_input_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc trans_input_ failed.";
|
||||
return RET_MEMORY_FAILED;
|
||||
}
|
||||
|
||||
int oc8 = UP_ROUND(conv_param_->output_channel_, C8NUM);
|
||||
gemm_out_ = reinterpret_cast<float *>(
|
||||
ctx_->allocator->Malloc(thread_count_ * tile_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float)));
|
||||
ctx_->allocator->Malloc(thread_count_ * tile_num_ * input_unit_ * input_unit_ * oc8 * sizeof(float)));
|
||||
if (gemm_out_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc gemm_out_ failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -136,7 +129,7 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
|
|||
}
|
||||
|
||||
col_buffer_ = reinterpret_cast<float *>(
|
||||
ctx_->allocator->Malloc(thread_count_ * tile_num * conv_param_->input_channel_ * sizeof(float)));
|
||||
ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->input_channel_ * sizeof(float)));
|
||||
if (col_buffer_ == nullptr) {
|
||||
MS_LOG(ERROR) << "malloc col_buffer_ failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -164,10 +157,17 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() {
|
|||
}
|
||||
|
||||
int ConvolutionWinogradCPUKernel::Init() {
|
||||
tile_num_ = C12NUM;
|
||||
#ifdef ENABLE_AVX
|
||||
oc_block_ = C16NUM;
|
||||
#else
|
||||
oc_block_ = C8NUM;
|
||||
#endif
|
||||
kernel_unit_ = conv_param_->kernel_h_;
|
||||
input_unit_ = output_unit_ + kernel_unit_ - 1;
|
||||
conv_param_->input_unit_ = input_unit_;
|
||||
conv_param_->output_unit_ = output_unit_;
|
||||
|
||||
auto ret = InitWeightBias();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Init weight bias failed.";
|
||||
|
@ -197,8 +197,8 @@ int ConvolutionWinogradCPUKernel::ReSize() {
|
|||
|
||||
int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
|
||||
auto input_tensor = in_tensors_.at(kInputIndex);
|
||||
auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
|
||||
auto output_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
|
||||
auto ori_input_data = reinterpret_cast<float *>(input_tensor->data_c());
|
||||
auto output_data = reinterpret_cast<float *>(out_tensors_.front()->data_c());
|
||||
ConvWinogardFp32(ori_input_data, trans_weight_, reinterpret_cast<const float *>(bias_data_), output_data,
|
||||
tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
|
||||
return RET_OK;
|
||||
|
|
|
@ -70,9 +70,11 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
|
|||
col_buffer_ = nullptr;
|
||||
}
|
||||
}
|
||||
int kernel_unit_;
|
||||
int input_unit_;
|
||||
int kernel_unit_{0};
|
||||
int input_unit_{0};
|
||||
int output_unit_;
|
||||
int oc_block_{0};
|
||||
int tile_num_{0};
|
||||
float *origin_weight_; // do not free
|
||||
float *origin_bias_; // do not free
|
||||
float *tmp_data_ = nullptr;
|
||||
|
|
Loading…
Reference in New Issue