!30297 [lite] optimize matmul's logic

Merge pull request !30297 from 徐安越/master_core
i-robot 2022-02-22 02:20:55 +00:00 committed by Gitee
commit 403ab94932
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
4 changed files with 283 additions and 396 deletions

mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -27,17 +27,17 @@ namespace mindspore::kernel {
int FullconnectionCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
CHECK_LESS_RETURN(out_tensors_.size(), 1);
params_->a_const_ = in_tensors_[kInputIndex]->IsConst() && !op_parameter_->is_train_session_;
params_->b_const_ = in_tensors_[kWeightIndex]->IsConst() && !op_parameter_->is_train_session_;
MatmulFp32BaseCPUKernel::InitParameter();
if (params_->a_const_) {
if (params_->a_const_ || InferShapeDone()) {
auto a_shape = in_tensors_.at(0)->shape();
CHECK_LESS_RETURN(a_shape.size(), C2NUM);
params_->row_ = a_shape[0];
params_->deep_ = a_shape[1];
}
if (params_->b_const_) {
if (params_->b_const_ || InferShapeDone()) {
auto b_shape = in_tensors_.at(1)->shape();
CHECK_LESS_RETURN(b_shape.size(), C2NUM);
params_->col_ = b_shape[0];
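
Both FullConnection and the plain Matmul kernel (next file) now gate their Prepare-time shape setup on `a_const_ || InferShapeDone()`: the shape-derived fields are filled in whenever the shape is already known at Prepare, not only when the corresponding input is constant, and the work is otherwise left to ReSize. A minimal sketch of that gating, with a hypothetical ToyKernel standing in for the real kernel interface:

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the kernel: shows the "initialize now if the
// shape is already known, otherwise defer to ReSize" gating.
struct ToyKernel {
  bool a_const = false;         // input 0 is a constant tensor
  bool shape_inferred = false;  // plays the role of InferShapeDone()
  std::vector<int> a_shape;
  int row = 0, deep = 0;

  void Prepare() {
    // Mirrors: if (params_->a_const_ || InferShapeDone()) { ... }
    if (a_const || shape_inferred) {
      row = a_shape[0];
      deep = a_shape[1];
    }
  }
  void ReSize() {  // runs once the real shapes arrive
    row = a_shape[0];
    deep = a_shape[1];
  }
};

int main() {
  ToyKernel k;
  k.a_shape = {4, 16};
  k.shape_inferred = true;  // shape known even though A is not const
  k.Prepare();
  std::printf("row=%d deep=%d\n", k.row, k.deep);
  return 0;
}
```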

mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -56,13 +56,14 @@ void MatmulCPUKernel::InitShapeB() {
int MatmulCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
CHECK_LESS_RETURN(out_tensors_.size(), 1);
MatmulFp32BaseCPUKernel::InitParameter();
params_->a_const_ = in_tensors_[kInputIndex]->IsConst() && !op_parameter_->is_train_session_;
params_->b_const_ = in_tensors_[kWeightIndex]->IsConst() && !op_parameter_->is_train_session_;
if (params_->a_const_) {
if (params_->a_const_ || InferShapeDone()) {
InitShapeA();
}
if (params_->b_const_) {
if (params_->b_const_ || InferShapeDone()) {
InitShapeB();
}

mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc

@@ -37,164 +37,65 @@ int MatmulRun(const void *cdata, int task_id, float, float) {
}
MatmulFp32BaseCPUKernel::~MatmulFp32BaseCPUKernel() {
FreeResizeBufA();
FreeResizeBufB();
// packed const-matrix will be deleted by the framework.
if (out_need_aligned_ && output_data_ != nullptr) {
free(output_data_);
output_data_ = nullptr;
}
FreeBiasBuf();
}
void MatmulFp32BaseCPUKernel::InitParameter() {
NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[kInputIndex]);
NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[kWeightIndex]);
params_->a_const_ = in_tensors_[kInputIndex]->IsConst();
params_->b_const_ = in_tensors_[kWeightIndex]->IsConst();
if (op_parameter_->is_train_session_) {
params_->a_const_ = false;
params_->b_const_ = false;
if (matrix_c_.pack_ptr != nullptr) {
free(matrix_c_.pack_ptr);
matrix_c_.pack_ptr = nullptr;
}
}
int MatmulFp32BaseCPUKernel::InitBufferA() {
if (a_pack_ptr_ != nullptr) {
return RET_OK;
}
if (vec_matmul_) {
a_pack_ptr_ = reinterpret_cast<float *>(in_tensors().at(0)->data());
} else {
int MatmulFp32BaseCPUKernel::BackupConstMatrix(MatrixInfo *matrix_info, int index) {
MS_CHECK_TRUE_MSG(index < static_cast<int>(in_tensors_.size()), RET_ERROR, "matrix does not exist.");
auto element_num = in_tensors_[index]->ElementsNum();
MS_CHECK_TRUE_MSG(element_num > 0, RET_ERROR, "matrix is invalid.");
matrix_info->origin_ptr = reinterpret_cast<float *>(ms_context_->allocator->Malloc(element_num * sizeof(float)));
MS_CHECK_TRUE_MSG(matrix_info->origin_ptr != nullptr, RET_ERROR, "matrix is invalid.");
auto src_ptr = in_tensors_[index]->data();
MS_CHECK_TRUE_MSG(src_ptr != nullptr, RET_ERROR, "matrix is invalid.");
memcpy(matrix_info->origin_ptr, src_ptr, element_num * sizeof(float));
matrix_info->has_origin = true;
return RET_OK;
}
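
BackupConstMatrix exists for the case where a matrix is constant but shape inference has not completed at Prepare: the raw data is copied aside so it can still be packed later, once ReSize has fixed the shapes. A stripped-down sketch of the same idea, using malloc in place of the framework allocator:

```cpp
#include <cstdlib>
#include <cstring>

// Simplified mirror of the fields BackupConstMatrix touches.
struct MatrixInfo {
  bool has_origin = false;
  float *origin_ptr = nullptr;
};

// Copy `count` floats out of a const tensor so packing can still happen
// later, after the output shapes are finally known.
bool BackupConstMatrix(MatrixInfo *m, const float *src, int count) {
  if (src == nullptr || count <= 0) return false;
  m->origin_ptr = static_cast<float *>(std::malloc(count * sizeof(float)));
  if (m->origin_ptr == nullptr) return false;
  std::memcpy(m->origin_ptr, src, count * sizeof(float));
  m->has_origin = true;
  return true;
}

int main() {
  float weights[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  MatrixInfo m;
  bool ok = BackupConstMatrix(&m, weights, 4);
  std::free(m.origin_ptr);
  return ok ? 0 : 1;
}
```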
int MatmulFp32BaseCPUKernel::PackMatrixA() {
if (!params_->a_const_) {
if (!matrix_a_.need_pack) {
matrix_a_.pack_ptr = reinterpret_cast<float *>(in_tensors_[FIRST_INPUT]->data());
return RET_OK;
}
if (op_parameter_->is_train_session_) {
a_pack_ptr_ = reinterpret_cast<float *>(workspace());
matrix_a_.pack_ptr = reinterpret_cast<float *>(workspace());
} else {
#ifdef SERVER_INFERENCE
if (!params_->a_const_) {
a_pack_ptr_ = reinterpret_cast<float *>(
ms_context_->allocator->Malloc(static_cast<size_t>(matrix_a_pack_size_) * sizeof(float)));
} else {
auto a_packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
in_tensors()[0], static_cast<size_t>(matrix_a_pack_size_) * sizeof(float));
a_pack_ptr_ = reinterpret_cast<float *>(a_packed.second);
a_is_packed_ = a_packed.first;
}
#else
a_pack_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_pack_size_ * sizeof(float)));
#endif
matrix_a_.pack_ptr =
reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_.pack_size * sizeof(float)));
}
}
if (a_pack_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc a_pack_ptr_ failed";
return RET_ERROR;
}
return RET_OK;
}
int MatmulFp32BaseCPUKernel::InitBufferB() {
if (b_pack_ptr_ != nullptr) {
return RET_OK;
}
if (op_parameter_->is_train_session_) {
b_pack_ptr_ = reinterpret_cast<float *>(workspace()) + matrix_a_pack_size_;
} else {
#ifdef SERVER_INFERENCE
if (params_->b_const_) {
auto b_packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
in_tensors()[1], static_cast<size_t>(matrix_b_pack_size_) * sizeof(float));
b_pack_ptr_ = reinterpret_cast<float *>(b_packed.second);
b_is_packed_ = b_packed.first;
} else {
b_pack_ptr_ = reinterpret_cast<float *>(
ms_context_->allocator->Malloc(static_cast<size_t>(matrix_b_pack_size_) * sizeof(float)));
auto a_packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
in_tensors()[FIRST_INPUT], static_cast<size_t>(matrix_a_.pack_size) * sizeof(float));
matrix_a_.pack_ptr = reinterpret_cast<float *>(a_packed.second);
if (a_packed.first == lite::PACKED) {
return RET_OK;
}
if (a_packed.first == lite::MALLOC && matrix_a_.pack_ptr == nullptr) {
matrix_a_.pack_ptr = reinterpret_cast<float *>(
ms_context_->allocator->Malloc(static_cast<size_t>(matrix_a_.pack_size) * sizeof(float)));
}
#else
b_pack_ptr_ = reinterpret_cast<float *>(
ms_context_->allocator->Malloc(static_cast<size_t>(matrix_b_pack_size_) * sizeof(float)));
matrix_a_.pack_ptr = reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_.pack_size * sizeof(float)));
#endif
}
if (b_pack_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc b_pack_ptr_ failed";
return RET_ERROR;
}
return RET_OK;
}
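
Under SERVER_INFERENCE, packed constant matrices come from lite::PackWeightManager, whose GetPackedTensor returns a (status, pointer) pair: lite::PACKED means another instance already packed this weight and the buffer can be reused as-is, lite::MALLOC means a fresh buffer was handed out and the caller must fill it. A rough sketch of such a shared pack cache, assuming a simplified status enum and keying on the origin pointer:

```cpp
#include <cstddef>
#include <cstdlib>
#include <map>
#include <utility>

enum PackStatus { PACKED, MALLOC };  // simplified subset of lite::PackStatus

// Process-wide cache: one packed buffer per original weight, so several
// kernel instances can share a single packed copy of the same tensor.
class PackCache {
 public:
  std::pair<PackStatus, float *> GetPackedTensor(const void *origin, std::size_t bytes) {
    auto it = cache_.find(origin);
    if (it != cache_.end()) return {PACKED, it->second};  // reuse as-is
    float *buf = static_cast<float *>(std::malloc(bytes));
    if (buf != nullptr) cache_[origin] = buf;
    return {MALLOC, buf};  // fresh buffer: caller must pack into it
  }

 private:
  std::map<const void *, float *> cache_;
};

int main() {
  PackCache cache;
  float w[8] = {};
  auto first = cache.GetPackedTensor(w, sizeof(w));   // MALLOC: pack it
  auto second = cache.GetPackedTensor(w, sizeof(w));  // PACKED: skip packing
  bool ok = first.first == MALLOC && second.first == PACKED;
  std::free(first.second);
  return ok ? 0 : 1;
}
```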
int MatmulFp32BaseCPUKernel::CalBroadCastBiasDataElements() {
lite::Tensor *bias_tensor = in_tensors_.at(2);
int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
if (!params_->b_const_) {
MS_LOG(WARNING) << "matmul does not support broadcast bias data";
} else {
lite::Tensor *const_tensor = in_tensors_.at(1);
size_t shape_size = const_tensor->shape().size();
if (params_->b_transpose_) {
MS_CHECK_TRUE_RET(shape_size >= kBiasIndex, max_bias_data);
max_bias_data = UP_ROUND(const_tensor->shape()[shape_size - kBiasIndex], col_tile_);
} else {
MS_CHECK_TRUE_RET(shape_size >= kWeightIndex, max_bias_data);
max_bias_data = UP_ROUND(const_tensor->shape()[shape_size - kWeightIndex], col_tile_);
}
}
return max_bias_data;
}
int MatmulFp32BaseCPUKernel::InitBiasData() {
if (in_tensors_.size() != FOURTH_INPUT) {
return RET_OK;
}
auto bias_tensor = in_tensors_[THIRD_INPUT];
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "bias_tensor invalid";
return RET_ERROR;
}
if (bias_tensor->data() == nullptr) {
MS_LOG(ERROR) << "bias_tensor data invalid";
return RET_ERROR;
}
auto bias_num = static_cast<size_t>(bias_tensor->ElementsNum());
MS_CHECK_TRUE_RET(bias_num > 0, RET_ERROR);
if (bias_num == 1) {
// broadcast bias data
size_t max_bias_data = CalBroadCastBiasDataElements();
bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * sizeof(float)));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
// broadcast bias data
for (size_t i = 0; i < max_bias_data; ++i) {
bias_ptr_[i] = broadcast_data;
}
return RET_OK;
}
auto max_bias_data = static_cast<size_t>(UP_ROUND(bias_num, col_tile_));
// malloc addr needs to be aligned to 32 bytes
bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
if (bias_ptr_ == nullptr) {
MS_LOG(ERROR) << "malloc bias_ptr_ failed";
return RET_ERROR;
}
CHECK_NULL_RETURN(bias_tensor->data());
(void)memcpy(bias_ptr_, bias_tensor->data(), bias_num * static_cast<int>(sizeof(float)));
memset(bias_ptr_ + bias_num, 0, (max_bias_data - bias_num) * sizeof(float));
return RET_OK;
}
int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) const {
if (in_tensors().at(FIRST_INPUT) == nullptr || in_tensors().at(FIRST_INPUT)->data_type() != kNumberTypeFloat32) {
MS_LOG(ERROR) << "Invalid param in matmul";
return RET_ERROR;
}
CHECK_NULL_RETURN(src_ptr);
if (vec_matmul_) {
return RET_OK;
}
auto src_ptr =
matrix_a_.has_origin ? matrix_a_.origin_ptr : reinterpret_cast<float *>(in_tensors_[FIRST_INPUT]->data());
MS_CHECK_TRUE_MSG(src_ptr != nullptr, RET_ERROR, "matrix-a source ptr is a nullptr.");
MS_CHECK_TRUE_MSG(matrix_a_.pack_ptr != nullptr, RET_ERROR, "matrix-a pack ptr is a nullptr.");
for (int i = 0; i < a_batch_; i++) {
const float *src = src_ptr + i * params_->deep_ * params_->row_;
float *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
float *dst = matrix_a_.pack_ptr + i * params_->deep_ * params_->row_align_;
if (params_->a_transpose_) {
matrix_a_pack_fun_(src, dst, params_->deep_, params_->row_);
} else {
@@ -204,16 +105,48 @@ int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) const {
return RET_OK;
}
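
The packing loops walk the batches with different strides on each side: deep_ * row_ floats per batch in the source versus deep_ * row_align_ in the packed buffer, with matrix_a_pack_fun_ transposing or copying each block. A toy packer showing those strides (the real pack functions are SIMD tile packers, not this plain transpose):

```cpp
#include <vector>

// Toy packer: store a row-major (row x deep) block column-major, padding
// each column out to row_align rows. Stands in for matrix_a_pack_fun_.
static void PackBlock(const float *src, float *dst, int row, int deep, int row_align) {
  for (int d = 0; d < deep; ++d) {
    for (int r = 0; r < row; ++r) {
      dst[d * row_align + r] = src[r * deep + d];
    }
  }
}

// Batch loop mirroring the one above: source stride deep * row, packed
// stride deep * row_align.
void PackA(const float *src, float *dst, int batch, int row, int row_align, int deep) {
  for (int i = 0; i < batch; ++i) {
    PackBlock(src + i * deep * row, dst + i * deep * row_align, row, deep, row_align);
  }
}

int main() {
  const int batch = 2, row = 3, deep = 4, row_align = 12;  // row rounded up to C12NUM
  std::vector<float> src(batch * row * deep, 1.0f);
  std::vector<float> dst(batch * deep * row_align, 0.0f);
  PackA(src.data(), dst.data(), batch, row, row_align, deep);
  return 0;
}
```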
int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) const {
if (in_tensors().at(SECOND_INPUT) == nullptr || in_tensors().at(SECOND_INPUT)->data_type() != kNumberTypeFloat32) {
MS_LOG(ERROR) << "Invalid param in matmul";
return RET_ERROR;
void MatmulFp32BaseCPUKernel::FreePackedMatrixA() {
if (matrix_a_.need_pack && !op_parameter_->is_train_session_ && matrix_a_.pack_ptr != nullptr) {
ms_context_->allocator->Free(matrix_a_.pack_ptr);
}
matrix_a_.pack_ptr = nullptr;
}
CHECK_NULL_RETURN(src_ptr);
int MatmulFp32BaseCPUKernel::PackMatrixB() {
if (!params_->b_const_) {
if (!matrix_b_.need_pack) {
matrix_b_.pack_ptr = reinterpret_cast<float *>(in_tensors_[SECOND_INPUT]->data());
return RET_OK;
}
if (op_parameter_->is_train_session_) {
matrix_b_.pack_ptr = reinterpret_cast<float *>(workspace()) + matrix_a_.pack_size;
} else {
matrix_b_.pack_ptr =
reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_b_.pack_size * sizeof(float)));
}
} else {
#ifdef SERVER_INFERENCE
auto b_packed = lite::PackWeightManager::GetInstance()->GetPackedTensor(
in_tensors()[SECOND_INPUT], static_cast<size_t>(matrix_b_.pack_size) * sizeof(float));
matrix_b_.pack_ptr = reinterpret_cast<float *>(b_packed.second);
if (b_packed.first == lite::PACKED) {
return RET_OK;
}
if (b_packed.first == lite::MALLOC && matrix_b_.pack_ptr == nullptr) {
matrix_b_.pack_ptr = reinterpret_cast<float *>(
ms_context_->allocator->Malloc(static_cast<size_t>(matrix_b_.pack_size) * sizeof(float)));
}
#else
matrix_b_.pack_ptr = reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_b_.pack_size * sizeof(float)));
#endif
}
auto src_ptr =
matrix_b_.has_origin ? matrix_b_.origin_ptr : reinterpret_cast<float *>(in_tensors_[SECOND_INPUT]->data());
MS_CHECK_TRUE_MSG(src_ptr != nullptr, RET_ERROR, "matrix-b source ptr is a nullptr.");
MS_CHECK_TRUE_MSG(matrix_b_.pack_ptr != nullptr, RET_ERROR, "matrix-b pack ptr is a nullptr.");
for (int i = 0; i < b_batch_; i++) {
const float *src = src_ptr + i * params_->deep_ * params_->col_;
float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
float *dst = matrix_b_.pack_ptr + i * params_->deep_ * params_->col_align_;
if (params_->b_transpose_) {
matrix_b_pack_fun_(src, dst, params_->col_, params_->deep_);
} else {
@@ -223,37 +156,50 @@ int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) const {
return RET_OK;
}
void MatmulFp32BaseCPUKernel::FreeBiasBuf() {
if (bias_ptr_ != nullptr) {
free(bias_ptr_);
bias_ptr_ = nullptr;
void MatmulFp32BaseCPUKernel::FreePackedMatrixB() {
if (matrix_b_.need_pack && !op_parameter_->is_train_session_ && matrix_b_.pack_ptr != nullptr) {
ms_context_->allocator->Free(matrix_b_.pack_ptr);
}
matrix_b_.pack_ptr = nullptr;
}
void MatmulFp32BaseCPUKernel::FreeResizeBufA() {
if (!vec_matmul_ && !op_parameter_->is_train_session_ && a_pack_ptr_ != nullptr && is_pack_) {
#ifdef SERVER_INFERENCE
if (a_is_packed_ == lite::MALLOC) {
#endif
ms_context_->allocator->Free(a_pack_ptr_);
#ifdef SERVER_INFERENCE
}
#endif
int MatmulFp32BaseCPUKernel::PackBiasMatrix() {
if (in_tensors_.size() != FOURTH_INPUT) {
return RET_OK;
}
a_pack_ptr_ = nullptr;
}
void MatmulFp32BaseCPUKernel::FreeResizeBufB() {
if (!op_parameter_->is_train_session_ && b_pack_ptr_ != nullptr && is_pack_) {
#ifdef SERVER_INFERENCE
if (b_is_packed_ == lite::MALLOC) {
#endif
ms_context_->allocator->Free(b_pack_ptr_);
#ifdef SERVER_INFERENCE
if (matrix_c_.has_packed) {
if (matrix_c_.pack_size < params_->col_align_) {
MS_LOG(ERROR) << "matmul don't support that column is dynamic.";
return RET_ERROR;
}
#endif
return RET_OK;
}
b_pack_ptr_ = nullptr;
auto bias_tensor = in_tensors_[THIRD_INPUT];
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "bias_tensor invalid";
return RET_ERROR;
}
auto bias_src = matrix_c_.has_origin ? matrix_c_.origin_ptr : reinterpret_cast<float *>(bias_tensor->data());
MS_CHECK_TRUE_MSG(bias_src != nullptr, RET_ERROR, "matrix-c is a nullptr.");
auto bias_num = bias_tensor->ElementsNum();
MS_CHECK_TRUE_MSG(bias_num > 0 && params_->col_align_ >= bias_num, RET_ERROR, "matrix-c is invalid.");
matrix_c_.pack_size = params_->col_align_;
matrix_c_.pack_ptr = reinterpret_cast<float *>(malloc(static_cast<size_t>(matrix_c_.pack_size) * sizeof(float)));
MS_CHECK_TRUE_MSG(matrix_c_.pack_ptr != nullptr, RET_ERROR, "matrix-c malloc failed.");
if (bias_num == 1) {
for (int i = 0; i < matrix_c_.pack_size; ++i) {
matrix_c_.pack_ptr[i] = bias_src[0];
}
} else {
(void)memcpy(matrix_c_.pack_ptr, bias_src, bias_num * static_cast<int>(sizeof(float)));
memset(matrix_c_.pack_ptr + bias_num, 0, (matrix_c_.pack_size - bias_num) * sizeof(float));
}
if (matrix_c_.has_origin) {
ms_context_->allocator->Free(matrix_c_.origin_ptr);
matrix_c_.origin_ptr = nullptr;
matrix_c_.has_origin = false;
}
return RET_OK;
}
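
PackBiasMatrix always materializes col_align_ floats for matrix C: a scalar bias is broadcast across the whole row, while a full bias vector is copied and zero-padded up to the aligned width. A minimal sketch, assuming col_align is already rounded up to the tile size:

```cpp
#include <cstring>
#include <vector>

// Build the packed bias (matrix C), col_align floats long: splat a scalar
// bias across the row, or copy a full bias vector and zero-pad the tail.
std::vector<float> PackBias(const float *bias, int bias_num, int col_align) {
  std::vector<float> packed(col_align, 0.0f);
  if (bias_num == 1) {
    for (int i = 0; i < col_align; ++i) packed[i] = bias[0];  // broadcast
  } else {
    std::memcpy(packed.data(), bias, bias_num * sizeof(float));  // tail stays 0
  }
  return packed;
}

int main() {
  const float scalar = 0.5f;
  auto packed = PackBias(&scalar, 1, 8);  // eight copies of 0.5
  return packed[7] == 0.5f ? 0 : 1;
}
```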
int MatmulFp32BaseCPUKernel::ParallelRunByBatch(int task_id) const {
@@ -261,12 +207,12 @@ int MatmulFp32BaseCPUKernel::ParallelRunByBatch(int task_id) const {
int end_batch = MSMIN(params_->batch, start_batch + batch_stride_);
for (int index = start_batch; index < end_batch; ++index) {
const float *a = a_pack_ptr_ + a_offset_[index] * params_->row_align_ * params_->deep_;
const float *b = b_pack_ptr_ + b_offset_[index] * params_->deep_ * params_->col_align_;
const float *a = matrix_a_.pack_ptr + a_offset_[index] * params_->row_align_ * params_->deep_;
const float *b = matrix_b_.pack_ptr + b_offset_[index] * params_->deep_ * params_->col_align_;
float *c = output_data_ + index * params_->row_ * col_step_;
auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_;
if (vec_matmul_) {
auto bias = (matrix_c_.pack_ptr == nullptr) ? nullptr : matrix_c_.pack_ptr;
if (params_->row_ == 1) {
#if defined(ENABLE_AVX) || defined(ENABLE_AVX512)
gemvCalFun(a, b, c, bias, params_->act_type_, params_->deep_, col_step_, params_->col_align_);
#elif defined(ENABLE_ARM64)
@@ -300,17 +246,18 @@ int MatmulFp32BaseCPUKernel::ParallelRunByRow(int task_id) const {
return RET_OK;
}
#if defined(ENABLE_AVX512)
const float *input = a_pack_ptr_ + start_row * params_->deep_;
const float *input = matrix_a_.pack_ptr + start_row * params_->deep_;
float *output = output_data_ + start_row * params_->col_align_;
MatMulAvx512Fp32(input, b_pack_ptr_, output, bias_ptr_, params_->act_type_, params_->deep_, params_->col_align_,
params_->col_align_, row_num);
MatMulAvx512Fp32(input, matrix_b_.pack_ptr, output, matrix_c_.pack_ptr, params_->act_type_, params_->deep_,
params_->col_align_, params_->col_align_, row_num);
#elif defined(ENABLE_AVX)
const float *input = a_pack_ptr_ + start_row * params_->deep_;
const float *input = matrix_a_.pack_ptr + start_row * params_->deep_;
float *output = output_data_ + start_row * params_->col_align_;
MatMulAvxFp32(input, b_pack_ptr_, output, bias_ptr_, params_->act_type_, params_->deep_, params_->col_align_,
params_->col_align_, row_num);
MatMulAvxFp32(input, matrix_b_.pack_ptr, output, matrix_c_.pack_ptr, params_->act_type_, params_->deep_,
params_->col_align_, params_->col_align_, row_num);
#elif defined(ENABLE_ARM64)
GemmIsNotPackByRow(a_pack_ptr_, b_pack_ptr_, output_data_, bias_ptr_, start_row, end_row, params_->deep_);
GemmIsNotPackByRow(matrix_a_.pack_ptr, matrix_b_.pack_ptr, output_data_, matrix_c_.pack_ptr, start_row, end_row,
params_->deep_);
#endif
return RET_OK;
}
@@ -320,12 +267,12 @@ int MatmulFp32BaseCPUKernel::ParallelRunIsNotPackByBatch(int task_id) const {
int start_batch = task_id * batch_stride_;
int end_batch = MSMIN(params_->batch, start_batch + batch_stride_);
float bias = 0;
if (bias_ptr_ != nullptr) {
bias = bias_ptr_[0];
if (matrix_c_.pack_ptr != nullptr) {
bias = matrix_c_.pack_ptr[0];
}
for (int index = start_batch; index < end_batch; ++index) {
const float *a = a_pack_ptr_ + a_offset_[index] * params_->row_ * params_->deep_;
const float *b = b_pack_ptr_ + b_offset_[index] * params_->deep_ * params_->col_;
const float *a = matrix_a_.pack_ptr + a_offset_[index] * params_->row_ * params_->deep_;
const float *b = matrix_b_.pack_ptr + b_offset_[index] * params_->deep_ * params_->col_;
float *c = output_data_ + index * params_->row_ * params_->col_;
gemmIsNotPackFun(a, b, c, &bias, params_->row_, params_->deep_);
}
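
ParallelRunByBatch and ParallelRunIsNotPackByBatch both hand each task a contiguous run of batch_stride_ = UP_DIV(batch, thread_count_) batches, clamping the end with MSMIN. A small standalone demo of that partitioning:

```cpp
#include <algorithm>
#include <cstdio>

// UP_DIV equivalent from nnacl's helper macros.
static int UpDiv(int a, int b) { return (a + b - 1) / b; }

// Prints the [start, end) batch range each task processes, mirroring the
// start_batch/end_batch computation above.
int main() {
  const int batch = 10, thread_num = 4;
  const int batch_stride = UpDiv(batch, thread_num);  // 3
  for (int task_id = 0; task_id < thread_num; ++task_id) {
    int start = task_id * batch_stride;
    int end = std::min(batch, start + batch_stride);  // MSMIN
    std::printf("task %d: batches [%d, %d)\n", task_id, start, end);
  }
  return 0;
}
```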
@@ -339,42 +286,45 @@ int MatmulFp32BaseCPUKernel::ParallelRunByOC(int task_id) const {
if (cur_oc <= 0) {
return RET_OK;
}
auto b = batch_b_ptr_ + current_start_oc * params_->deep_;
auto c = batch_c_ptr_ + current_start_oc;
auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + current_start_oc;
if (vec_matmul_) {
for (int i = 0; i < params_->batch; ++i) {
auto a = matrix_a_.pack_ptr + a_offset_[i] * params_->row_align_ * params_->deep_;
auto b =
matrix_b_.pack_ptr + b_offset_[i] * params_->deep_ * params_->col_align_ + current_start_oc * params_->deep_;
auto c = output_data_ + i * params_->row_ * col_step_ + current_start_oc;
auto bias = (matrix_c_.pack_ptr == nullptr) ? nullptr : matrix_c_.pack_ptr + current_start_oc;
if (params_->row_ == 1) {
#ifdef ENABLE_AVX512
MatVecMulAvx512Fp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_);
MatVecMulAvx512Fp32(a, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_);
#elif defined(ENABLE_AVX)
MatVecMulAvxFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_);
MatVecMulAvxFp32(a, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_);
#elif defined(ENABLE_ARM64)
int rest_align_col = MSMIN(params_->col_align_ - current_start_oc, oc_stride_ * col_tile_);
MatVecMulFp32Neon64(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, rest_align_col);
int rest_align_col = MSMIN(params_->col_align_ - current_start_oc, oc_stride_ * col_tile_);
MatVecMulFp32Neon64(a, b, c, bias, params_->act_type_, params_->deep_, cur_oc, rest_align_col);
#elif defined(ENABLE_ARM32)
MatVecMulFp32Block4(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
MatVecMulFp32Block4(a, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
#else
MatVecMulFp32Block8(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
MatVecMulFp32Block8(a, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
#endif
} else {
} else {
#ifdef ENABLE_AVX512
MatMulAvx512Fp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_,
params_->row_);
MatMulAvx512Fp32(a, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_, params_->row_);
#elif defined(ENABLE_AVX)
MatMulAvxFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_,
params_->row_);
MatMulAvxFp32(a, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_, params_->row_);
#else
MatMulOpt(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_,
OutType_Nhwc);
MatMulOpt(a, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);
#endif
}
}
return RET_OK;
}
int MatmulFp32BaseCPUKernel::init_global_variable() {
void MatmulFp32BaseCPUKernel::InitGlobalVariable() {
matrix_a_.need_pack = true;
matrix_b_.need_pack = true;
#ifdef ENABLE_AVX512
matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2ColMajor : RowMajor2RowMajor;
matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col64Major : RowMajor2Row64Major;
matrix_a_.need_pack = params_->a_transpose_;
row_tile_ = C1NUM;
col_tile_ = C16NUM;
gemmCalFun = MatMulAvx512Fp32;
@@ -383,6 +333,7 @@ int MatmulFp32BaseCPUKernel::init_global_variable() {
#elif defined(ENABLE_AVX)
matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2ColMajor : RowMajor2RowMajor;
matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col32Major : RowMajor2Row32Major;
matrix_a_.need_pack = params_->a_transpose_;
row_tile_ = C1NUM;
col_tile_ = C8NUM;
gemmCalFun = MatMulAvxFp32;
@@ -404,103 +355,58 @@ int MatmulFp32BaseCPUKernel::init_global_variable() {
row_tile_ = C12NUM;
col_tile_ = C8NUM;
#endif
if (params_->col_ == 1 && !params_->a_const_) {
is_pack_ = false;
out_need_aligned_ = false;
row_tile_ = 1;
col_tile_ = 1;
matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2ColMajor : RowMajor2RowMajor;
matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2ColMajor : RowMajor2RowMajor;
}
params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
params_->col_align_ = UP_ROUND(params_->col_, col_tile_);
MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch_, params_->row_align_, RET_ERROR);
MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch_ * params_->row_align_, params_->deep_, RET_ERROR);
MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch_, params_->col_align_, RET_ERROR);
MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch_ * params_->col_align_, params_->deep_, RET_ERROR);
matrix_a_pack_size_ = a_batch_ * params_->row_align_ * params_->deep_;
matrix_b_pack_size_ = b_batch_ * params_->col_align_ * params_->deep_;
#if defined(ENABLE_AVX) || defined(ENABLE_AVX512)
col_step_ = params_->col_align_;
#else
// no alignment needed
col_step_ = params_->col_;
#endif
return RET_OK;
}
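
When col_ == 1 and matrix A is not constant, the kernel turns packing off entirely (row_tile_ = col_tile_ = 1) and runs a gemv-style loop over the unpacked inputs. A scalar sketch of that no-pack path (the shipped versions are the SIMD GemmIsNotPack/MatVecMul kernels, not this loop):

```cpp
// With col == 1, B is a plain vector and each output row is one dot product.
void GemvNoPack(const float *a, const float *b, float *c, const float *bias,
                int row, int deep) {
  for (int r = 0; r < row; ++r) {
    float acc = (bias != nullptr) ? bias[0] : 0.0f;
    for (int d = 0; d < deep; ++d) {
      acc += a[r * deep + d] * b[d];
    }
    c[r] = acc;
  }
}

int main() {
  const float a[6] = {1, 2, 3, 4, 5, 6};  // 2 x 3, row-major
  const float b[3] = {1, 1, 1};
  float c[2] = {};
  GemvNoPack(a, b, c, nullptr, 2, 3);  // c = {6, 15}
  return 0;
}
```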
int MatmulFp32BaseCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
CHECK_LESS_RETURN(out_tensors_.size(), 1);
init_global_variable();
if (matrix_a_pack_size_ < 0 || matrix_b_pack_size_ < 0) {
MS_LOG(ERROR) << "Matrix pack size is negative "
<< "matrix_a_pack_size=" << matrix_a_pack_size_ << "matrix_b_pack_size=" << matrix_b_pack_size_;
return RET_ERROR;
MS_CHECK_TRUE_MSG(in_tensors_[FIRST_INPUT]->data_type() == kNumberTypeFloat32, RET_ERROR,
"matrix-a's data type is invalid.");
MS_CHECK_TRUE_MSG(in_tensors_[SECOND_INPUT]->data_type() == kNumberTypeFloat32, RET_ERROR,
"matrix-b's data type is invalid.");
if (in_tensors_.size() == FOURTH_INPUT) {
MS_CHECK_TRUE_MSG(in_tensors_[THIRD_INPUT]->IsConst(), RET_ERROR, "matrix-c must be const when existing.");
MS_CHECK_TRUE_MSG(in_tensors_[THIRD_INPUT]->data_type() == kNumberTypeFloat32, RET_ERROR,
"matrix-c's data type is invalid.");
}
auto ret = InitBiasData();
if (ret != RET_OK) {
MS_LOG(ERROR) << "InitBiasData failed";
return ret;
}
auto ret = InitParameter();
MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "Init parameters failed.");
if (params_->a_const_) {
auto a_tensor = in_tensors_[0];
CHECK_NULL_RETURN(a_tensor);
if (InitBufferA() != RET_OK) {
return RET_ERROR;
}
#ifdef SERVER_INFERENCE
if (a_is_packed_ != lite::PACKED) {
#endif
ret = InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data()));
if (ret != RET_OK) {
MS_LOG(ERROR) << "InitMatrixA failed!";
return ret;
}
#ifdef SERVER_INFERENCE
}
#endif
ret = PackMatrixA();
MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "pack const-matrix a failed.");
matrix_a_.has_packed = true;
}
if (params_->b_const_) {
auto b_tensor = in_tensors_[1];
CHECK_NULL_RETURN(b_tensor);
if (InitBufferB() != RET_OK) {
return RET_ERROR;
ret = PackMatrixB();
MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "pack const-matrix b failed.");
matrix_b_.has_packed = true;
}
if (!InferShapeDone()) {
if (in_tensors_.size() == FOURTH_INPUT && !op_parameter_->is_train_session_) {
ret = BackupConstMatrix(&matrix_c_, THIRD_INPUT);
MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "backup matrix-c failed.");
}
#ifdef SERVER_INFERENCE
if (b_is_packed_ != lite::PACKED) {
#endif
if (InitMatrixB(static_cast<float *>(b_tensor->data())) != RET_OK) {
MS_LOG(ERROR) << "InitMatrixB failed!";
return RET_ERROR;
}
#ifdef SERVER_INFERENCE
}
#endif
return RET_OK;
}
return RET_OK;
}
int MatmulFp32BaseCPUKernel::ReSize() {
ResizeParameter();
MS_CHECK_FALSE(INT_MUL_OVERFLOW(a_batch_, params_->row_), RET_ERROR);
row_num_ = a_batch_ * params_->row_;
matrix_a_pack_size_ = a_batch_ * params_->row_align_ * params_->deep_;
matrix_b_pack_size_ = b_batch_ * params_->col_align_ * params_->deep_;
if (matrix_a_pack_size_ < 0 || matrix_b_pack_size_ < 0) {
MS_LOG(ERROR) << "Matrix pack size is negative "
<< "matrix_a_pack_size=" << matrix_a_pack_size_ << "matrix_b_pack_size=" << matrix_b_pack_size_;
return RET_ERROR;
}
auto ret = InitParameter();
MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "Init parameters failed.");
if (op_parameter_->is_train_session_) {
set_workspace_size((matrix_a_pack_size_ + matrix_b_pack_size_) * static_cast<int>(sizeof(float)));
set_workspace_size((matrix_a_.pack_size + matrix_b_.pack_size) * static_cast<int>(sizeof(float)));
}
auto ret = GetThreadCuttingPolicy();
ret = GetThreadCuttingPolicy();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ThreadCuttingPolicy error!";
return ret;
}
if (!matrix_c_.has_packed) {
ret = PackBiasMatrix();
MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "pack const-matrix c failed.");
matrix_c_.has_packed = true;
}
ret = InitTmpOutBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "InitTmpOutBuffer error!";
@@ -509,16 +415,48 @@ int MatmulFp32BaseCPUKernel::ReSize() {
return RET_OK;
}
void MatmulFp32BaseCPUKernel::ResizeParameter() {
init_global_variable();
int MatmulFp32BaseCPUKernel::InitParameter() {
InitGlobalVariable();
if (params_->row_ == 1) {
vec_matmul_ = true;
row_tile_ = 1;
} else {
vec_matmul_ = false;
matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2ColMajor : RowMajor2RowMajor;
matrix_a_.need_pack = false;
}
if (params_->col_ == 1 && !params_->a_const_) {
out_need_aligned_ = false;
row_tile_ = 1;
col_tile_ = 1;
matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2ColMajor : RowMajor2RowMajor;
matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2ColMajor : RowMajor2RowMajor;
matrix_a_.need_pack = params_->a_transpose_ && params_->row_ != 1;
matrix_b_.need_pack = false;
}
params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
params_->col_align_ = UP_ROUND(params_->col_, col_tile_);
MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch_, params_->row_align_, RET_ERROR);
MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch_ * params_->row_align_, params_->deep_, RET_ERROR);
MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch_, params_->col_align_, RET_ERROR);
MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch_ * params_->col_align_, params_->deep_, RET_ERROR);
auto a_pack_size = a_batch_ * params_->row_align_ * params_->deep_;
auto b_pack_size = b_batch_ * params_->col_align_ * params_->deep_;
if ((matrix_a_.has_packed && matrix_a_.pack_size != a_pack_size) ||
(matrix_b_.has_packed && matrix_b_.pack_size != b_pack_size)) {
MS_LOG(ERROR) << "matmul don't support dynamic packing if matrix is a constant.";
return RET_ERROR;
}
matrix_a_.pack_size = a_pack_size;
matrix_b_.pack_size = b_pack_size;
#if defined(ENABLE_AVX) || defined(ENABLE_AVX512)
col_step_ = params_->col_align_;
#else
// no alignment needed
col_step_ = params_->col_;
#endif
params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
out_need_aligned_ = (out_need_aligned_ && ((params_->col_ % col_tile_) != 0));
MS_CHECK_FALSE(INT_MUL_OVERFLOW(a_batch_, params_->row_), RET_ERROR);
row_num_ = a_batch_ * params_->row_;
return RET_OK;
}
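
InitParameter derives the packed sizes from tile-aligned dimensions: row_align_ = UP_ROUND(row_, row_tile_), col_align_ = UP_ROUND(col_, col_tile_), then pack_size = batch * aligned_dim * deep_, with an overflow check at each multiply. A small demo of the arithmetic, using the generic C12NUM/C8NUM tiles as an example:

```cpp
#include <cstdio>

// UP_ROUND from nnacl: round x up to the next multiple of tile.
static int UpRound(int x, int tile) { return ((x + tile - 1) / tile) * tile; }

// Pack-size arithmetic mirroring InitParameter.
int main() {
  const int a_batch = 1, b_batch = 1, row = 50, col = 30, deep = 64;
  const int row_tile = 12, col_tile = 8;        // generic C12NUM / C8NUM tiles
  const int row_align = UpRound(row, row_tile);  // 60
  const int col_align = UpRound(col, col_tile);  // 32
  std::printf("matrix_a pack: %d floats, matrix_b pack: %d floats\n",
              a_batch * row_align * deep, b_batch * col_align * deep);
  return 0;
}
```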
int MatmulFp32BaseCPUKernel::InitTmpOutBuffer() {
@@ -541,15 +479,12 @@ int MatmulFp32BaseCPUKernel::InitTmpOutBuffer() {
}
int MatmulFp32BaseCPUKernel::GetThreadCuttingPolicy() {
if (params_->batch >= op_parameter_->thread_num_ || (params_->col_ == 1 && !params_->a_const_)) {
if (params_->batch >= op_parameter_->thread_num_ || params_->col_ == 1) {
thread_count_ = op_parameter_->thread_num_;
batch_stride_ = UP_DIV(params_->batch, thread_count_);
batch_split_ = true;
parallel_fun_ = &MatmulFp32BaseCPUKernel::ParallelRunByBatch;
} else if (CheckThreadCuttingByRow()) {
#if defined(ENABLE_AVX) || defined(ENABLE_AVX512)
is_pack_ = !params_->b_const_;
batch_split_ = true;
parallel_fun_ = &MatmulFp32BaseCPUKernel::ParallelRunByRow;
GetThreadCuttingInfoByRow();
#else
@@ -563,12 +498,9 @@ int MatmulFp32BaseCPUKernel::GetThreadCuttingPolicy() {
#else
oc_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_);
#endif
batch_split_ = false;
parallel_fun_ = &MatmulFp32BaseCPUKernel::ParallelRunByOC;
}
if (params_->col_ == 1 && !params_->a_const_) {
is_pack_ = false;
batch_split_ = true;
parallel_fun_ = &MatmulFp32BaseCPUKernel::ParallelRunIsNotPackByBatch;
if (params_->deep_ == 1) {
gemmIsNotPackFun = GemmIsNotPack;
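
GetThreadCuttingPolicy prefers giving each thread whole batches; only when there are fewer batches than threads does it fall back to cutting rows (on AVX/AVX512/ARM64, subject to CheckThreadCuttingByRow) or output columns. A rough sketch of that decision, with the extra row-cut conditions collapsed into one flag:

```cpp
enum class CutPolicy { ByBatch, ByRow, ByOC };

// Rough mirror of GetThreadCuttingPolicy: hand out whole batches when there
// are enough of them (or when col == 1); otherwise split rows where the
// target supports it, and output columns as the fallback.
CutPolicy ChoosePolicy(int batch, int thread_num, int col, bool row_cut_ok) {
  if (batch >= thread_num || col == 1) return CutPolicy::ByBatch;
  if (row_cut_ok) return CutPolicy::ByRow;  // AVX/AVX512/ARM64 fast path
  return CutPolicy::ByOC;
}

int main() {
  // Four batches on eight threads: not enough batches, so cut by row.
  return ChoosePolicy(4, 8, 128, true) == CutPolicy::ByRow ? 0 : 1;
}
```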
@@ -640,56 +572,21 @@ int MatmulFp32BaseCPUKernel::Run() {
if (!out_need_aligned_) {
output_data_ = out_data;
}
if (!params_->b_const_) {
auto b_ptr = reinterpret_cast<float *>(in_tensors_[1]->data());
CHECK_NULL_RETURN(b_ptr);
if (!is_pack_) {
b_pack_ptr_ = b_ptr;
} else {
if (InitBufferB() != RET_OK) {
return RET_ERROR;
}
auto ret = InitMatrixB(b_ptr);
if (ret != RET_OK) {
MS_LOG(ERROR) << "InitMatrixB failed!";
return ret;
}
}
}
if (!params_->a_const_) {
auto a_ptr = reinterpret_cast<float *>(in_tensors_[0]->data());
CHECK_NULL_RETURN(a_ptr);
if (is_pack_ || (params_->a_transpose_ && params_->deep_ != 1)) {
if (InitBufferA() != RET_OK) {
return RET_ERROR;
}
auto ret = InitMatrixA(a_ptr);
if (ret != RET_OK) {
MS_LOG(ERROR) << "InitMatrixA failed!";
return ret;
}
} else {
a_pack_ptr_ = a_ptr;
}
auto ret = PackMatrixA();
MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "pack const-matrix a failed.");
}
if (!params_->b_const_) {
auto ret = PackMatrixB();
MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "pack const-matrix b failed.");
}
MS_CHECK_TRUE_MSG(matrix_a_.pack_ptr != nullptr, RET_ERROR, "matrix-a pack ptr is a nullptr.");
MS_CHECK_TRUE_MSG(matrix_b_.pack_ptr != nullptr, RET_ERROR, "matrix-b pack ptr is a nullptr.");
if (batch_split_) {
auto ret = ParallelLaunch(this->ms_context_, MatmulRun, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulRun failed in split by batch";
return ret;
}
} else {
for (int i = 0; i < params_->batch; ++i) {
batch_a_ptr_ = a_pack_ptr_ + a_offset_[i] * params_->row_align_ * params_->deep_;
batch_b_ptr_ = b_pack_ptr_ + b_offset_[i] * params_->deep_ * params_->col_align_;
batch_c_ptr_ = output_data_ + i * params_->row_ * col_step_;
auto ret = ParallelLaunch(this->ms_context_, MatmulRun, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulRun failed in split by oc";
return ret;
}
}
auto ret = ParallelLaunch(this->ms_context_, MatmulRun, this, thread_count_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "MatmulRun failed in split by batch";
return ret;
}
if (out_need_aligned_) {
@@ -698,20 +595,12 @@ int MatmulFp32BaseCPUKernel::Run() {
output_data_ = nullptr;
}
if (!params_->a_const_) {
if (is_pack_ || (params_->a_transpose_ && params_->deep_ != 1)) {
FreeResizeBufA();
} else {
a_pack_ptr_ = nullptr;
}
FreePackedMatrixA();
}
if (!params_->b_const_) {
if (is_pack_) {
FreeResizeBufB();
} else {
b_pack_ptr_ = nullptr;
}
FreePackedMatrixB();
}
return RET_OK;
} // namespace mindspore::kernel
}
} // namespace mindspore::kernel

mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h

@@ -44,37 +44,46 @@ class MatmulFp32BaseCPUKernel : public InnerKernel {
const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx)
: InnerKernel(parameter, inputs, outputs, ctx) {
params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
vec_matmul_ = false;
}
~MatmulFp32BaseCPUKernel() override;
int Prepare() override;
int ReSize() override;
int Run() override;
using ParallelRun = int (MatmulFp32BaseCPUKernel::*)(int task_id) const;
ParallelRun parallel_fun_ = nullptr;
private:
struct MatrixInfo {
bool need_pack;
bool has_packed; // only valid for a constant matrix; packing is done once throughout the process.
bool has_origin; // only valid for a constant matrix; true when shape inference failed, false again after packing.
int pack_size;
float *origin_ptr; // only valid for a constant matrix; kept in sync with 'has_origin'.
float *pack_ptr;
MatrixInfo()
: need_pack(false),
has_packed(false),
has_origin(false),
pack_size(-1),
origin_ptr(nullptr),
pack_ptr(nullptr) {}
};
#if defined(ENABLE_AVX) || defined(ENABLE_AVX512) || defined(ENABLE_ARM64)
int ParallelRunByRow(int task_id) const;
#endif
int ParallelRunByOC(int task_id) const;
int ParallelRunByBatch(int task_id) const;
int ParallelRunIsNotPackByBatch(int task_id) const;
using ParallelRun = int (MatmulFp32BaseCPUKernel::*)(int task_id) const;
ParallelRun parallel_fun_ = nullptr;
protected:
int InitBufferA();
int InitBufferB();
int InitMatrixA(const float *src_ptr) const;
int InitMatrixB(const float *src_ptr) const;
void FreeBiasBuf();
int InitBiasData();
void InitParameter();
int init_global_variable();
private:
void ResizeParameter();
void FreeResizeBufA();
void FreeResizeBufB();
int CalBroadCastBiasDataElements();
int BackupConstMatrix(MatrixInfo *matrix_info, int index);
void InitGlobalVariable();
int PackMatrixA();
int PackMatrixB();
int PackBiasMatrix();
void FreePackedMatrixA();
void FreePackedMatrixB();
int InitParameter();
int InitTmpOutBuffer();
int GetThreadCuttingPolicy();
bool CheckThreadCuttingByRow();
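
The MatrixInfo struct introduced above replaces the previous scatter of a_pack_ptr_/b_pack_ptr_/bias_ptr_ and their status flags with one record per matrix. A sketch of the intended lifecycle for a constant matrix, with a local mirror of the struct and a plain memcpy standing in for real packing:

```cpp
#include <cstdlib>
#include <cstring>

// Local mirror of the MatrixInfo fields used below (not the kernel's type).
struct MatrixInfo {
  bool need_pack = false;
  bool has_packed = false;
  bool has_origin = false;
  int pack_size = -1;
  float *origin_ptr = nullptr;
  float *pack_ptr = nullptr;
};

// One-shot packing for a constant matrix: pack from the backed-up origin if
// shape inference was deferred, then drop the backup.
void PackConstOnce(MatrixInfo *m, const float *tensor_data, int pack_size) {
  if (m->has_packed) return;  // constants are packed exactly once
  m->pack_size = pack_size;
  m->pack_ptr = static_cast<float *>(std::malloc(pack_size * sizeof(float)));
  if (m->pack_ptr == nullptr) return;
  const float *src = m->has_origin ? m->origin_ptr : tensor_data;
  std::memcpy(m->pack_ptr, src, pack_size * sizeof(float));  // stand-in for packing
  if (m->has_origin) {
    std::free(m->origin_ptr);
    m->origin_ptr = nullptr;
    m->has_origin = false;
  }
  m->has_packed = true;
}

int main() {
  float w[4] = {1, 2, 3, 4};
  MatrixInfo m;
  m.need_pack = true;
  PackConstOnce(&m, w, 4);
  PackConstOnce(&m, w, 4);  // no-op: already packed
  std::free(m.pack_ptr);
  return 0;
}
```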
@@ -82,12 +91,6 @@ class MatmulFp32BaseCPUKernel : public InnerKernel {
protected:
MatMulParameter *params_ = nullptr;
float *a_pack_ptr_ = nullptr;
float *b_pack_ptr_ = nullptr;
#ifdef SERVER_INFERENCE
lite::PackStatus a_is_packed_ = lite::MALLOC;
lite::PackStatus b_is_packed_ = lite::MALLOC;
#endif
int a_batch_ = 1;
int b_batch_ = 1;
std::vector<int> a_offset_;
@@ -99,18 +102,7 @@ class MatmulFp32BaseCPUKernel : public InnerKernel {
int batch_stride_ = 0;
int oc_stride_ = 0;
int thread_count_ = 0;
bool vec_matmul_ = false;
float *bias_ptr_ = nullptr;
float *batch_a_ptr_ = nullptr;
float *batch_b_ptr_ = nullptr;
float *batch_c_ptr_ = nullptr;
float *output_data_ = nullptr;
int matrix_a_pack_size_ = -1;
int matrix_b_pack_size_ = -1;
MatrixPackFun matrix_a_pack_fun_ = nullptr;
MatrixPackFun matrix_b_pack_fun_ = nullptr;
bool batch_split_ = false;
bool is_pack_ = true;
bool out_need_aligned_ = false;
int col_step_ = 0;
#if defined(ENABLE_AVX) || defined(ENABLE_AVX512)
@@ -120,6 +112,11 @@ class MatmulFp32BaseCPUKernel : public InnerKernel {
GemmIsNotPackFun gemmIsNotPackFun = nullptr;
int row_num_;
std::vector<int> row_split_points_;
MatrixInfo matrix_a_;
MatrixInfo matrix_b_;
MatrixInfo matrix_c_;
MatrixPackFun matrix_a_pack_fun_ = nullptr;
MatrixPackFun matrix_b_pack_fun_ = nullptr;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_