From 7befa6baa45a64d152dd9497fd0aeec77073b5f0 Mon Sep 17 00:00:00 2001
From: wangpingan2
Date: Mon, 27 Feb 2023 14:44:03 +0800
Subject: [PATCH] support matmul fp32 pack.
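
The offline weight-packing path previously covered only dynamic-quant int8
matmul; this patch extends it to fp32 matmul:

- nnacl: add kMatmulFp32BaseCpu and kMatmulFp32Arm64Cpu to MatmulType so the
  converter and runtime can tell which fp32 packing layout a kernel produced;
  kernels whose packed layout is not serialized (ARM32/AVX/AVX512/SSE) mark
  themselves kNotImplemented.
- litert: add a virtual LiteKernel::PreparePackedWeight() hook. The fp32
  matmul kernel uses it to skip packing matrix B when the weight tensor
  already holds packed data; the dynamic int8 kernel receives its
  weight-sums tensor through the same hook.
- runtime_packed_node_pass: parse packed fp32 custom nodes, recover the
  original row-major weight (MatmulFp32BaseCpu) when the stored layout does
  not match the current backend, and restrict weight-sums handling to
  dynamic-quant nodes.
- converter: ReplaceMatMulFusionToCustom now serializes the packed fp32
  matrix B as well; the weight-sums plumbing moves into a separate
  AddWeightSumsToInputs() helper.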
---
 .../cpu/kernel/nnacl/matmul_parameter.h       |   2 +
 .../src/litert/kernel/cpu/fp32/matmul_fp32.cc |   5 +
 .../src/litert/kernel/cpu/fp32/matmul_fp32.h  |   3 +
 .../kernel/cpu/fp32/matmul_fp32_arm32.h       |   4 +-
 .../kernel/cpu/fp32/matmul_fp32_arm64.cc      |   2 +-
 .../kernel/cpu/fp32/matmul_fp32_arm64.h       |   4 +-
 .../litert/kernel/cpu/fp32/matmul_fp32_avx.h  |   4 +-
 .../kernel/cpu/fp32/matmul_fp32_avx512.h      |   4 +-
 .../kernel/cpu/fp32/matmul_fp32_base.cc       |   9 +-
 .../litert/kernel/cpu/fp32/matmul_fp32_base.h |   6 +
 .../litert/kernel/cpu/fp32/matmul_fp32_sse.h  |   4 +-
 .../cpu/int8/matmul_dynamic_base_int8.cc      |   6 +
 .../cpu/int8/matmul_dynamic_base_int8.h       |   5 +-
 mindspore/lite/src/litert/lite_kernel.h       |   2 +
 .../src/litert/runtime_packed_node_pass.cc    | 118 +++++++++++----
 .../src/litert/runtime_packed_node_pass.h     |   4 +-
 .../tools/converter/converter_packed_node.cc  | 136 +++++++++++-------
 .../converter/offline_packing_optimizer.cc    |   5 +-
 18 files changed, 229 insertions(+), 94 deletions(-)

diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/matmul_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/matmul_parameter.h
index c5f7df341cc..525df9f7870 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/matmul_parameter.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/matmul_parameter.h
@@ -41,6 +41,8 @@ typedef enum MatmulType {
   kMatmulInt8Cpu,
   kMatmulDynamicInt8Cpu,
   kMatmulDynamicSdotInt8Cpu,
+  kMatmulFp32BaseCpu,
+  kMatmulFp32Arm64Cpu,
 } MatmulType;
 
 typedef struct MatMulParameter {
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32.cc
index 715eeab2c41..0a369ecebc4 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32.cc
@@ -111,5 +111,10 @@ MatmulFp32BaseCPUKernel *CreateMatmulFp32CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
   return kernel;
 }
 
+int MatmulCPUKernel::PreparePackedWeight(const lite::Tensor *tensor) {
+  matmul_base_->SetWeightIsPacked(true);
+  return RET_OK;
+}
+
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMulFusion, LiteKernelCreator<MatmulCPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32.h
index 525430f19c4..9be0e3ac764 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32.h
@@ -88,6 +88,9 @@ class MatmulCPUKernel : public LiteKernel {
   int ReSize() override;
   int Run() override;
 
+  int PreparePackedWeight(const lite::Tensor *tensor) override;
+  MatmulFp32BaseCPUKernel *GetMatmulBase() const { return matmul_base_; }
+
  private:
   MatmulFp32BaseCPUKernel *matmul_base_ = nullptr;
 };
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm32.h
index a73090b913f..95d0200cad1 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm32.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm32.h
@@ -24,7 +24,9 @@ class MatmulFp32ARM32CPUKernel : public MatmulFp32BaseCPUKernel {
  public:
   MatmulFp32ARM32CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                            const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx)
-      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {}
+      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {
+    params_->matmul_type_ = kNotImplemented;
+  }
   ~MatmulFp32ARM32CPUKernel() = default;
 
   void InitGlobalVariable() override;
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm64.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm64.cc
index a4d247bc185..a0aaddff73e 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm64.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm64.cc
@@ -28,7 +28,7 @@ constexpr int64_t kPackAMinUnitNum = 1 << 13;
 }  // namespace
 void MatmulFp32ARM64CPUKernel::InitGlobalVariable() {
   matrix_a_.need_pack = true;
-  matrix_b_.need_pack = true;
+  matrix_b_.need_pack = !weight_is_packed_;
   matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row12MajorParallel : RowMajor2Col12MajorParallel;
   matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col8MajorParallel : RowMajor2Row8MajorParallel;
   pack_opt_ = true;
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm64.h b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm64.h
index 3910b33aac1..021ba6e2beb 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm64.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_arm64.h
@@ -25,7 +25,9 @@ class MatmulFp32ARM64CPUKernel : public MatmulFp32BaseCPUKernel {
  public:
   MatmulFp32ARM64CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                            const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx)
-      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {}
+      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {
+    params_->matmul_type_ = kMatmulFp32Arm64Cpu;
+  }
   ~MatmulFp32ARM64CPUKernel() = default;
 
   void InitGlobalVariable() override;
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_avx.h b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_avx.h
index dbd40c0f897..0978df91812 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_avx.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_avx.h
@@ -25,7 +25,9 @@ class MatmulFp32AVXCPUKernel : public MatmulFp32BaseCPUKernel {
  public:
   MatmulFp32AVXCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                          const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx)
-      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {}
+      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {
+    params_->matmul_type_ = kNotImplemented;
+  }
   ~MatmulFp32AVXCPUKernel() = default;
 
   void InitGlobalVariable() override;
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_avx512.h b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_avx512.h
index 7826bc0656c..8bc1afbf888 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_avx512.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_avx512.h
@@ -32,7 +32,9 @@ class MatmulFp32AVX512CPUKernel : public MatmulFp32BaseCPUKernel {
  public:
   MatmulFp32AVX512CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                             const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx)
-      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {}
+      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {
+    params_->matmul_type_ = kNotImplemented;
+  }
   ~MatmulFp32AVX512CPUKernel() = default;
 
   void InitGlobalVariable() override;
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_base.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_base.cc
index cc695eca60d..1d80a908495 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_base.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_base.cc
@@ -60,6 +60,9 @@ MatmulFp32BaseCPUKernel::~MatmulFp32BaseCPUKernel() {
     }
   }
   if (params_->b_const_) {
+    if (!matrix_b_.need_pack && weight_is_packed_) {
+      return;
+    }
     if (is_sharing_pack_) {
       lite::PackWeightManager::GetInstance()->Free(matrix_b_.pack_ptr);
     } else {
@@ -70,7 +73,7 @@
 
 void MatmulFp32BaseCPUKernel::InitGlobalVariable() {
   matrix_a_.need_pack = true;
-  matrix_b_.need_pack = true;
+  matrix_b_.need_pack = !weight_is_packed_;
   matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row12MajorParallel : RowMajor2Col12MajorParallel;
   matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col8MajorParallel : RowMajor2Row8MajorParallel;
   row_tile_ = C12NUM;
@@ -239,6 +242,10 @@ int MatmulFp32BaseCPUKernel::PackMatrixB() {
         reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_b_.pack_size * sizeof(float)));
     }
   } else {
+    if (!matrix_b_.need_pack && weight_is_packed_) {
+      matrix_b_.pack_ptr = reinterpret_cast<float *>(in_tensors_[SECOND_INPUT]->data());
+      return RET_OK;
+    }
     bool is_packed = false;
     void *data = nullptr;
     if (is_sharing_pack_) {
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_base.h b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_base.h
index f2b43408308..875a33f567c 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_base.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_base.h
@@ -39,6 +39,7 @@ class MatmulFp32BaseCPUKernel : public LiteKernel {
                           const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx)
       : LiteKernel(parameter, inputs, outputs, ctx) {
     params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
+    params_->matmul_type_ = kMatmulFp32BaseCpu;
   }
   ~MatmulFp32BaseCPUKernel() override;
   int Prepare() override;
@@ -72,6 +73,10 @@ class MatmulFp32BaseCPUKernel : public LiteKernel {
   virtual int ParallelRunByRow1Deep1GEPDOT(int task_id) const { return RET_ERROR; }
   virtual int GetThreadCuttingPolicy();
 
+  const float *GetPackBPtr() const { return matrix_b_.pack_ptr; }
+  const int GetBBatch() const { return b_batch_; }
+  void SetWeightIsPacked(bool weight_is_packed) { this->weight_is_packed_ = weight_is_packed; }
+
  public:
   struct MatrixInfo {
     bool need_pack{false};
@@ -140,6 +145,7 @@ class MatmulFp32BaseCPUKernel : public LiteKernel {
   float *conv1x1_origin_weight_ = nullptr;
   float *conv1x1_origin_bias_ = nullptr;
   bool is_sharing_pack_ = true;
+  bool weight_is_packed_{false};
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_MATMUL_FP32_BASE_H_
diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_sse.h b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_sse.h
index 526b88aa8be..5484336b850 100644
--- a/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_sse.h
+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/matmul_fp32_sse.h
@@ -25,7 +25,9 @@ class MatmulFp32SSECPUKernel : public MatmulFp32BaseCPUKernel {
  public:
   MatmulFp32SSECPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                          const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx)
-      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {}
+      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {
+    params_->matmul_type_ = kNotImplemented;
+  }
   ~MatmulFp32SSECPUKernel() = default;
 
   void InitGlobalVariable() override;
diff --git a/mindspore/lite/src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.cc b/mindspore/lite/src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.cc
index 5686d4dfc16..d3e6d0b4c22 100644
--- a/mindspore/lite/src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.cc
+++ b/mindspore/lite/src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.cc
@@ -433,4 +433,10 @@ int MatmulDynamicBaseInt8CPUKernel::InitBroadcastParams(const std::vector<int> &
   return RET_OK;
 }
+
+int MatmulDynamicBaseInt8CPUKernel::PreparePackedWeight(const lite::Tensor *tensor) {
+  weight_is_packed_ = true;
+  weight_sums_tensor_ = tensor;
+  return RET_OK;
+}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.h b/mindspore/lite/src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.h
index 90e67a21878..1754289ff1e 100644
--- a/mindspore/lite/src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.h
+++ b/mindspore/lite/src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.h
@@ -45,8 +45,7 @@ class MatmulDynamicBaseInt8CPUKernel : public LiteKernel {
   const int8_t *GetPackBPtr() const { return pack_b_ptr_; }
   const int *GetWeightSums() const { return weight_sums_; }
   const int GetBBatch() const { return b_batch_; }
-  void SetWeightIsPacked(bool weight_is_packed) { this->weight_is_packed_ = weight_is_packed; }
-  void SetWeightSumsTensor(lite::Tensor *weight_sums_tensor) { this->weight_sums_tensor_ = weight_sums_tensor; }
+  int PreparePackedWeight(const lite::Tensor *tensor) override;
 
  private:
   void ResizeMatrixBParameter();
@@ -97,7 +96,7 @@ class MatmulDynamicBaseInt8CPUKernel : public LiteKernel {
   bool enable_fp16_ = false;
   PackFunc b_pack_func_ = nullptr;
   bool weight_is_packed_ = false;
-  lite::Tensor *weight_sums_tensor_ = nullptr;
+  const lite::Tensor *weight_sums_tensor_ = nullptr;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/litert/lite_kernel.h b/mindspore/lite/src/litert/lite_kernel.h
index 3268e970d60..b96acf571d7 100644
--- a/mindspore/lite/src/litert/lite_kernel.h
+++ b/mindspore/lite/src/litert/lite_kernel.h
@@ -182,6 +182,8 @@ class MS_API LiteKernel : public Abstractkernel {
   }
   bool ws_allocated_ = false;
 
+  virtual int PreparePackedWeight(const lite::Tensor *tensor) { return mindspore::lite::RET_OK; }
+
 protected:
   virtual int UpdateThreadNumProcess(int32_t kernel_type, int64_t per_unit_load_num, int64_t per_unit_store_num,
                                      int64_t unit_num);
diff --git a/mindspore/lite/src/litert/runtime_packed_node_pass.cc b/mindspore/lite/src/litert/runtime_packed_node_pass.cc
index e70f69b74bc..03469282831 100644
--- a/mindspore/lite/src/litert/runtime_packed_node_pass.cc
+++ b/mindspore/lite/src/litert/runtime_packed_node_pass.cc
@@ -15,7 +15,7 @@
  */
 #include "src/litert/runtime_packed_node_pass.h"
 #include "nnacl/op_base.h"
-#include "src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.h"
+#include "nnacl/matmul_parameter.h"
 
 using RecoveryWeightFunc = void (*)(void *, void *, int, int, bool);
 namespace mindspore {
@@ -75,9 +75,9 @@ void PackedNodePass::Run(Model *model, const std::vector<Tensor *> &tensors) {
       MS_LOG(ERROR) << "Custom attr error.";
       return;
     }
-    auto val_offset = schema::CreateMatMulFusion(
-      fbb, std::atoi(attr_map[kTransposeA].c_str()), std::atoi(attr_map[kTransposeB].c_str()),
-      static_cast<schema::ActivationType>(std::atoi(attr_map[kActivationType].c_str())));
+    auto val_offset =
+      schema::CreateMatMulFusion(fbb, std::stoi(attr_map[kTransposeA]), std::stoi(attr_map[kTransposeB]),
+                                 static_cast<schema::ActivationType>(std::stoi(attr_map[kActivationType])));
     auto prim_offset = schema::CreatePrimitive(fbb, schema::PrimitiveType_MatMulFusion, val_offset.o);
     fbb.Finish(prim_offset);
     void *prim = malloc(fbb.GetSize());
@@ -96,21 +96,23 @@ void PackedNodePass::Run(Model *model, const std::vector<Tensor *> &tensors) {
     }
     node->primitive_ = custom_primitive;
     pack_info->is_packed_ = true;
-    pack_info->weight_sums_index_ = node->input_indices_.back();
-    pack_info->b_batch_ = std::atoi(attr_map["b_batch"].c_str());
-    pack_info->col_ = std::atoi(attr_map["col"].c_str());
-    pack_info->deep_ = std::atoi(attr_map["deep"].c_str());
-    pack_info->col_align_ = std::atoi(attr_map["col_align"].c_str());
-    pack_info->deep_align_ = std::atoi(attr_map["deep_align"].c_str());
-    pack_info->b_transpose_ = std::atoi(attr_map[kTransposeB].c_str());
+    pack_info->b_batch_ = std::stoi(attr_map["b_batch"]);
+    pack_info->col_ = std::stoi(attr_map["col"]);
+    pack_info->deep_ = std::stoi(attr_map["deep"]);
+    pack_info->col_align_ = std::stoi(attr_map["col_align"]);
+    pack_info->deep_align_ = std::stoi(attr_map["deep_align"]);
+    pack_info->b_transpose_ = std::stoi(attr_map[kTransposeB]);
     pack_info->cpu_option_ = attr_map["cpu_option"];
     AddNodePackInfo(node->name_, pack_info);
-    node->input_indices_.pop_back();
-    node->node_type_ = schema::PrimitiveType_MatMulFusion;
-  }
+    if (node->quant_type_ == schema::QuantType_QUANT_DYNAMIC) {
+      pack_info->weight_sums_index_ = node->input_indices_.back();
+      node->input_indices_.pop_back();
+      if (!(reinterpret_cast<LiteModel *>(model)->keep_model_buf())) {
+        CopyWeightBiasSumsTensor(tensors);
+      }
+    }
 
-    if (!(reinterpret_cast<LiteModel *>(model)->keep_model_buf())) {
-      CopyWeightBiasSumsTensor(tensors);
+    node->node_type_ = schema::PrimitiveType_MatMulFusion;
   }
 }
@@ -180,12 +182,69 @@ void MatmulDynamicSdotInt8Cpu(void *src, void *dst, int row, int col, bool transpose) {
   }
 }
 
+void MatmulFp32BaseCpu(void *src, void *dst, int row, int col, bool transpose) {
+  if (!transpose) {
+    // RowMajor2Row8MajorParallel
+    auto src_r = static_cast<float *>(src);
+    auto dst_r = static_cast<float *>(dst);
+    for (int r = 0; r < row; r++) {
+      float *src_c = src_r + r * col;
+      int c = 0;
+      for (; c < col; c++) {
+        int cd8 = c / C8NUM;
+        int cm8 = c % C8NUM;
+        src_c[c] = dst_r[cd8 * C8NUM * row + r * C8NUM + cm8];
+      }
+    }
+    return;
+  }
+  // RowMajor2Col8MajorParallel
+  auto src_r = static_cast<float *>(src);
+  auto dst_r = static_cast<float *>(dst);
+  int row8 = row / C8NUM * C8NUM;
+  int col_skip = col / C4NUM * C4NUM;
+  int skip_size = C4NUM;
+
+  int ri = 0;
+  for (; ri < row8; ri += C8NUM) {
+    int ci = 0;
+    for (; ci < col_skip; ci += skip_size) {
+      float *src_c = src_r + ci;
+      float *dst_c = dst_r + ci * C8NUM;
+      for (int tr = 0; tr < C8NUM; tr++) {
+        for (int tc = 0; tc < C4NUM; tc++) {
+          src_c[tr * col + tc] = dst_c[tc * C8NUM + tr];
+        }
+      }
+    }
+    for (; ci < col; ci++) {
+      float *src_c = src_r + ci;
+      float *dst_c = dst_r + ci * C8NUM;
+      for (int i = 0; i < C8NUM; i++) {
+        src_c[i * col] = dst_c[i];
+      }
+    }
+    src_r += C8NUM * col;
+    dst_r += C8NUM * col;
+  }
+  for (; ri < row; ri++, src_r += col, dst_r++) {
+    for (int i = 0; i < col; i++) {
+      src_r[i] = dst_r[i * C8NUM];
+    }
+  }
+}
+
 RecoveryWeightFunc GetRecoveryWeightFunc(const int quant_type, const TypeId data_type, const int node_type,
                                          const std::string &cpu_option) {
   if (cpu_option == kArm64SimdDot && node_type == schema::PrimitiveType_MatMulFusion &&
       quant_type == schema::QuantType_QUANT_DYNAMIC && data_type == kNumberTypeInt8) {
     return MatmulDynamicSdotInt8Cpu;
   }
+
+  if (cpu_option == kArm64SimdDot && node_type == schema::PrimitiveType_MatMulFusion &&
+      data_type == kNumberTypeFloat32) {
+    return MatmulFp32BaseCpu;
+  }
   return nullptr;
 }
@@ -200,23 +259,26 @@ int PackedMatmulKernelExec(kernel::KernelExec *kernel_exec, const std::vector<Tensor *> &tensors) {
   auto kernel = kernel_exec->kernel();
   MS_CHECK_TRUE_MSG(kernel != nullptr, lite::RET_NULL_PTR, "kernel is nullptr.");
   auto param = reinterpret_cast<MatMulParameter *>(kernel_exec->op_parameter());
-  if (dst_tensor->data_type() != kNumberTypeInt8 || kernel->quant_type() != schema::QuantType_QUANT_DYNAMIC) {
+  if (dst_tensor->data_type() == kNumberTypeFloat32) {
+    if (param->matmul_type_ == kNotImplemented) {
+      return RecoveryPackedWeight(dst_tensor, static_cast<int>(kernel->quant_type()), dst_tensor->data_type(),
+                                  schema::PrimitiveType_MatMulFusion, pack_info);
+    }
+  }
+
+  if (dst_tensor->data_type() == kNumberTypeInt8 && param->matmul_type_ != kMatmulDynamicSdotInt8Cpu &&
+      pack_info->cpu_option_ == kArm64SimdDot) {
     return RecoveryPackedWeight(dst_tensor, static_cast<int>(kernel->quant_type()), dst_tensor->data_type(),
                                 schema::PrimitiveType_MatMulFusion, pack_info);
   }
-  if (param->matmul_type_ != kMatmulDynamicSdotInt8Cpu && pack_info->cpu_option_ == kArm64SimdDot) {
-    return RecoveryPackedWeight(dst_tensor, static_cast<int>(kernel->quant_type()), dst_tensor->data_type(),
-                                schema::PrimitiveType_MatMulFusion, pack_info);
-  }
-  auto matmul_kernel = static_cast<kernel::MatmulDynamicBaseInt8CPUKernel *>(kernel);
-  matmul_kernel->SetWeightIsPacked(true);
+  auto lite_kernel = static_cast<LiteKernel *>(kernel);
+  lite::Tensor *weight_sums = nullptr;
   auto index = static_cast<size_t>(pack_info->weight_sums_index_);
   if (index < tensors.size()) {
-    matmul_kernel->SetWeightSumsTensor(tensors.at(index));
+    weight_sums = tensors.at(index);
   }
-
-  return lite::RET_OK;
+  return lite_kernel->PreparePackedWeight(weight_sums);
 }
 
 int RecoveryPackedWeight(Tensor *weight, const int quant_type, const TypeId data_type, const int node_type,
@@ -239,6 +301,10 @@ int RecoveryPackedWeight(Tensor *weight, const int quant_type, const TypeId data_type, const int node_type,
       current_weight = static_cast<void *>(static_cast<int8_t *>(unpack_data) + i * pack_info->deep_ * pack_info->col_);
       current_b_pack = static_cast<void *>(static_cast<int8_t *>(pack_b_ptr) +
                                            i * pack_info->col_align_ * pack_info->deep_align_);
+    } else if (weight->data_type() == kNumberTypeFloat32) {
+      current_weight = static_cast<void *>(static_cast<float *>(unpack_data) + i * pack_info->deep_ * pack_info->col_);
+      current_b_pack =
+        static_cast<void *>(static_cast<float *>(pack_b_ptr) + i * pack_info->col_align_ * pack_info->deep_);
     } else {
       free(unpack_data);
      MS_LOG(ERROR) << "unsupported data type.";
diff --git a/mindspore/lite/src/litert/runtime_packed_node_pass.h b/mindspore/lite/src/litert/runtime_packed_node_pass.h
index 33369491e4a..b6169a2b554 100644
--- a/mindspore/lite/src/litert/runtime_packed_node_pass.h
+++ b/mindspore/lite/src/litert/runtime_packed_node_pass.h
@@ -28,13 +28,13 @@ namespace mindspore {
 namespace lite {
 struct PackInfo {
   bool is_packed_{false};
-  int weight_sums_index_;
+  int weight_sums_index_{-1};
   int b_batch_;
   int deep_;
   int col_;
   int deep_align_;
   int col_align_;
-  bool b_transpose_;
+  bool b_transpose_{false};
   std::string cpu_option_;
 };
diff --git a/mindspore/lite/tools/converter/converter_packed_node.cc b/mindspore/lite/tools/converter/converter_packed_node.cc
index 88a73eaa816..5a374f51a06 100644
--- a/mindspore/lite/tools/converter/converter_packed_node.cc
+++ b/mindspore/lite/tools/converter/converter_packed_node.cc
@@ -21,6 +21,7 @@
 #include "tools/converter/offline_packing_optimizer.h"
 #include "src/litert/kernel/cpu/int8/matmul_dynamic_base_int8.h"
 #include "mindspore/core/ops/op_name.h"
+#include "src/litert/kernel/cpu/fp32/matmul_fp32.h"
 
 namespace mindspore {
 namespace {
@@ -37,6 +38,27 @@ void AddCustomAttr(std::vector<std::unique_ptr<schema::AttributeT>> *attrs,
   attrs->emplace_back(std::move(attr));
 }
 
+int AddWeightSumsToInputs(const mindspore::kernel::MatmulDynamicBaseInt8CPUKernel *matmul_kernel,
+                          schema::MetaGraphT *meta_graph, const std::unique_ptr<schema::CNodeT> &cnode,
+                          size_t weight_sum_size) {
+  auto weight_sums_tensor = std::make_unique<schema::TensorT>();
+  weight_sums_tensor->nodeType = lite::NodeType_ValueNode;
+  weight_sums_tensor->format = schema::Format_NHWC;
+  weight_sums_tensor->dataType = TypeId::kNumberTypeInt32;
+  weight_sums_tensor->dims = {};
+  weight_sums_tensor->dims.emplace_back(weight_sum_size / sizeof(int));
+  weight_sums_tensor->data.resize(weight_sum_size);
+  weight_sums_tensor->name = cnode->name + "_weight_sums";
+  if (memcpy_s(weight_sums_tensor->data.data(), weight_sums_tensor->data.size(), matmul_kernel->GetWeightSums(),
+               weight_sum_size) != EOK) {
+    MS_LOG(ERROR) << "memcpy weight sums error.";
+    return RET_ERROR;
+  }
+  cnode->inputIndex.emplace_back(meta_graph->allTensors.size());
+  meta_graph->allTensors.emplace_back(std::move(weight_sums_tensor));
+  return RET_OK;
+}
+
 int ReplaceMatMulFusionToCustom(schema::MetaGraphT *meta_graph, const std::unique_ptr<schema::CNodeT> &cnode,
                                 const std::unique_ptr<schema::TensorT> &b_input, const std::string &cpu_option) {
@@ -51,65 +73,75 @@ int ReplaceMatMulFusionToCustom(schema::MetaGraphT *meta_graph, const std::unique_ptr<schema::CNodeT> &cnode,
     return RET_ERROR;
   }
   auto matmul_param = reinterpret_cast<MatMulParameter *>(param);
+  if (matmul_param->matmul_type_ == kNotImplemented) {
+    MS_LOG(ERROR) << "Unsupported matmul type, only support fp32 and dynamic quant int8.";
+    return RET_ERROR;
+  }
+
+  cnode->primitive->value.type = schema::PrimitiveType_Custom;
+  auto primitive = new (std::nothrow) schema::CustomT;
+  if (primitive == nullptr) {
+    MS_LOG(ERROR) << "new CustomT error.";
+    return RET_NULL_PTR;
+  }
+  primitive->type = kMatmulCustomType;
+
+  // activation_type
+  AddCustomAttr(&(primitive->attr), ops::kActivationType, std::to_string(matmul_param->act_type_));
+  // transpose_a
+  AddCustomAttr(&(primitive->attr), ops::kTransposeA, std::to_string(matmul_param->a_transpose_));
+  // transpose_b
+  AddCustomAttr(&(primitive->attr), ops::kTransposeB, std::to_string(matmul_param->b_transpose_));
+
+  int b_batch;
+  const void *pack_b_ptr = nullptr;
+  size_t pack_b_size;
   if (matmul_param->matmul_type_ == kMatmulDynamicSdotInt8Cpu) {
-    cnode->primitive->value.type = schema::PrimitiveType_Custom;
-    auto primitive = new (std::nothrow) schema::CustomT;
-    if (primitive == nullptr) {
-      MS_LOG(ERROR) << "new CustomT error.";
-      return RET_NULL_PTR;
-    }
-    primitive->type = kMatmulCustomType;
-
-    // activation_type
-    AddCustomAttr(&(primitive->attr), ops::kActivationType, std::to_string(matmul_param->act_type_));
-    // transpose_a
-    AddCustomAttr(&(primitive->attr), ops::kTransposeA, std::to_string(matmul_param->a_transpose_));
-    // transpose_b
-    AddCustomAttr(&(primitive->attr), ops::kTransposeB, std::to_string(matmul_param->b_transpose_));
-    // replace packed data
     auto matmul_kernel = reinterpret_cast<const kernel::MatmulDynamicBaseInt8CPUKernel *>(lite_kernel);
-    auto b_batch = matmul_kernel->GetBBatch();
-    auto pack_b_size = b_batch * matmul_param->col_align_ * matmul_param->deep_align_ * sizeof(int8_t);
-    b_input->data.resize(pack_b_size);
-    if (memcpy_s(b_input->data.data(), b_input->data.size(), matmul_kernel->GetPackBPtr(), pack_b_size) != EOK) {
-      delete primitive;
-      MS_LOG(ERROR) << "new CustomT error.";
-      return RET_ERROR;
-    }
-
-    // add weight_sums to inputs
+    b_batch = matmul_kernel->GetBBatch();
+    pack_b_size = b_batch * matmul_param->col_align_ * matmul_param->deep_align_ * sizeof(int8_t);
+    pack_b_ptr = reinterpret_cast<const void *>(matmul_kernel->GetPackBPtr());
     auto weight_sum_size = b_batch * matmul_param->col_align_ * sizeof(int);
-    auto weight_sums_tensor = std::make_unique<schema::TensorT>();
-    weight_sums_tensor->nodeType = lite::NodeType_ValueNode;
-    weight_sums_tensor->format = schema::Format_NHWC;
-    weight_sums_tensor->dataType = TypeId::kNumberTypeInt32;
-    weight_sums_tensor->dims = {};
-    weight_sums_tensor->dims.emplace_back(weight_sum_size / sizeof(int));
-    weight_sums_tensor->data.resize(weight_sum_size);
-    weight_sums_tensor->name = cnode->name + "_weight_sums";
-    if (memcpy_s(weight_sums_tensor->data.data(), weight_sums_tensor->data.size(), matmul_kernel->GetWeightSums(),
-                 weight_sum_size) != EOK) {
+    int ret = AddWeightSumsToInputs(matmul_kernel, meta_graph, cnode, weight_sum_size);
+    if (ret != RET_OK) {
       delete primitive;
-      MS_LOG(ERROR) << "new CustomT error.";
-      return RET_ERROR;
+      MS_LOG(ERROR) << "add weight sums to inputs error.";
+      return ret;
     }
-    cnode->inputIndex.emplace_back(meta_graph->allTensors.size());
-    meta_graph->allTensors.emplace_back(std::move(weight_sums_tensor));
-
-    // add scalar to attr
-    AddCustomAttr(&(primitive->attr), "b_batch", std::to_string(b_batch));
-    AddCustomAttr(&(primitive->attr), "deep", std::to_string(matmul_param->deep_));
-    AddCustomAttr(&(primitive->attr), "col", std::to_string(matmul_param->col_));
-    AddCustomAttr(&(primitive->attr), "col_align", std::to_string(matmul_param->col_align_));
-    AddCustomAttr(&(primitive->attr), "deep_align", std::to_string(matmul_param->deep_align_));
-
-    // add cpu option
-    std::string cpu_option_str = cpu_option;
-    AddCustomAttr(&(primitive->attr), "cpu_option", std::move(cpu_option_str));
-
-    cnode->primitive->value.value = primitive;
+  } else if (matmul_param->matmul_type_ == kMatmulFp32BaseCpu || matmul_param->matmul_type_ == kMatmulFp32Arm64Cpu) {
+    auto matmul_kernel = reinterpret_cast<const kernel::MatmulCPUKernel *>(lite_kernel);
+    auto matmul_kernel_base = matmul_kernel->GetMatmulBase();
+    b_batch = matmul_kernel_base->GetBBatch();
+    pack_b_size = b_batch * matmul_param->col_align_ * matmul_param->deep_ * sizeof(float);
+    pack_b_ptr = reinterpret_cast<const void *>(matmul_kernel_base->GetPackBPtr());
   }
+
+  if (pack_b_ptr == nullptr) {
+    delete primitive;
+    MS_LOG(ERROR) << "pack_b_ptr is nullptr.";
+    return RET_NULL_PTR;
+  }
+
+  // copy packed weight to meta graph
+  b_input->data.resize(pack_b_size);
+  if (memcpy_s(b_input->data.data(), b_input->data.size(), pack_b_ptr, pack_b_size) != EOK) {
+    delete primitive;
+    MS_LOG(ERROR) << "memcpy packed weight error.";
+    return RET_ERROR;
+  }
+
+  // add scalar to attr
+  AddCustomAttr(&(primitive->attr), "b_batch", std::to_string(b_batch));
+  AddCustomAttr(&(primitive->attr), "deep", std::to_string(matmul_param->deep_));
+  AddCustomAttr(&(primitive->attr), "col", std::to_string(matmul_param->col_));
+  AddCustomAttr(&(primitive->attr), "col_align", std::to_string(matmul_param->col_align_));
+  AddCustomAttr(&(primitive->attr), "deep_align", std::to_string(matmul_param->deep_align_));
+
+  // add cpu option
+  std::string cpu_option_str = cpu_option;
+  AddCustomAttr(&(primitive->attr), "cpu_option", std::move(cpu_option_str));
+
+  cnode->primitive->value.value = primitive;
   return RET_OK;
 }
diff --git a/mindspore/lite/tools/converter/offline_packing_optimizer.cc b/mindspore/lite/tools/converter/offline_packing_optimizer.cc
index 5fd8db8313a..54df2912a3a 100644
--- a/mindspore/lite/tools/converter/offline_packing_optimizer.cc
+++ b/mindspore/lite/tools/converter/offline_packing_optimizer.cc
@@ -40,6 +40,7 @@ const char
kAndroidArmCpuBackendOption[] = "ANDROID_ARM_CPU";
 mindspore::lite::InnerContext *InitInnerContextForAndroidArmCpu() {
   // if the operation use thread_pool in inner context will throw exception.
   auto inner_context = new (std::nothrow) lite::InnerContext();
   MS_CHECK_TRUE_MSG(inner_context != nullptr, nullptr, "Create InnerContext failed.");
+  inner_context->Init();
   inner_context->thread_num_ = kSingleThread;
   inner_context->instructions_ctx_.support_sdot = true;
@@ -234,10 +235,6 @@ STATUS MatmulPacking(const mindspore::CNodePtr &cnode_ptr, const FuncGraphPtr &f
   }
   op_parameter->thread_num_ = kSingleThread;
   op_parameter->quant_type_ = GetQuantType(cnode_ptr);
-  if (op_parameter->quant_type_ != schema::QuantType::QuantType_QUANT_DYNAMIC) {
-    MS_LOG(DEBUG) << "Only do pack for dynamic quant matmul operation now, skip " << cnode_ptr->fullname_with_scope();
-    return RET_OK;
-  }
   (void)snprintf(op_parameter->name_, cnode_ptr->fullname_with_scope().length() + 1, "%s",
                  cnode_ptr->fullname_with_scope().c_str());
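
--
Reviewer note (illustrative, not part of the patch): the MatmulFp32BaseCpu
recovery routine added in runtime_packed_node_pass.cc must exactly invert the
Row8Major / Col8Major layouts produced when the converter packed matrix B.
Below is a minimal standalone round-trip sketch for the transpose == false
(Row8Major) branch, assuming C8NUM == 8 and substituting a scalar packer for
nnacl's RowMajor2Row8MajorParallel; all names here are hypothetical:

  // round_trip_check.cc - compile with: g++ -std=c++14 round_trip_check.cc
  #include <cstdio>
  #include <vector>

  constexpr int C8NUM = 8;

  // Scalar stand-in for RowMajor2Row8MajorParallel: element (r, c) of the
  // row-major [row x col] source lands at (c / 8) * 8 * row + r * 8 + c % 8.
  void RowMajor2Row8Major(const float *src, float *dst, int row, int col) {
    for (int r = 0; r < row; ++r) {
      for (int c = 0; c < col; ++c) {
        dst[(c / C8NUM) * C8NUM * row + r * C8NUM + c % C8NUM] = src[r * col + c];
      }
    }
  }

  // Same index arithmetic as the patch's recovery loop:
  //   src_c[c] = dst_r[cd8 * C8NUM * row + r * C8NUM + cm8];
  void RecoverRow8Major(float *weight, const float *packed, int row, int col) {
    for (int r = 0; r < row; ++r) {
      for (int c = 0; c < col; ++c) {
        weight[r * col + c] = packed[(c / C8NUM) * C8NUM * row + r * C8NUM + c % C8NUM];
      }
    }
  }

  int main() {
    const int row = 5, col = 11;  // deliberately not multiples of 8
    const int col8 = (col + C8NUM - 1) / C8NUM * C8NUM;  // packed stride aligned to 8
    std::vector<float> weight(row * col), packed(row * col8, 0.0f), recovered(row * col);
    for (size_t i = 0; i < weight.size(); ++i) weight[i] = static_cast<float>(i);
    RowMajor2Row8Major(weight.data(), packed.data(), row, col);
    RecoverRow8Major(recovered.data(), packed.data(), row, col);
    printf("round trip %s\n", weight == recovered ? "ok" : "FAILED");
    return 0;
  }

The transpose == true (Col8Major) branch can be checked the same way. Note
that in the patch the recovery function writes the recovered weight into its
first argument and reads the packed buffer from its second.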