diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc index 175154b311a..522bf5730c3 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc @@ -18,6 +18,9 @@ #include "src/runtime/agent/npu/optimizer/npu_pass_utils.h" #include "src/lite_kernel.h" #include "nnacl/concat_parameter.h" +#include "nnacl/split_parameter.h" +#include "nnacl/pad_parameter.h" +#include "nnacl/strided_slice_parameter.h" namespace mindspore::lite { bool CheckFusion(kernel::LiteKernel *kernel) { @@ -119,7 +122,7 @@ void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) { } void UpdatePreTensors(kernel::LiteKernel *cur_kernel) { - auto tensors_vec = cur_kernel->in_tensors(); + auto tensors_vec = NPUPassUtils::GetNonConstInputs(cur_kernel); for (auto in_kernel : cur_kernel->in_kernels()) { lite::Tensor *cur_tensor = nullptr; auto in_tensor = in_kernel->in_tensors()[0]; @@ -136,6 +139,15 @@ void UpdatePreTensors(kernel::LiteKernel *cur_kernel) { } } } + // add constant inputs back + if (nodes2const_index.find(static_cast(cur_kernel->op_parameter()->type_)) != + nodes2const_index.end()) { + tensors_vec.resize(cur_kernel->in_tensors().size()); + auto const_index = nodes2const_index[static_cast(cur_kernel->op_parameter()->type_)]; + for (auto index : const_index) { + tensors_vec[index] = cur_kernel->in_tensors()[index]; + } + } cur_kernel->set_in_tensors(tensors_vec); } @@ -275,15 +287,75 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) { return RET_OK; } +int NPUFusionPass::SplitFusion(kernel::LiteKernel *kernel) { + UpdateKernel(kernel); + auto split_param = reinterpret_cast(kernel->op_parameter()); + split_param->split_dim_ = TransFormAxis(split_param->split_dim_); + return RET_OK; +} + +int NPUFusionPass::PadFusion(kernel::LiteKernel *kernel) { + UpdateKernel(kernel); + auto pad_param = reinterpret_cast(kernel->op_parameter()); + int c1 = pad_param->paddings_[6]; + int c2 = pad_param->paddings_[7]; + // 0 1 2 3 4 5 6 7 + // n n h h w w c c + // n n c c h h w w + pad_param->paddings_[6] = pad_param->paddings_[4]; + pad_param->paddings_[7] = pad_param->paddings_[5]; + pad_param->paddings_[4] = pad_param->paddings_[2]; + pad_param->paddings_[5] = pad_param->paddings_[3]; + pad_param->paddings_[2] = c1; + pad_param->paddings_[3] = c2; + return RET_OK; +} + +int NPUFusionPass::StridedSliceFusion(kernel::LiteKernel *kernel) { + // basic requirement: input is nhwc 4d + UpdateKernel(kernel); + auto param = reinterpret_cast(kernel->op_parameter()); + auto begin_tensor = kernel->in_tensors().at(1); + int *begin = reinterpret_cast(begin_tensor->data_c()); + (void)NPUPassUtils::AssistDataNHWC2NCHW(begin, 1); + auto end_tensor = kernel->in_tensors().at(2); + int *end = reinterpret_cast(end_tensor->data_c()); + NPUPassUtils::AssistDataNHWC2NCHW(end, 1); + auto stride_tensor = kernel->in_tensors().at(3); + if (kernel->in_tensors().size() == 5) { + stride_tensor = kernel->in_tensors().at(4); + } + int *stride = reinterpret_cast(stride_tensor->data_c()); + NPUPassUtils::AssistDataNHWC2NCHW(stride, 1); + param->begins_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->begins_mask_); + param->ends_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->ends_mask_); + param->ellipsisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->ellipsisMask_); + param->newAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->newAxisMask_); + param->shrinkAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(param->shrinkAxisMask_); + return RET_OK; +} + int NPUFusionPass::Run() { for (size_t i = 0; i < kernels->size(); i++) { auto kernel = (*kernels)[i]; if (CheckFusion(kernel)) { switch (kernel->Type()) { + case schema::PrimitiveType_Split: + i -= kernel->in_kernels().size(); + SplitFusion(kernel); + continue; case schema::PrimitiveType_Concat: i -= kernel->in_kernels().size(); ConcatFusion(kernel); continue; + case schema::PrimitiveType_PadFusion: + i -= kernel->in_kernels().size(); + PadFusion(kernel); + continue; + case schema::PrimitiveType_StridedSlice: + i -= kernel->in_kernels().size(); + StridedSliceFusion(kernel); + continue; case schema::PrimitiveType_AddFusion: case schema::PrimitiveType_Activation: case schema::PrimitiveType_Eltwise: diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.h b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.h index 21962639722..d5ef66560b8 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.h +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.h @@ -39,6 +39,9 @@ class NPUFusionPass : public NPUBasePass { int CommonFusion(kernel::LiteKernel *kernel); int ConcatFusion(kernel::LiteKernel *kernel); int FormatFusion(kernel::LiteKernel *kernel); + int SplitFusion(kernel::LiteKernel *kernel); + int PadFusion(kernel::LiteKernel *kernel); + int StridedSliceFusion(kernel::LiteKernel *kernel); private: std::vector *kernels; diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc index 9ca39be2c4d..2e955e7ca7d 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc @@ -23,8 +23,10 @@ namespace mindspore::lite { using kernel::KERNEL_ARCH::kNPU; enum InsertState { InsertNone, PreInsert, PostInsert, BothInsert }; std::set npu_insert_nodes = { - schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise, - schema::PrimitiveType_Activation}; + schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise, + schema::PrimitiveType_Activation, schema::PrimitiveType_Split, schema::PrimitiveType_PadFusion, + schema::PrimitiveType_StridedSlice, schema::PrimitiveType_Activation}; + // this pass goal is to minimize subgraphs generated // by inserting nchw2nhwc or nhwc2nchw before or after the operator (e.g. concat, add, etc..) together with // fusion pass. If transpose inserted are more than half of input output, we will insert remaining input @@ -44,7 +46,7 @@ std::set npu_insert_nodes = { // so we won't insert nc2nh or nh2nc when op's in kernels and out kernels contains no nc2nh or nh2nc. // This pass should be run after npu_transform_pass, which insert transpose for nchw-input-limited op like conv2d. -int GetInsertState(kernel::LiteKernel *kernel) { +int NPUInsertTransformPass::GetInsertState(kernel::LiteKernel *kernel) { // filter out irrelevant kernel if (npu_insert_nodes.find(kernel->Type()) == npu_insert_nodes.end()) { return InsertNone; @@ -52,15 +54,17 @@ int GetInsertState(kernel::LiteKernel *kernel) { // current kernel is target kernel // use out kernels to count how many out lines from current kernel + std::vector in_tensors = NPUPassUtils::GetNonConstInputs(kernel); size_t in_out_tensor_num = - kernel->in_tensors().size() + std::max(kernel->out_kernels().size(), static_cast(1)); + in_tensors.size() + + std::max(std::max(kernel->out_kernels().size(), static_cast(1)), kernel->out_tensors().size()); size_t transpose_input_num = 0; size_t transpose_output_num = 0; bool need_pre_insert = false; bool need_post_insert = false; // count number of input tensor from nc2nh and output tensor to nh2nc - for (size_t i = 0; i < kernel->in_tensors().size(); ++i) { - auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i); + for (size_t i = 0; i < in_tensors.size(); ++i) { + auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, in_tensors.at(i)); if (NPUPassUtils::IsNchw2Nhwc(in_kernel)) { transpose_input_num++; } else { @@ -81,21 +85,22 @@ int GetInsertState(kernel::LiteKernel *kernel) { // won't insert any thing if num of transpose tensor is smaller than half of total input output. // won't insert if total input output are all transpose tensor, the fusion pass will handle this. size_t transpose_tensor_num = transpose_input_num + transpose_output_num; - if (transpose_tensor_num <= in_out_tensor_num / 2 || transpose_tensor_num == in_out_tensor_num) { + if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < in_out_tensor_num || + transpose_tensor_num == in_out_tensor_num) { return InsertNone; } - + InsertState ret; if (need_pre_insert && !need_post_insert) { - return PreInsert; - } - if (need_pre_insert && need_post_insert) { - return BothInsert; - } - if (!need_pre_insert && need_post_insert) { - return PostInsert; + ret = PreInsert; + } else if (need_pre_insert && need_post_insert) { + ret = BothInsert; + } else if (!need_pre_insert && need_post_insert) { + ret = PostInsert; + } else { + ret = InsertNone; } - return InsertNone; + return ret; } int NPUInsertTransformPass::InsertNode(kernel::LiteKernel *kernel, kernel::LiteKernel *post_kernel, @@ -200,13 +205,20 @@ int NPUInsertTransformPass::InsertForOutputTensor(kernel::LiteKernel *kernel, ke int NPUInsertTransformPass::InsertPreNodes(kernel::LiteKernel *kernel, std::vector *trans_kernels) { int ret = RET_OK; - for (size_t i = 0; i < kernel->in_tensors().size(); ++i) { - auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i); + auto in_tensors = NPUPassUtils::GetNonConstInputs(kernel); + for (auto tensor : in_tensors) { + auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, tensor); if (NPUPassUtils::IsNchw2Nhwc(pre_kernel)) { continue; } // if this tensor is input of graph, pre_kernel is nullptr. - ret = InsertForInputTensor(kernel, i, pre_kernel, trans_kernels); + auto it = find(kernel->in_tensors().begin(), kernel->in_tensors().end(), tensor); + if (it == kernel->in_tensors().end()) { + MS_LOG(ERROR) << "Find in tensor index error"; + return RET_ERROR; + } + size_t index = it - kernel->in_tensors().begin(); + ret = InsertForInputTensor(kernel, index, pre_kernel, trans_kernels); if (ret != RET_OK) { MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() << " failed."; return ret; @@ -249,59 +261,63 @@ int NPUInsertTransformPass::InsertPostNodes(kernel::LiteKernel *kernel, int NPUInsertTransformPass::Run() { std::vector insert_kernels; - for (size_t i = 0; i < all_kernels_->size(); i++) { - auto kernel = (*all_kernels_)[i]; - if (kernel->desc().arch != kNPU) { - continue; - } - auto insert_state = GetInsertState(kernel); - insert_kernels.clear(); - // If the every output kernel is nhwc2nchw, insert - // modify loop index add post_kernels.size() to the next kernel in the origin vector - switch (insert_state) { - case PreInsert: { - auto ret = InsertPreNodes(kernel, &insert_kernels); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() - << " failed."; - return RET_ERROR; - } - all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); - i += insert_kernels.size(); - break; + for (int j = 0; j < 2; ++j) { + for (size_t i = 0; i < all_kernels_->size(); i++) { + auto kernel = (*all_kernels_)[i]; + if (kernel->desc().arch != kNPU) { + continue; } - case PostInsert: { - auto ret = InsertPostNodes(kernel, &insert_kernels); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed."; - return RET_ERROR; + auto insert_state = GetInsertState(kernel); + insert_kernels.clear(); + // If the every output kernel is nhwc2nchw, insert + // modify loop index add post_kernels.size() to the next kernel in the origin vector + switch (insert_state) { + case PreInsert: { + auto ret = InsertPreNodes(kernel, &insert_kernels); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() + << " failed."; + return RET_ERROR; + } + all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); + i += insert_kernels.size(); + break; } - all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); - i += insert_kernels.size(); - break; - } - case BothInsert: { - auto ret = InsertPreNodes(kernel, &insert_kernels); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() - << " failed."; - return RET_ERROR; + case PostInsert: { + auto ret = InsertPostNodes(kernel, &insert_kernels); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() + << " failed."; + return RET_ERROR; + } + all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); + i += insert_kernels.size(); + break; } - all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); - i += insert_kernels.size(); + case BothInsert: { + auto ret = InsertPreNodes(kernel, &insert_kernels); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() + << " failed."; + return RET_ERROR; + } + all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end()); + i += insert_kernels.size(); - insert_kernels.clear(); - ret = InsertPostNodes(kernel, &insert_kernels); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed."; - return RET_ERROR; + insert_kernels.clear(); + ret = InsertPostNodes(kernel, &insert_kernels); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() + << " failed."; + return RET_ERROR; + } + all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); + i += insert_kernels.size(); + break; } - all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end()); - i += insert_kernels.size(); - break; + default: + MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name(); } - default: - MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name(); } } return RET_OK; diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.h b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.h index 32cdee6699d..1a76e68be3a 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.h +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.h @@ -34,6 +34,7 @@ class NPUInsertTransformPass : public NPUBasePass { int Run() override; private: + int GetInsertState(kernel::LiteKernel *kernel); int InsertPreNodes(kernel::LiteKernel *kernel, std::vector *trans_kernels); int InsertPostNodes(kernel::LiteKernel *kernel, std::vector *trans_kernels); diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc index 802d693b775..537640e1e91 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc @@ -25,7 +25,10 @@ namespace mindspore::lite { using kernel::KERNEL_ARCH::kCPU; using kernel::KERNEL_ARCH::kNPU; - +std::unordered_map> nodes2const_index{ + {schema::PrimitiveType_Split, {1}}, + {schema::PrimitiveType_PadFusion, {1}}, + {schema::PrimitiveType_StridedSlice, {1, 2, 3}}}; kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector &in_tensors, const std::vector &out_tensors, const InnerContext *ctx, const std::string &name) { @@ -125,8 +128,8 @@ void NPUPassUtils::UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *pre_kernel, } std::copy(trans_kernels.begin(), trans_kernels.end(), std::back_inserter(cur_out_kernels)); pre_kernel->set_out_kernels(cur_out_kernels); - // For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor with - // the input tensor of trans. + // For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor + // with the input tensor of trans. pre_kernel->set_out_tensors({trans_kernels.at(0)->in_tensors().at(0)}); } @@ -158,7 +161,7 @@ void NPUPassUtils::UpdateNC2NHTransNodePostKernel(kernel::LiteKernel *kernel, ke Tensor *old_in_tensor = nullptr; // find out which input tensor of post_kernel should be updated for (size_t i = 0; i < post_in_tensors.size(); ++i) { - if (KernelInputFromKernel(post_kernel, i) == kernel) { + if (KernelInputFromKernel(post_kernel, post_in_tensors.at(i)) == kernel) { old_in_tensor = post_in_tensors.at(i); break; } @@ -219,17 +222,16 @@ bool NPUPassUtils::IsNchw2Nhwc(const kernel::LiteKernel *kernel) { } return false; } -kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index) { +kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor) { // given kernel and input tensor index, get which kernel output this tensor. // If input tensor is graph input, return nullptr. if (kernel == nullptr) { return nullptr; } - auto tensor = kernel->in_tensors().at(in_tensor_index); auto in_kernels = kernel->in_kernels(); - auto output_contain = [tensor](const kernel::LiteKernel *kernel) { + auto output_contain = [in_tensor](const kernel::LiteKernel *kernel) { auto out_tensors = kernel->out_tensors(); - return std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end(); + return std::find(out_tensors.begin(), out_tensors.end(), in_tensor) != out_tensors.end(); }; auto it = std::find_if(in_kernels.begin(), in_kernels.end(), output_contain); if (it == in_kernels.end()) { @@ -238,10 +240,57 @@ kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel return *it; } +std::vector NPUPassUtils::GetNonConstInputs(kernel::LiteKernel *kernel) { + if (kernel == nullptr) { + return std::vector{}; + } + auto type = static_cast(kernel->op_parameter()->type_); + auto it = nodes2const_index.find(type); + if (it != nodes2const_index.end()) { + auto const_input_indices = it->second; + std::vector non_const_in_tensors; + auto in_tensors = kernel->in_tensors(); + for (auto i = 0; i < in_tensors.size(); ++i) { + if (const_input_indices.find(i) == const_input_indices.end()) { + non_const_in_tensors.push_back(in_tensors[i]); + } + } + return non_const_in_tensors; + } + return kernel->in_tensors(); +} + bool NPUPassUtils::Scale4dCase(const kernel::LiteKernel *kernel) { MS_ASSERT(kernel != nullptr && kernel->op_parameter() != nullptr); auto scale_param = reinterpret_cast(kernel->op_parameter()); auto in_tensor = kernel->in_tensors().at(1); return in_tensor->shape().size() == 1 && (scale_param->axis_ == 3 || scale_param->axis_ == -1); } + +void NPUPassUtils::AssistDataNHWC2NCHW(int *data, size_t unit_size) { + MS_ASSERT(data != nullptr); + for (size_t i = 0; i < unit_size; ++i) { + int c = data[3 * unit_size + i]; + // n h w c + // n c h w + data[3 * unit_size + i] = data[2 * unit_size + i]; + data[2 * unit_size + i] = data[unit_size + i]; + data[unit_size + i] = c; + } +} + +int NPUPassUtils::MaskDataNHWC2NCHW(int mask) { + int mask_vec[4]; + for (int i = 0; i < 4; ++i) { + mask_vec[i] = (uint32_t)(mask) & (1 << i); + } + AssistDataNHWC2NCHW(mask_vec, 1); + int ret = 0; + for (int i = 0; i < 4; ++i) { + if (mask_vec[i]) { + ret += 1 << i; + } + } + return ret; +} } // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h index 7b776f903db..bab889c7a0b 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h @@ -17,9 +17,12 @@ #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ #include +#include #include +#include #include "src/lite_kernel.h" namespace mindspore::lite { +extern std::unordered_map> nodes2const_index; class NPUPassUtils { public: static kernel::LiteKernel *CreateNchw2NhwcKernel(const std::vector &in_tensors, @@ -52,8 +55,11 @@ class NPUPassUtils { static bool IsNhwc2Nchw(const kernel::LiteKernel *kernel); static bool IsNchw2Nhwc(const kernel::LiteKernel *kernel); - static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index); + static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor); + static std::vector GetNonConstInputs(kernel::LiteKernel *kernel); static bool Scale4dCase(const kernel::LiteKernel *kernel); + static void AssistDataNHWC2NCHW(int *data, size_t unit_size); + static int MaskDataNHWC2NCHW(int mask); }; } // namespace mindspore::lite #endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_ diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc index 870df98b9d8..8a2855a257a 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc @@ -14,7 +14,6 @@ * limitations under the License. */ #include "src/runtime/agent/npu/optimizer/npu_transform_pass.h" -#include #include #include "src/lite_kernel.h" #include "src/runtime/agent/npu/npu_manager.h" @@ -22,7 +21,7 @@ namespace mindspore::lite { using kernel::KERNEL_ARCH::kNPU; -static std::set npu_trans_nodes = { +std::set npu_trans_nodes = { schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_Conv2dTransposeFusion, schema::PrimitiveType_Resize, schema::PrimitiveType_MaxPoolFusion, schema::PrimitiveType_AvgPoolFusion, schema::PrimitiveType_ScaleFusion}; diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.h b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.h index 6b75c91cb2f..696855b2598 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.h +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.h @@ -16,11 +16,14 @@ #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_ #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_ + +#include #include #include "src/lite_kernel.h" #include "src/runtime/agent/npu/optimizer/npu_base_pass.h" namespace mindspore::lite { +extern std::set npu_trans_nodes; class NPUTransformPass : public NPUBasePass { public: int Run() override; diff --git a/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc b/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc index 3a3a504dd04..890d7b99fa3 100644 --- a/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc +++ b/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc @@ -31,7 +31,7 @@ int PadNPUKernel::IsSupport(const std::vector &inputs, const std } if (inputs.size() >= 2 && inputs[1]->data_c() != nullptr) { for (int i = 0; i < inputs[1]->ElementsNum(); i++) { - paddings_.push_back(static_cast(inputs[1]->data_c())[i]); + param_->paddings_[i] = static_cast(inputs[1]->data_c())[i]; } } else { MS_LOG(WARNING) << "NPU axis is attribute."; @@ -50,7 +50,7 @@ int PadNPUKernel::SetNPUInputs(const std::vector &inputs, const int size = static_cast(param_->padding_length / 2); ge::TensorDesc padding_tensor_desc(ge::Shape({size, 2}), ge::FORMAT_NCHW, ge::DT_INT32); ge::TensorPtr padding_tensor = std::make_shared(padding_tensor_desc); - padding_tensor->SetData(reinterpret_cast(paddings_.data()), 2 * size * sizeof(int)); + padding_tensor->SetData(reinterpret_cast(param_->paddings_), 2 * size * sizeof(int)); hiai_paddings_ = new hiai::op::Const(name_ + "paddings"); hiai_paddings_->set_attr_value(padding_tensor); diff --git a/mindspore/lite/src/runtime/kernel/npu/pad_npu.h b/mindspore/lite/src/runtime/kernel/npu/pad_npu.h index cb849032571..534f593f85d 100644 --- a/mindspore/lite/src/runtime/kernel/npu/pad_npu.h +++ b/mindspore/lite/src/runtime/kernel/npu/pad_npu.h @@ -39,7 +39,6 @@ class PadNPUKernel : public NPUKernel { private: hiai::op::PadV2 *op_ = nullptr; PadParameter *param_; - std::vector paddings_; hiai::op::Const *hiai_paddings_ = nullptr; hiai::op::Const *hiai_constant_ = nullptr; }; diff --git a/mindspore/lite/test/models_npu.cfg b/mindspore/lite/test/models_npu.cfg index f9033202759..e925a0fc055 100644 --- a/mindspore/lite/test/models_npu.cfg +++ b/mindspore/lite/test/models_npu.cfg @@ -77,3 +77,5 @@ ml_video_edit_img_segment_adaptise_pb2tflite.tflite 0.5 2 ml_video_edit_imitate_filter.onnx 200 hdc_mobilenet_1w_class.onnx 20 hdc_age_medium 504 +posenet_mobilenet_float_075_1_default_1.tflite 395 +nasnet_mobile.tflite 1