!13867 add pad and strided slice fusion npu

From: @zhaozhenlong Reviewed-by: Signed-off-by:
2021-03-25 08:45:42 +08:00 · 2021-03-25 08:45:42 +08:00 · 9b23952fc2
parent a78c37ebcb 32f35b1055
commit 9b23952fc2
11 changed files with 231 additions and 81 deletions
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.cc
@ -18,6 +18,9 @@
 #include "src/runtime/agent/npu/optimizer/npu_pass_utils.h"
 #include "src/lite_kernel.h"
 #include "nnacl/concat_parameter.h"
+#include "nnacl/split_parameter.h"
+#include "nnacl/pad_parameter.h"
+#include "nnacl/strided_slice_parameter.h"

 namespace mindspore::lite {
 bool CheckFusion(kernel::LiteKernel *kernel) {
@ -119,7 +122,7 @@ void NPUFusionPass::UpdatePostKernels(kernel::LiteKernel *cur_kernel) {
 }

 void UpdatePreTensors(kernel::LiteKernel *cur_kernel) {
-  auto tensors_vec = cur_kernel->in_tensors();
+  auto tensors_vec = NPUPassUtils::GetNonConstInputs(cur_kernel);
  for (auto in_kernel : cur_kernel->in_kernels()) {
    lite::Tensor *cur_tensor = nullptr;
    auto in_tensor = in_kernel->in_tensors()[0];
@ -136,6 +139,15 @@ void UpdatePreTensors(kernel::LiteKernel *cur_kernel) {
      }
    }
  }
+  // add constant inputs back
+  if (nodes2const_index.find(static_cast<schema::PrimitiveType>(cur_kernel->op_parameter()->type_)) !=
+      nodes2const_index.end()) {
+    tensors_vec.resize(cur_kernel->in_tensors().size());
+    auto const_index = nodes2const_index[static_cast<schema::PrimitiveType>(cur_kernel->op_parameter()->type_)];
+    for (auto index : const_index) {
+      tensors_vec[index] = cur_kernel->in_tensors()[index];
+    }
+  }
  cur_kernel->set_in_tensors(tensors_vec);
 }

@ -275,15 +287,75 @@ int NPUFusionPass::FormatFusion(kernel::LiteKernel *kernel) {
  return RET_OK;
 }

+// Split-specific fusion step: runs the shared UpdateKernel step on the kernel, then passes the
+// split axis stored in its SplitParameter through TransFormAxis. Always returns RET_OK.
+int NPUFusionPass::SplitFusion(kernel::LiteKernel *kernel) {
+  UpdateKernel(kernel);
+  auto *parameter = reinterpret_cast<SplitParameter *>(kernel->op_parameter());
+  const auto remapped_dim = TransFormAxis(parameter->split_dim_);
+  parameter->split_dim_ = remapped_dim;
+  return RET_OK;
+}
+
+// Pad-specific fusion step: runs the shared UpdateKernel step, then reorders the (begin, end)
+// padding pairs in PadParameter::paddings_ from N,H,W,C order to N,C,H,W order.
+// Entries 0-1 (N) are left as they are. Entries up to index 7 are accessed without a length check
+// here. Always returns RET_OK.
+int NPUFusionPass::PadFusion(kernel::LiteKernel *kernel) {
+  UpdateKernel(kernel);
+  auto *pads = reinterpret_cast<PadParameter *>(kernel->op_parameter())->paddings_;
+  // index: 0 1 2 3 4 5 6 7
+  // nhwc:  n n h h w w c c
+  // nchw:  n n c c h h w w
+  const int h_begin = pads[2];
+  const int h_end = pads[3];
+  const int w_begin = pads[4];
+  const int w_end = pads[5];
+  const int c_begin = pads[6];
+  const int c_end = pads[7];
+  pads[2] = c_begin;
+  pads[3] = c_end;
+  pads[4] = h_begin;
+  pads[5] = h_end;
+  pads[6] = w_begin;
+  pads[7] = w_end;
+  return RET_OK;
+}
+
+// StridedSlice-specific fusion step: after the shared UpdateKernel step, permutes the data of the
+// begin/end/stride input tensors and the five mask attributes from NHWC to NCHW axis order.
+// Always returns RET_OK.
+int NPUFusionPass::StridedSliceFusion(kernel::LiteKernel *kernel) {
+  // basic requirement: input is nhwc 4d
+  UpdateKernel(kernel);
+  auto *slice_param = reinterpret_cast<StridedSliceParameter *>(kernel->op_parameter());
+  // Reorders, in place, entries 1..3 of the int data held by an input tensor.
+  const auto permute_in_place = [](lite::Tensor *tensor) {
+    NPUPassUtils::AssistDataNHWC2NCHW(reinterpret_cast<int *>(tensor->data_c()), 1);
+  };
+  permute_in_place(kernel->in_tensors().at(1));  // begin
+  permute_in_place(kernel->in_tensors().at(2));  // end
+  // stride is read from input 4 when the kernel has exactly five inputs, otherwise from input 3
+  const size_t stride_index = (kernel->in_tensors().size() == 5) ? 4 : 3;
+  permute_in_place(kernel->in_tensors().at(stride_index));
+  // every mask carries one bit per axis, so each gets the same axis permutation
+  slice_param->begins_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->begins_mask_);
+  slice_param->ends_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->ends_mask_);
+  slice_param->ellipsisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->ellipsisMask_);
+  slice_param->newAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->newAxisMask_);
+  slice_param->shrinkAxisMask_ = NPUPassUtils::MaskDataNHWC2NCHW(slice_param->shrinkAxisMask_);
+  return RET_OK;
+}
+
 int NPUFusionPass::Run() {
  for (size_t i = 0; i < kernels->size(); i++) {
    auto kernel = (*kernels)[i];
    if (CheckFusion(kernel)) {
      switch (kernel->Type()) {
+        case schema::PrimitiveType_Split:
+          i -= kernel->in_kernels().size();
+          SplitFusion(kernel);
+          continue;
        case schema::PrimitiveType_Concat:
          i -= kernel->in_kernels().size();
          ConcatFusion(kernel);
          continue;
+        case schema::PrimitiveType_PadFusion:
+          i -= kernel->in_kernels().size();
+          PadFusion(kernel);
+          continue;
+        case schema::PrimitiveType_StridedSlice:
+          i -= kernel->in_kernels().size();
+          StridedSliceFusion(kernel);
+          continue;
        case schema::PrimitiveType_AddFusion:
        case schema::PrimitiveType_Activation:
        case schema::PrimitiveType_Eltwise:
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.h
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_fusion_pass.h
@ -39,6 +39,9 @@ class NPUFusionPass : public NPUBasePass {
  int CommonFusion(kernel::LiteKernel *kernel);
  int ConcatFusion(kernel::LiteKernel *kernel);
  int FormatFusion(kernel::LiteKernel *kernel);
+  int SplitFusion(kernel::LiteKernel *kernel);
+  int PadFusion(kernel::LiteKernel *kernel);
+  int StridedSliceFusion(kernel::LiteKernel *kernel);

 private:
  std::vector<kernel::LiteKernel *> *kernels;
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.cc
@ -23,8 +23,10 @@ namespace mindspore::lite {
 using kernel::KERNEL_ARCH::kNPU;
 enum InsertState { InsertNone, PreInsert, PostInsert, BothInsert };
 std::set<mindspore::schema::PrimitiveType> npu_insert_nodes = {
-  schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise,
-  schema::PrimitiveType_Activation};
+  schema::PrimitiveType_Concat,       schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise,
+  schema::PrimitiveType_Activation,   schema::PrimitiveType_Split,     schema::PrimitiveType_PadFusion,
+  schema::PrimitiveType_StridedSlice, schema::PrimitiveType_Activation};
+
 // this pass goal is to minimize subgraphs generated
 // by inserting nchw2nhwc or nhwc2nchw before or after the operator (e.g. concat, add, etc..) together with
 // fusion pass. If transpose inserted are more than half of input output, we will insert remaining input
@ -44,7 +46,7 @@ std::set<mindspore::schema::PrimitiveType> npu_insert_nodes = {
 // so we won't insert nc2nh or nh2nc when op's in kernels and out kernels contains no nc2nh or nh2nc.
 // This pass should be run after npu_transform_pass, which insert transpose for nchw-input-limited op like conv2d.

-int GetInsertState(kernel::LiteKernel *kernel) {
+int NPUInsertTransformPass::GetInsertState(kernel::LiteKernel *kernel) {
  // filter out irrelevant kernel
  if (npu_insert_nodes.find(kernel->Type()) == npu_insert_nodes.end()) {
    return InsertNone;
@ -52,15 +54,17 @@ int GetInsertState(kernel::LiteKernel *kernel) {

  // current kernel is target kernel
  // use out kernels to count how many out lines from current kernel
+  std::vector<Tensor *> in_tensors = NPUPassUtils::GetNonConstInputs(kernel);
  size_t in_out_tensor_num =
-    kernel->in_tensors().size() + std::max(kernel->out_kernels().size(), static_cast<size_t>(1));
+    in_tensors.size() +
+    std::max(std::max(kernel->out_kernels().size(), static_cast<size_t>(1)), kernel->out_tensors().size());
  size_t transpose_input_num = 0;
  size_t transpose_output_num = 0;
  bool need_pre_insert = false;
  bool need_post_insert = false;
  // count number of input tensor from nc2nh and output tensor to nh2nc
-  for (size_t i = 0; i < kernel->in_tensors().size(); ++i) {
-    auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i);
+  for (size_t i = 0; i < in_tensors.size(); ++i) {
+    auto in_kernel = NPUPassUtils::KernelInputFromKernel(kernel, in_tensors.at(i));
    if (NPUPassUtils::IsNchw2Nhwc(in_kernel)) {
      transpose_input_num++;
    } else {
@ -81,21 +85,22 @@ int GetInsertState(kernel::LiteKernel *kernel) {
  // won't insert any thing if num of transpose tensor is smaller than half of total input output.
  // won't insert if total input output are all transpose tensor, the fusion pass will handle this.
  size_t transpose_tensor_num = transpose_input_num + transpose_output_num;
-  if (transpose_tensor_num <= in_out_tensor_num / 2 || transpose_tensor_num == in_out_tensor_num) {
+  if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < in_out_tensor_num ||
+      transpose_tensor_num == in_out_tensor_num) {
    return InsertNone;
  }
-
+  InsertState ret;
  if (need_pre_insert && !need_post_insert) {
-    return PreInsert;
-  }
-  if (need_pre_insert && need_post_insert) {
-    return BothInsert;
-  }
-  if (!need_pre_insert && need_post_insert) {
-    return PostInsert;
+    ret = PreInsert;
+  } else if (need_pre_insert && need_post_insert) {
+    ret = BothInsert;
+  } else if (!need_pre_insert && need_post_insert) {
+    ret = PostInsert;
+  } else {
+    ret = InsertNone;
  }

-  return InsertNone;
+  return ret;
 }

 int NPUInsertTransformPass::InsertNode(kernel::LiteKernel *kernel, kernel::LiteKernel *post_kernel,
@ -200,13 +205,20 @@ int NPUInsertTransformPass::InsertForOutputTensor(kernel::LiteKernel *kernel, ke
 int NPUInsertTransformPass::InsertPreNodes(kernel::LiteKernel *kernel,
                                           std::vector<kernel::LiteKernel *> *trans_kernels) {
  int ret = RET_OK;
-  for (size_t i = 0; i < kernel->in_tensors().size(); ++i) {
-    auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, i);
+  auto in_tensors = NPUPassUtils::GetNonConstInputs(kernel);
+  for (auto tensor : in_tensors) {
+    auto pre_kernel = NPUPassUtils::KernelInputFromKernel(kernel, tensor);
    if (NPUPassUtils::IsNchw2Nhwc(pre_kernel)) {
      continue;
    }
    // if this tensor is input of graph, pre_kernel is nullptr.
-    ret = InsertForInputTensor(kernel, i, pre_kernel, trans_kernels);
+    auto it = find(kernel->in_tensors().begin(), kernel->in_tensors().end(), tensor);
+    if (it == kernel->in_tensors().end()) {
+      MS_LOG(ERROR) << "Find in tensor index error";
+      return RET_ERROR;
+    }
+    size_t index = it - kernel->in_tensors().begin();
+    ret = InsertForInputTensor(kernel, index, pre_kernel, trans_kernels);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name() << " failed.";
      return ret;
@ -249,59 +261,63 @@ int NPUInsertTransformPass::InsertPostNodes(kernel::LiteKernel *kernel,

 int NPUInsertTransformPass::Run() {
  std::vector<kernel::LiteKernel *> insert_kernels;
-  for (size_t i = 0; i < all_kernels_->size(); i++) {
-    auto kernel = (*all_kernels_)[i];
-    if (kernel->desc().arch != kNPU) {
-      continue;
-    }
-    auto insert_state = GetInsertState(kernel);
-    insert_kernels.clear();
-    // If the every output kernel is nhwc2nchw, insert
-    // modify loop index add post_kernels.size() to the next kernel in the origin vector
-    switch (insert_state) {
-      case PreInsert: {
-        auto ret = InsertPreNodes(kernel, &insert_kernels);
-        if (ret != RET_OK) {
-          MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name()
-                        << " failed.";
-          return RET_ERROR;
-        }
-        all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end());
-        i += insert_kernels.size();
-        break;
+  for (int j = 0; j < 2; ++j) {
+    for (size_t i = 0; i < all_kernels_->size(); i++) {
+      auto kernel = (*all_kernels_)[i];
+      if (kernel->desc().arch != kNPU) {
+        continue;
      }
-      case PostInsert: {
-        auto ret = InsertPostNodes(kernel, &insert_kernels);
-        if (ret != RET_OK) {
-          MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed.";
-          return RET_ERROR;
+      auto insert_state = GetInsertState(kernel);
+      insert_kernels.clear();
+      // If the every output kernel is nhwc2nchw, insert
+      // modify loop index add post_kernels.size() to the next kernel in the origin vector
+      switch (insert_state) {
+        case PreInsert: {
+          auto ret = InsertPreNodes(kernel, &insert_kernels);
+          if (ret != RET_OK) {
+            MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name()
+                          << " failed.";
+            return RET_ERROR;
+          }
+          all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end());
+          i += insert_kernels.size();
+          break;
        }
-        all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end());
-        i += insert_kernels.size();
-        break;
-      }
-      case BothInsert: {
-        auto ret = InsertPreNodes(kernel, &insert_kernels);
-        if (ret != RET_OK) {
-          MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name()
-                        << " failed.";
-          return RET_ERROR;
+        case PostInsert: {
+          auto ret = InsertPostNodes(kernel, &insert_kernels);
+          if (ret != RET_OK) {
+            MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name()
+                          << " failed.";
+            return RET_ERROR;
+          }
+          all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end());
+          i += insert_kernels.size();
+          break;
        }
-        all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end());
-        i += insert_kernels.size();
+        case BothInsert: {
+          auto ret = InsertPreNodes(kernel, &insert_kernels);
+          if (ret != RET_OK) {
+            MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel before kernel " << kernel->name()
+                          << " failed.";
+            return RET_ERROR;
+          }
+          all_kernels_->insert(all_kernels_->begin() + i, insert_kernels.begin(), insert_kernels.end());
+          i += insert_kernels.size();

-        insert_kernels.clear();
-        ret = InsertPostNodes(kernel, &insert_kernels);
-        if (ret != RET_OK) {
-          MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name() << " failed.";
-          return RET_ERROR;
+          insert_kernels.clear();
+          ret = InsertPostNodes(kernel, &insert_kernels);
+          if (ret != RET_OK) {
+            MS_LOG(ERROR) << "Insert nhwc2nchw kernel and nchw2nhwc kernel after kernel " << kernel->name()
+                          << " failed.";
+            return RET_ERROR;
+          }
+          all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end());
+          i += insert_kernels.size();
+          break;
        }
-        all_kernels_->insert(all_kernels_->begin() + i + 1, insert_kernels.begin(), insert_kernels.end());
-        i += insert_kernels.size();
-        break;
+        default:
+          MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name();
      }
-      default:
-        MS_LOG(DEBUG) << "Insert Nothing on kernel " << kernel->name();
    }
  }
  return RET_OK;
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.h
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_insert_transform_pass.h
@ -34,6 +34,7 @@ class NPUInsertTransformPass : public NPUBasePass {
  int Run() override;

 private:
+  int GetInsertState(kernel::LiteKernel *kernel);
  int InsertPreNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels);

  int InsertPostNodes(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *trans_kernels);
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc
@ -25,7 +25,10 @@
 namespace mindspore::lite {
 using kernel::KERNEL_ARCH::kCPU;
 using kernel::KERNEL_ARCH::kNPU;
-
+// Maps a primitive type to the input-tensor indices that GetNonConstInputs filters out as
+// constant inputs of kernels of that type.
+std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index = {
+  {schema::PrimitiveType_Split, {1}},
+  {schema::PrimitiveType_PadFusion, {1}},
+  {schema::PrimitiveType_StridedSlice, {1, 2, 3}},
+};
 kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors,
                                                        const std::vector<Tensor *> &out_tensors,
                                                        const InnerContext *ctx, const std::string &name) {
@ -125,8 +128,8 @@ void NPUPassUtils::UpdateNC2NHTransNodePreKernel(kernel::LiteKernel *pre_kernel,
  }
  std::copy(trans_kernels.begin(), trans_kernels.end(), std::back_inserter(cur_out_kernels));
  pre_kernel->set_out_kernels(cur_out_kernels);
-  // For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor with
-  // the input tensor of trans.
+  // For kernel before trans, the output tensor is used for output tensor of trans, so replace the output tensor
+  // with the input tensor of trans.
  pre_kernel->set_out_tensors({trans_kernels.at(0)->in_tensors().at(0)});
 }

@ -158,7 +161,7 @@ void NPUPassUtils::UpdateNC2NHTransNodePostKernel(kernel::LiteKernel *kernel, ke
  Tensor *old_in_tensor = nullptr;
  // find out which input tensor of post_kernel should be updated
  for (size_t i = 0; i < post_in_tensors.size(); ++i) {
-    if (KernelInputFromKernel(post_kernel, i) == kernel) {
+    if (KernelInputFromKernel(post_kernel, post_in_tensors.at(i)) == kernel) {
      old_in_tensor = post_in_tensors.at(i);
      break;
    }
@ -219,17 +222,16 @@ bool NPUPassUtils::IsNchw2Nhwc(const kernel::LiteKernel *kernel) {
  }
  return false;
 }
-kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index) {
+kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor) {
  // given kernel and input tensor index, get which kernel output this tensor.
  // If input tensor is graph input, return nullptr.
  if (kernel == nullptr) {
    return nullptr;
  }
-  auto tensor = kernel->in_tensors().at(in_tensor_index);
  auto in_kernels = kernel->in_kernels();
-  auto output_contain = [tensor](const kernel::LiteKernel *kernel) {
+  auto output_contain = [in_tensor](const kernel::LiteKernel *kernel) {
    auto out_tensors = kernel->out_tensors();
-    return std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end();
+    return std::find(out_tensors.begin(), out_tensors.end(), in_tensor) != out_tensors.end();
  };
  auto it = std::find_if(in_kernels.begin(), in_kernels.end(), output_contain);
  if (it == in_kernels.end()) {
@ -238,10 +240,57 @@ kernel::LiteKernel *NPUPassUtils::KernelInputFromKernel(const kernel::LiteKernel
  return *it;
 }

+// Returns the input tensors of `kernel` whose positions are not listed in nodes2const_index for
+// the kernel's primitive type, in their original order.
+// - kernel == nullptr: returns an empty vector.
+// - primitive type has no entry in nodes2const_index: returns all input tensors.
+std::vector<Tensor *> NPUPassUtils::GetNonConstInputs(kernel::LiteKernel *kernel) {
+  if (kernel == nullptr) {
+    return {};
+  }
+  const auto type = static_cast<schema::PrimitiveType>(kernel->op_parameter()->type_);
+  const auto it = nodes2const_index.find(type);
+  if (it == nodes2const_index.end()) {
+    return kernel->in_tensors();
+  }
+  // Bind by reference: neither the index set nor the input list needs to be copied to be read.
+  const auto &const_input_indices = it->second;
+  const auto &in_tensors = kernel->in_tensors();
+  std::vector<Tensor *> non_const_in_tensors;
+  non_const_in_tensors.reserve(in_tensors.size());
+  // size_t index avoids the signed/unsigned comparison; the set is keyed by int, hence the cast.
+  for (size_t i = 0; i < in_tensors.size(); ++i) {
+    if (const_input_indices.find(static_cast<int>(i)) == const_input_indices.end()) {
+      non_const_in_tensors.push_back(in_tensors[i]);
+    }
+  }
+  return non_const_in_tensors;
+}
+
 bool NPUPassUtils::Scale4dCase(const kernel::LiteKernel *kernel) {
  MS_ASSERT(kernel != nullptr && kernel->op_parameter() != nullptr);
  auto scale_param = reinterpret_cast<ScaleParameter *>(kernel->op_parameter());
  auto in_tensor = kernel->in_tensors().at(1);
  return in_tensor->shape().size() == 1 && (scale_param->axis_ == 3 || scale_param->axis_ == -1);
 }
+
+// Reorders `data` in place from NHWC to NCHW axis order. `data` is treated as four consecutive
+// groups of `unit_size` ints (one group per axis); group 0 (N) is left untouched and groups 1..3
+// are rotated so that the former C group becomes group 1.
+void NPUPassUtils::AssistDataNHWC2NCHW(int *data, size_t unit_size) {
+  MS_ASSERT(data != nullptr);
+  int *group1 = data + unit_size;
+  int *group2 = data + 2 * unit_size;
+  int *group3 = data + 3 * unit_size;
+  for (size_t offset = 0; offset < unit_size; ++offset) {
+    // before: n h w c
+    // after:  n c h w
+    const int channel = group3[offset];
+    group3[offset] = group2[offset];
+    group2[offset] = group1[offset];
+    group1[offset] = channel;
+  }
+}
+
+// Applies the NHWC -> NCHW axis permutation to the low four bits of `mask` (bit i belongs to
+// axis i). Bits above bit 3 are not carried into the result.
+int NPUPassUtils::MaskDataNHWC2NCHW(int mask) {
+  const auto bits = static_cast<uint32_t>(mask);
+  // Expand the four low bits into one flag per axis so the array helper can reorder them.
+  int axis_flags[4];
+  for (int axis = 0; axis < 4; ++axis) {
+    axis_flags[axis] = ((bits >> axis) & 1u) != 0 ? 1 : 0;
+  }
+  AssistDataNHWC2NCHW(axis_flags, 1);
+  // Pack the reordered flags back into a bit mask.
+  int permuted = 0;
+  for (int axis = 0; axis < 4; ++axis) {
+    if (axis_flags[axis] != 0) {
+      permuted |= 1 << axis;
+    }
+  }
+  return permuted;
+}
 }  // namespace mindspore::lite
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.h
@ -17,9 +17,12 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_
 #include <vector>
+#include <set>
 #include <string>
+#include <unordered_map>
 #include "src/lite_kernel.h"
 namespace mindspore::lite {
+extern std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index;
 class NPUPassUtils {
 public:
  static kernel::LiteKernel *CreateNchw2NhwcKernel(const std::vector<Tensor *> &in_tensors,
@ -52,8 +55,11 @@ class NPUPassUtils {
  static bool IsNhwc2Nchw(const kernel::LiteKernel *kernel);

  static bool IsNchw2Nhwc(const kernel::LiteKernel *kernel);
-  static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, size_t in_tensor_index);
+  static kernel::LiteKernel *KernelInputFromKernel(const kernel::LiteKernel *kernel, Tensor *in_tensor);
+  static std::vector<Tensor *> GetNonConstInputs(kernel::LiteKernel *kernel);
  static bool Scale4dCase(const kernel::LiteKernel *kernel);
+  static void AssistDataNHWC2NCHW(int *data, size_t unit_size);
+  static int MaskDataNHWC2NCHW(int mask);
 };
 }  // namespace mindspore::lite
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_UTILS_H_
--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.cc
@ -14,7 +14,6 @@
 * limitations under the License.
 */
 #include "src/runtime/agent/npu/optimizer/npu_transform_pass.h"
-#include <set>
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/agent/npu/npu_manager.h"
@ -22,7 +21,7 @@
 namespace mindspore::lite {
 using kernel::KERNEL_ARCH::kNPU;

-static std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = {
+std::set<mindspore::schema::PrimitiveType> npu_trans_nodes = {
  schema::PrimitiveType_Conv2DFusion,  schema::PrimitiveType_Conv2dTransposeFusion, schema::PrimitiveType_Resize,
  schema::PrimitiveType_MaxPoolFusion, schema::PrimitiveType_AvgPoolFusion,         schema::PrimitiveType_ScaleFusion};

--- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.h
+++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_transform_pass.h
@ -16,11 +16,14 @@

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_TRANSFORM_PASS_H_
+
+#include <set>
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/agent/npu/optimizer/npu_base_pass.h"

 namespace mindspore::lite {
+extern std::set<mindspore::schema::PrimitiveType> npu_trans_nodes;
 class NPUTransformPass : public NPUBasePass {
 public:
  int Run() override;
--- a/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc
+++ b/mindspore/lite/src/runtime/kernel/npu/pad_npu.cc
@ -31,7 +31,7 @@ int PadNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std
  }
  if (inputs.size() >= 2 && inputs[1]->data_c() != nullptr) {
    for (int i = 0; i < inputs[1]->ElementsNum(); i++) {
-      paddings_.push_back(static_cast<int *>(inputs[1]->data_c())[i]);
+      param_->paddings_[i] = static_cast<int *>(inputs[1]->data_c())[i];
    }
  } else {
    MS_LOG(WARNING) << "NPU axis is attribute.";
@ -50,7 +50,7 @@ int PadNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const
  int size = static_cast<int>(param_->padding_length / 2);
  ge::TensorDesc padding_tensor_desc(ge::Shape({size, 2}), ge::FORMAT_NCHW, ge::DT_INT32);
  ge::TensorPtr padding_tensor = std::make_shared<hiai::Tensor>(padding_tensor_desc);
-  padding_tensor->SetData(reinterpret_cast<uint8_t *>(paddings_.data()), 2 * size * sizeof(int));
+  padding_tensor->SetData(reinterpret_cast<uint8_t *>(param_->paddings_), 2 * size * sizeof(int));
  hiai_paddings_ = new hiai::op::Const(name_ + "paddings");
  hiai_paddings_->set_attr_value(padding_tensor);

--- a/mindspore/lite/src/runtime/kernel/npu/pad_npu.h
+++ b/mindspore/lite/src/runtime/kernel/npu/pad_npu.h
@ -39,7 +39,6 @@ class PadNPUKernel : public NPUKernel {
 private:
  hiai::op::PadV2 *op_ = nullptr;
  PadParameter *param_;
-  std::vector<int> paddings_;
  hiai::op::Const *hiai_paddings_ = nullptr;
  hiai::op::Const *hiai_constant_ = nullptr;
 };
--- a/mindspore/lite/test/models_npu.cfg
+++ b/mindspore/lite/test/models_npu.cfg
@ -77,3 +77,5 @@ ml_video_edit_img_segment_adaptise_pb2tflite.tflite 0.5 2
 ml_video_edit_imitate_filter.onnx 200
 hdc_mobilenet_1w_class.onnx 20
 hdc_age_medium 504
+posenet_mobilenet_float_075_1_default_1.tflite 395
+nasnet_mobile.tflite 1