reconstruct npu scale op and insert transpose pass
This commit is contained in:
parent f6e83e0383
commit c564c628b5
@@ -15,7 +15,7 @@
 */
 
 #include "src/delegate/npu/npu_converter_utils.h"
-#include "src/common/log_adapter.h"
+#include "src/delegate/npu/op/npu_op.h"
 namespace mindspore {
 #define C4NUM 4
 #define C8NUM 8
@@ -55,7 +55,7 @@ void Float16ToFloat32(const float16_t *__restrict input, float *__restrict outpu
 #endif
 
 ge::Shape ConverterToNPUShape(const std::vector<int64_t> &src_shape, bool is_expand_4d) {
-  vector<int64_t> shapes;
+  std::vector<int64_t> shapes;
   shapes.reserve(src_shape.size());
   for (int i = 0; i < src_shape.size(); i++) {
     shapes.push_back(src_shape[i]);
@@ -64,8 +64,7 @@ ge::Shape ConverterToNPUShape(const std::vector<int64_t> &src_shape, bool is_exp
   if (shapes.size() == 1) {
    return ge::Shape({1, shapes[0], 1, 1});
   } else {
-    const int dimension4 = 4;
-    for (int i = src_shape.size(); i < dimension4; i++) {
+    for (int i = src_shape.size(); i < NPU_SHAPE_SIZE; i++) {
      shapes.push_back(1);
    }
   }
@@ -204,23 +203,23 @@ int TransFormAxis(int axis) {
 void AssistDataNHWC2NCHW(int *data, size_t unit_size) {
   MS_ASSERT(data != nullptr);
   for (size_t i = 0; i < unit_size; ++i) {
-    int c = data[3 * unit_size + i];
+    int org_c = data[NHWC_C * unit_size + i];
     // n h w c
     // n c h w
-    data[3 * unit_size + i] = data[2 * unit_size + i];
-    data[2 * unit_size + i] = data[unit_size + i];
-    data[unit_size + i] = c;
+    data[NCHW_W * unit_size + i] = data[NHWC_W * unit_size + i];
+    data[NCHW_H * unit_size + i] = data[NHWC_H * unit_size + i];
+    data[NCHW_C * unit_size + i] = org_c;
   }
 }
 
 int MaskDataNHWC2NCHW(int mask) {
-  int mask_vec[4];
-  for (int i = 0; i < 4; ++i) {
+  int mask_vec[NPU_SHAPE_SIZE];
+  for (int i = 0; i < NPU_SHAPE_SIZE; ++i) {
     mask_vec[i] = (uint32_t)(mask) & (1 << i);
   }
   AssistDataNHWC2NCHW(mask_vec, 1);
   int ret = 0;
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < NPU_SHAPE_SIZE; ++i) {
     if (mask_vec[i]) {
       ret += 1 << i;
     }
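Note: MaskDataNHWC2NCHW above remaps a per-axis bitmask (e.g. a StridedSlice begin_mask) from NHWC bit order to NCHW bit order. A small worked example of the intended behavior (the values are illustrative, not part of the commit):

  int nhwc_mask = 0b1010;                        // bits set for H (axis 1) and C (axis 3) in NHWC
  int nchw_mask = MaskDataNHWC2NCHW(nhwc_mask);  // H moves to axis 2, C moves to axis 1 in NCHW
  // nchw_mask == 0b0110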
@@ -27,6 +27,9 @@
 #include "include/graph/op/array_defs.h"
 #include "include/api/types.h"
 #include "include/api/data_type.h"
+#include "include/graph/op/all_ops.h"
+#include "src/common/log_adapter.h"
+#include "nnacl/op_base.h"
 
 namespace mindspore {
 enum NCHW_SHAPE { NCHW_INVALID = -1, NCHW_N = 0, NCHW_C = 1, NCHW_H = 2, NCHW_W = 3 };
@@ -91,5 +94,29 @@ int TransFormAxis(int axis);
 void AssistDataNHWC2NCHW(int *data, size_t unit_size);
 
 int MaskDataNHWC2NCHW(int mask);
 
+template <typename T>
+ge::Operator *GetNPUConst(const uint8_t *const_data, const std::vector<int64_t> &shape, const ge::DataType data_type,
+                          std::string name = "const", bool is_expand_4d = false) {
+  MS_CHECK_TRUE_MSG(const_data != nullptr, nullptr, "Const data can not be nullptr.");
+  int element_num = 1;
+  if (!shape.empty()) {
+    for (size_t i = 0; i < shape.size(); i++) {
+      MS_CHECK_GT(shape.at(i), 0, nullptr);
+      MS_CHECK_INT_MUL_NOT_OVERFLOW(element_num, shape.at(i), nullptr);
+      element_num *= shape.at(i);
+    }
+  }
+  ge::TensorDesc const_tensor_desc(ConverterToNPUShape(shape, is_expand_4d), ge::FORMAT_NCHW, data_type);
+  ge::TensorPtr const_tensor = std::make_shared<hiai::Tensor>(const_tensor_desc);
+  const_tensor->SetData(const_data, element_num * sizeof(T));
+  auto const_op = new (std::nothrow) hiai::op::Const(name);
+  if (const_op == nullptr) {
+    MS_LOG(ERROR) << "New Const op failed.";
+    return const_op;
+  }
+  const_op->set_attr_value(const_tensor);
+  return const_op;
+}
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_CONVERTER_UITLS_H_
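Note: a minimal usage sketch of the new GetNPUConst helper (variable names here are illustrative, not from the commit). It wraps raw bytes in a hiai::op::Const with an NCHW tensor descriptor; the caller owns the returned operator and must delete it, which is why ScaleNPUOp tracks every constant it creates in scale_ops_:

  std::vector<int> shape_data = {2, 3};  // hypothetical payload for a DT_INT32 const
  auto data_ptr = reinterpret_cast<const uint8_t *>(shape_data.data());
  int64_t num = static_cast<int64_t>(shape_data.size());
  ge::Operator *shape_const = GetNPUConst<int>(data_ptr, {num}, ge::DT_INT32, "demo_shape_const");
  // Returns nullptr on failure; on success it can feed e.g. reshape->set_input_shape(*shape_const).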
@@ -52,12 +52,12 @@
 #include "src/delegate/npu/op/transpose_npu.h"
 #include "src/delegate/npu/op/unsqueeze_npu.h"
 #include "src/delegate/npu/op/abs_npu.h"
+#include "src/delegate/npu/op/flatten_npu.h"
 #include "src/delegate/npu/npu_graph.h"
 #include "src/delegate/delegate_utils.h"
 #include "src/delegate/npu/pass/npu_transform_pass.h"
 #include "src/delegate/npu/pass/npu_insert_transform_pass.h"
 #include "src/delegate/npu/pass/npu_fusion_pass.h"
-#include "src/delegate/npu/pass/npu_infer_format_pass.h"
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
@@ -97,13 +97,6 @@ Status NPUDelegate::AddPasses() {
     return mindspore::kLiteNullptr;
   }
   pass_manager_->AddPass(fusion_pass);
 
-  auto infer_format_pass = new (std::nothrow) NPUInferFormatPass();
-  if (infer_format_pass == nullptr) {
-    MS_LOG(ERROR) << "New NPUInferFormatPass failed.";
-    return mindspore::kLiteNullptr;
-  }
-  pass_manager_->AddPass(infer_format_pass);
-
   return mindspore::kSuccess;
 }
@@ -16,6 +16,7 @@
 
 #include "src/delegate/npu/npu_executor.h"
 #include <unordered_map>
+#include <set>
 #include "include/errorcode.h"
 #include "src/delegate/npu/npu_manager.h"
 #include "src/common/log_adapter.h"
@@ -73,7 +74,8 @@ bool IsSameShapeTensor(mindspore::MSTensor tensor, const std::shared_ptr<hiai::A
 }
 
 int NPUExecutor::Run(const std::vector<mindspore::MSTensor> &in_tensors,
-                     const std::vector<mindspore::MSTensor> &out_tensors, const std::vector<NPUOp *> &in_ops) {
+                     const std::vector<mindspore::MSTensor> &valid_out_tensors,
+                     const std::vector<mindspore::MSTensor> &all_out_tensors, const std::vector<NPUOp *> &out_ops) {
   hiai::AiContext context;
   for (size_t i = 0; i < npu_input_tensors_.size(); ++i) {
     MS_CHECK_TRUE_RET(i < input_relationship_.size() && input_relationship_.at(i) < in_tensors.size(), RET_ERROR);
@@ -97,19 +99,32 @@ int NPUExecutor::Run(const std::vector<mindspore::MSTensor> &in_tensors,
     return RET_ERROR;
   }
 
-  if (npu_output_tensors_.size() != out_tensors.size()) {
-    MS_LOG(ERROR) << "The output count is not euqal to ms tensor.";
+  // if the multi-output op is the graph out op, all of its output tensor will be treat as graph output for om model.
+  std::set<schema::PrimitiveType> multi_output_list = {schema::PrimitiveType_Split};
+  bool has_multi_output_op = false;
+  for (auto out_op : out_ops) {
+    if (std::find(multi_output_list.begin(), multi_output_list.end(), out_op->type()) != multi_output_list.end()) {
+      has_multi_output_op = true;
+      break;
+    }
+  }
+
+  if (npu_output_tensors_.size() != all_out_tensors.size() ||
+      (!has_multi_output_op && npu_output_tensors_.size() != valid_out_tensors.size())) {
+    MS_LOG(ERROR) << "The output count (" << npu_output_tensors_.size() << ") is not equal to ms tensor ("
+                  << all_out_tensors.size() << ").";
     return RET_ERROR;
   }
   for (size_t i = 0; i < npu_output_tensors_.size(); ++i) {
-    mindspore::MSTensor out_tensor = out_tensors[i];
-    auto data = out_tensor.MutableData();
-    if (data == nullptr) {
-      MS_LOG(ERROR) << "For " << model_name_ << ", the output tensor " << out_tensors[i].Name() << " data is nullptr";
-      return RET_ERROR;
+    mindspore::MSTensor out_tensor = all_out_tensors[i];
+    if (std::find(valid_out_tensors.begin(), valid_out_tensors.end(), out_tensor) != valid_out_tensors.end()) {
+      auto data = out_tensor.MutableData();
+      if (data == nullptr) {
+        MS_LOG(ERROR) << "For " << model_name_ << ", the output tensor " << out_tensor.Name() << " data is nullptr";
+        return RET_ERROR;
+      }
+      memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
     }
-
-    memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
   }
   return RET_OK;
 }
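Note: the Run() signature change splits the old out_tensors into two lists. all_out_tensors must line up one-to-one with the om model outputs: when a multi-output op such as Split is a graph output op, the om model exposes every one of its output tensors, even those the caller never asked for. valid_out_tensors holds only the tensors actually returned to the caller, and only those are copied out of the NPU buffers. The matching call site (see NPUSubGraph::Execute() further down) becomes:

  executor_->Run(inputs(), outputs(), all_tensors_from_out_ops_, out_ops_);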
@@ -33,8 +33,8 @@ class NPUExecutor {
   ~NPUExecutor();
   int Prepare();
 
-  int Run(const std::vector<mindspore::MSTensor> &in_tensors, const std::vector<mindspore::MSTensor> &out_tensors,
-          const std::vector<NPUOp *> &in_ops);
+  int Run(const std::vector<mindspore::MSTensor> &in_tensors, const std::vector<mindspore::MSTensor> &valid_out_tensors,
+          const std::vector<mindspore::MSTensor> &all_out_tensors, const std::vector<NPUOp *> &out_ops);
 
   void InitInputMappingRelationShip(const std::vector<size_t> &input_index) { input_relationship_ = input_index; }
 
@@ -37,6 +37,7 @@ NPUSubGraph::~NPUSubGraph() {
   subgraph_input_ops_.clear();
   subgraph_output_ops_.clear();
   out_tensor_sorted_.clear();
+  all_tensors_from_out_ops_.clear();
   for (auto op : op_buffer_) {
     delete op;
   }
@@ -61,11 +62,11 @@ void NPUSubGraph::set_input(mindspore::MSTensor in_tensor, int index) {
 }
 
 void NPUSubGraph::set_output(mindspore::MSTensor out_tensor, int index) {
-  MS_ASSERT(index < out_tensor_sorted_.size());
+  MS_ASSERT(index < outputs_.size());
   auto origin_tensor = outputs_[index];
-  for (size_t i = 0; i < out_tensor_sorted_.size(); i++) {
-    if (out_tensor_sorted_[i] == origin_tensor) {
-      out_tensor_sorted_[i] = out_tensor;
+  for (size_t i = 0; i < all_tensors_from_out_ops_.size(); i++) {
+    if (all_tensors_from_out_ops_[i] == origin_tensor) {
+      all_tensors_from_out_ops_[i] = out_tensor;
     }
   }
   outputs_[index] = out_tensor;
@@ -146,7 +147,7 @@ std::shared_ptr<domi::ModelBufferData> NPUSubGraph::BuildIRModel() {
   return om_model_buff;
 }
 
-int NPUSubGraph::Execute() { return executor_->Run(inputs(), out_tensor_sorted_, in_ops_); }
+int NPUSubGraph::Execute() { return executor_->Run(inputs(), outputs(), all_tensors_from_out_ops_, out_ops_); }
 
 int NPUSubGraph::BuildNPUInputOp() {
   int count = 0;
@@ -242,12 +243,9 @@ int NPUSubGraph::BuildNPUOutputOp() {
     MS_LOG(ERROR) << "Get NPU operators failed.";
     return RET_ERROR;
   }
-  out_tensor_sorted_.resize(outputs().size());
-  int i = 0;
   for (auto node : out_ops_) {
     for (const auto &tensor : node->outputs()) {
-      if (std::find(outputs().begin(), outputs().end(), tensor) != outputs().end())
-        this->out_tensor_sorted_[i++] = tensor;
+      all_tensors_from_out_ops_.emplace_back(tensor);
     }
   }
   if (subgraph_output_ops_.empty()) {
@@ -73,6 +73,8 @@ class NPUSubGraph : public kernel::Kernel {
 
   std::vector<mindspore::MSTensor> out_tensor_sorted_;
 
+  std::vector<mindspore::MSTensor> all_tensors_from_out_ops_;
+
   std::vector<ge::Operator *> op_buffer_;
 
   std::vector<NPUOp *> npu_ops_{};
@@ -31,18 +31,9 @@ int ConcatNPUOp::Init(const schema::Primitive *primitive, const std::vector<mind
     return RET_ERROR;
   }
   axis_ = concat_prim->axis();
-  return RET_OK;
-}
-
-int ConcatNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                              const std::vector<mindspore::MSTensor> &out_tensors,
-                              const std::vector<ge::Operator *> &npu_inputs) {
-  concat_->set_attr_concat_dim(axis_);
-  concat_->set_attr_N(npu_inputs.size());
-  concat_->create_dynamic_input_x(npu_inputs.size());
-  for (int i = 0; i < npu_inputs.size(); ++i) {
-    concat_->set_dynamic_input_x(i + 1, *npu_inputs[i]);
-  }
+  auto input_num = in_tensors.size();
+  concat_->set_attr_N(input_num);
+  concat_->create_dynamic_input_x(input_num);
   return RET_OK;
 }
 
@@ -51,8 +42,6 @@ int ConcatNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors
                               const std::vector<ge::Operator *> &npu_inputs,
                               const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) {
   concat_->set_attr_concat_dim(axis_);
-  concat_->set_attr_N(npu_inputs.size());
-  concat_->create_dynamic_input_x(npu_inputs.size());
   for (auto pair : index2_multi_out_index) {
     auto in_op = pair.second.first;
     MS_CHECK_TRUE_RET(in_op != nullptr, RET_ERROR);
@@ -39,10 +39,6 @@ class ConcatNPUOp : public NPUOp {
   int Init(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
            const std::vector<mindspore::MSTensor> &out_tensors) override;
 
-  int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                   const std::vector<mindspore::MSTensor> &out_tensors,
-                   const std::vector<ge::Operator *> &npu_inputs) override;
-
   int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
                    const std::vector<mindspore::MSTensor> &out_tensors, const std::vector<ge::Operator *> &npu_inputs,
                    const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) override;
@@ -93,27 +93,6 @@ int ConvolutionInt8NPUOp::Init(const schema::Primitive *primitive, const std::ve
   return RET_OK;
 }
 
-int ConvolutionInt8NPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                                       const std::vector<mindspore::MSTensor> &out_tensors,
-                                       const std::vector<ge::Operator *> &npu_inputs) {
-  auto ret = InitWeightConst(in_tensors);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Set weight and bias for convolution op " << name_ << " failed when running npu";
-    return RET_ERROR;
-  }
-  conv_->set_input_filter(*weight_);
-  if (in_tensors.size() == CONV_INPUT_SIZE) {
-    ret = InitBiasConst(in_tensors);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Set bias for convolution op " << name_ << " failed when running npu";
-      return RET_ERROR;
-    }
-    conv_->set_input_bias(*bias_);
-  }
-  conv_->set_input_x(*npu_inputs[0]);
-  return RET_OK;
-}
-
 int ConvolutionInt8NPUOp::SetNPUInputs(
     const std::vector<mindspore::MSTensor> &in_tensors, const std::vector<mindspore::MSTensor> &out_tensors,
     const std::vector<ge::Operator *> &npu_inputs,
@@ -37,10 +37,6 @@ class ConvolutionInt8NPUOp : public ConvolutionBaseNPUOp {
   int Init(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
            const std::vector<mindspore::MSTensor> &out_tensors) override;
 
-  int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                   const std::vector<mindspore::MSTensor> &out_tensors,
-                   const std::vector<ge::Operator *> &npu_inputs) override;
-
   int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
                    const std::vector<mindspore::MSTensor> &out_tensors, const std::vector<ge::Operator *> &npu_inputs,
                    const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) override;
@@ -97,27 +97,6 @@ int ConvolutionNPUOp::Init(const schema::Primitive *primitive, const std::vector
   return RET_OK;
 }
 
-int ConvolutionNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                                   const std::vector<mindspore::MSTensor> &out_tensors,
-                                   const std::vector<ge::Operator *> &npu_inputs) {
-  auto ret = InitWeightConst(in_tensors);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Set weight and bias for convolution op " << name_ << " failed when running npu";
-    return RET_ERROR;
-  }
-  conv_->set_input_filter(*weight_);
-  if (in_tensors.size() == CONV_INPUT_SIZE) {
-    ret = InitBiasConst(in_tensors);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Set bias for convolution op " << name_ << " failed when running npu";
-      return RET_ERROR;
-    }
-    conv_->set_input_bias(*bias_);
-  }
-  conv_->set_input_x(*npu_inputs[0]);
-  return RET_OK;
-}
-
 int ConvolutionNPUOp::SetNPUInputs(
     const std::vector<mindspore::MSTensor> &in_tensors, const std::vector<mindspore::MSTensor> &out_tensors,
     const std::vector<ge::Operator *> &npu_inputs,
@@ -37,10 +37,6 @@ class ConvolutionNPUOp : public ConvolutionBaseNPUOp {
   int Init(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
            const std::vector<mindspore::MSTensor> &out_tensors) override;
 
-  int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                   const std::vector<mindspore::MSTensor> &out_tensors,
-                   const std::vector<ge::Operator *> &npu_inputs) override;
-
   int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
                    const std::vector<mindspore::MSTensor> &out_tensors, const std::vector<ge::Operator *> &npu_inputs,
                    const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) override;
@@ -33,17 +33,25 @@ int EltwiseNPUOp::Init(const schema::Primitive *primitive, const std::vector<min
     return RET_ERROR;
   }
   eltwise_->set_attr_mode(ConverterToNPUEltwiseMode(eltwise_prim->mode()));
-  int size = in_tensors.size();
-  eltwise_->create_dynamic_input_x(size);
-  eltwise_->set_attr_N(size);
+  auto input_num = in_tensors.size();
+  eltwise_->create_dynamic_input_x(input_num);
+  eltwise_->set_attr_N(input_num);
   return RET_OK;
 }
 
 int EltwiseNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
                                const std::vector<mindspore::MSTensor> &out_tensors,
-                               const std::vector<ge::Operator *> &npu_inputs) {
+                               const std::vector<ge::Operator *> &npu_inputs,
+                               const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) {
+  for (auto pair : index2_multi_out_index) {
+    auto in_op = pair.second.first;
+    MS_CHECK_TRUE_RET(in_op != nullptr, RET_ERROR);
+    eltwise_->SetInput(pair.first, *in_op, pair.second.second);
+  }
   for (int i = 0; i < npu_inputs.size(); ++i) {
-    eltwise_->set_dynamic_input_x(i + 1, *npu_inputs[i]);
+    if (index2_multi_out_index.find(i) == index2_multi_out_index.end()) {
+      eltwise_->SetInput(i, *npu_inputs[i], 0);
+    }
   }
   return RET_OK;
 }
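Note: eltwise now wires inputs with ge::Operator::SetInput(dst_index, src_op, src_out_index) instead of set_dynamic_input_x, which lets it consume a specific output port of a multi-output producer. A sketch of how the index2_multi_out_index map is read (the Split example is illustrative):

  // An entry {1, {split_op, 2}} means: eltwise input port 1 is fed by output port 2 of split_op.
  // Input ports without an entry fall back to port 0 of the matching npu_inputs operator:
  //   eltwise_->SetInput(i, *npu_inputs[i], 0);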
@@ -18,6 +18,8 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ELTWISE_NPU_H_
 #include <vector>
 #include <string>
+#include <utility>
+#include <unordered_map>
 #include "include/graph/op/all_ops.h"
 #include "src/delegate/npu/op/npu_op.h"
 
@@ -39,8 +41,8 @@ class EltwiseNPUOp : public NPUOp {
                 const std::vector<mindspore::MSTensor> &out_tensors) override;
 
   int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                   const std::vector<mindspore::MSTensor> &out_tensors,
-                   const std::vector<ge::Operator *> &npu_inputs) override;
+                   const std::vector<mindspore::MSTensor> &out_tensors, const std::vector<ge::Operator *> &npu_inputs,
+                   const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) override;
 
   ge::Operator *GetNPUOp() override;
 
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/delegate/npu/op/flatten_npu.h"
+#include "include/graph/op/all_ops.h"
+#include "src/delegate/npu/npu_converter_utils.h"
+
+namespace mindspore {
+int FlattenNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
+                            const std::vector<mindspore::MSTensor> &out_tensors) {
+  if (out_tensors.at(0).Shape().size() != C2NUM) {
+    MS_LOG(WARNING) << "The output tensor can only be flatten to 2 dimension.";
+    return RET_NOT_SUPPORT;
+  }
+  return RET_OK;
+}
+
+int FlattenNPUOp::Init(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
+                       const std::vector<mindspore::MSTensor> &out_tensors) {
+  flatten_ = new (std::nothrow) hiai::op::Flatten(name_);
+  if (flatten_ == nullptr) {
+    MS_LOG(ERROR) << name_ << " op is nullptr";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int FlattenNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
+                               const std::vector<mindspore::MSTensor> &out_tensors,
+                               const std::vector<ge::Operator *> &npu_inputs) {
+  flatten_->set_input_x(*npu_inputs[0]);
+  return RET_OK;
+}
+
+ge::Operator *FlattenNPUOp::GetNPUOp() { return this->flatten_; }
+
+FlattenNPUOp::~FlattenNPUOp() {
+  if (flatten_ != nullptr) {
+    delete flatten_;
+    flatten_ = nullptr;
+  }
+}
+}  // namespace mindspore
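Note: the 2-D restriction in IsSupport above matches the usual Flatten semantics (keep the first dimension, collapse the rest into one), so only outputs representable as [N, C*H*W] are delegated; other shapes fall back from the NPU. A worked shape example (illustrative values):

  // input [2, 3, 4, 5] -> output [2, 60]    rank 2: accepted
  // input [2, 3, 4, 5] -> output [2, 3, 20] rank 3: rejected by IsSupport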
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_FLATTEN_NPU_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_FLATTEN_NPU_H_
+#include <vector>
+#include <string>
+#include "include/graph/op/all_ops.h"
+#include "src/delegate/npu/op/npu_op.h"
+
+namespace mindspore {
+class FlattenNPUOp : public NPUOp {
+ public:
+  FlattenNPUOp(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
+               const std::vector<mindspore::MSTensor> &out_tensors, std::string name)
+      : NPUOp(primitive, in_tensors, out_tensors, name) {}
+  ~FlattenNPUOp() override;
+
+  int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
+                const std::vector<mindspore::MSTensor> &out_tensors) override;
+
+  int Init(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
+           const std::vector<mindspore::MSTensor> &out_tensors) override;
+
+  int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
+                   const std::vector<mindspore::MSTensor> &out_tensors,
+                   const std::vector<ge::Operator *> &npu_inputs) override;
+
+  ge::Operator *GetNPUOp() override;
+
+ private:
+  hiai::op::Flatten *flatten_ = nullptr;
+};
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_FLATTEN_NPU_H_
@@ -55,9 +55,6 @@ int MatMulNPUOp::Init(const schema::Primitive *primitive, const std::vector<mind
     MS_LOG(ERROR) << "New matmul npu operator for op " << name_ << " failed.";
     return RET_ERROR;
   }
-  if (in_tensors.size() == MATMUL_INPUT_SIZE) {
-    has_bias_ = true;
-  }
   auto matmul_prim = primitive->value_as_MatMulFusion();
   if (matmul_prim == nullptr) {
     MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
@@ -66,6 +63,15 @@ int MatMulNPUOp::Init(const schema::Primitive *primitive, const std::vector<mind
   matmul_->set_attr_transpose_x1(matmul_prim->transpose_a());
   matmul_->set_attr_transpose_x2(matmul_prim->transpose_b());
   act_type_ = matmul_prim->activation_type();
+
+  if (in_tensors.size() == MATMUL_INPUT_SIZE) {
+    has_bias_ = true;
+    add_op_ = new (std::nothrow) hiai::op::Add(name_ + "_add");
+    if (add_op_ == nullptr) {
+      MS_LOG(ERROR) << "new add op failed.";
+      return RET_ERROR;
+    }
+  }
   return RET_OK;
 }
 
@@ -75,11 +81,6 @@ int MatMulNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors
   matmul_->set_input_x1(*npu_inputs[0]);
   matmul_->set_input_x2(*npu_inputs[1]);
   if (has_bias_) {
-    add_op_ = new (std::nothrow) hiai::op::Add(name_ + "_add");
-    if (add_op_ == nullptr) {
-      MS_LOG(ERROR) << "new add op failed.";
-      return RET_ERROR;
-    }
     add_op_->set_input_x1(*matmul_);
     auto bias_shape = in_tensors[BIAS_INDEX].Shape();
     auto bias_tensor = ConverterToNPUTensor(in_tensors[BIAS_INDEX]);
@@ -104,7 +105,7 @@ int MatMulNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors
   }
   if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
     int ret = RET_ERROR;
-    if (has_bias_ == true) {
+    if (has_bias_) {
       ret = SetActivation(add_op_);
     } else {
       ret = SetActivation(matmul_);
@@ -133,7 +133,8 @@ NPUOp *GetNPUOp(const schema::Primitive *primitive, const std::vector<mindspore:
    return nullptr;
   }
 
-  std::set<schema::PrimitiveType> int32_lists = {schema::PrimitiveType_Cast, schema::PrimitiveType_StridedSlice};
+  std::set<schema::PrimitiveType> int32_lists = {schema::PrimitiveType_Cast, schema::PrimitiveType_StridedSlice,
+                                                 schema::PrimitiveType_Reshape, schema::PrimitiveType_ReduceFusion};
   auto support_int32 = in_tensors[0].DataType() == DataType::kNumberTypeInt32 &&
                        find(int32_lists.begin(), int32_lists.end(), primitive->value_type()) != int32_lists.end();
   if (in_tensors[0].DataType() != DataType::kNumberTypeFloat32 &&
@@ -32,7 +32,7 @@ int ReduceNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector
     return RET_NOT_SUPPORT;
   }
   reduce_mode_ = reduce_prim->mode();
-  if (reduce_mode_ != schema::ReduceMode_ReduceMean) {
+  if (reduce_mode_ != schema::ReduceMode_ReduceMean && reduce_mode_ != schema::ReduceMode_ReduceSum) {
     MS_LOG(WARNING) << "Npu does not support reduce mode " << reduce_prim->mode() << " for op " << name_;
     return RET_NOT_SUPPORT;
   }
@@ -58,6 +58,14 @@ int ReduceNPUOp::Init(const schema::Primitive *primitive, const std::vector<mind
     }
     reduce_mean->set_attr_keep_dims(reduce_prim->keep_dims());
     reduce_ = reduce_mean;
+  } else if (reduce_mode_ == schema::ReduceMode_ReduceSum) {
+    auto reduce_sum = new (std::nothrow) hiai::op::ReduceSum(name_);
+    if (reduce_sum == nullptr) {
+      MS_LOG(ERROR) << "New reduce operator for op " << name_ << " failed.";
+      return RET_ERROR;
+    }
+    reduce_sum->set_attr_keep_dims(reduce_prim->keep_dims());
+    reduce_ = reduce_sum;
   } else {
     MS_LOG(ERROR) << "Npu does not support reduce mode " << reduce_prim->mode() << " for op " << name_;
     return RET_ERROR;
@@ -71,6 +79,9 @@ int ReduceNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors
   if (reduce_mode_ == schema::ReduceMode_ReduceMean) {
     auto reduce_mean = reinterpret_cast<hiai::op::ReduceMean *>(reduce_);
     reduce_mean->set_input_x(*npu_inputs[0]).set_input_axes(*npu_inputs[1]);
+  } else if (reduce_mode_ == schema::ReduceMode_ReduceSum) {
+    auto reduce_sum = reinterpret_cast<hiai::op::ReduceSum *>(reduce_);
+    reduce_sum->set_input_x(*npu_inputs[0]).set_input_axes(*npu_inputs[1]);
   }
   return RET_OK;
 }
@@ -20,13 +20,18 @@
 namespace mindspore {
 int ReshapeNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
                             const std::vector<mindspore::MSTensor> &out_tensors) {
-  if (in_tensors.size() != 2) {
-    MS_LOG(WARNING) << "Npu op should have 2 input tensors.";
+  if (in_tensors.size() != kInputSize1) {
+    MS_LOG(WARNING) << "NPU op should have 2 input tensors.";
     return RET_NOT_SUPPORT;
   }
   auto shape_tensor = in_tensors.at(1);
   if (shape_tensor.Data() == nullptr) {
-    MS_LOG(WARNING) << "Npu reshape op only supports const shape.";
+    MS_LOG(WARNING) << "NPU Reshape op only supports const shape.";
+    return RET_NOT_SUPPORT;
+  }
+  if (shape_tensor.Shape().size() > 1 || shape_tensor.ElementNum() > NPU_SHAPE_SIZE) {
+    MS_LOG(WARNING) << "For NPU Reshape op, the shape tensor should be a one-dimension tensor and its element number "
+                       "should be less than 4.";
     return RET_NOT_SUPPORT;
   }
   return RET_OK;
@@ -42,14 +47,6 @@ int ReshapeNPUOp::Init(const schema::Primitive *primitive, const std::vector<min
   return RET_OK;
 }
 
-int ReshapeNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                               const std::vector<mindspore::MSTensor> &out_tensors,
-                               const std::vector<ge::Operator *> &npu_inputs) {
-  reshape_->set_input_x(*npu_inputs[0]);
-  reshape_->set_input_shape(*npu_inputs[1]);
-  return RET_OK;
-}
-
 int ReshapeNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
                                const std::vector<mindspore::MSTensor> &out_tensors,
                                const std::vector<ge::Operator *> &npu_inputs,
@@ -37,10 +37,6 @@ class ReshapeNPUOp : public NPUOp {
   int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
                 const std::vector<mindspore::MSTensor> &out_tensors) override;
 
-  int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                   const std::vector<mindspore::MSTensor> &out_tensors,
-                   const std::vector<ge::Operator *> &npu_inputs) override;
-
   int SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
                    const std::vector<mindspore::MSTensor> &out_tensors, const std::vector<ge::Operator *> &npu_inputs,
                    const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) override;
@@ -30,88 +30,122 @@ int ScaleNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<
     MS_LOG(ERROR) << "Get null primitive value for op: " << name_;
     return RET_ERROR;
   }
+  auto input_dims = in_tensors.at(INPUT_INDEX).Shape().size();
   axis_ = scale_prim->axis();
   if (axis_ < 0) {
-    axis_ = axis_ + in_tensors[INPUT_INDEX].Shape().size();
+    axis_ = axis_ + input_dims;
   }
   if (axis_ != NHWC_C && axis_ != NCHW_C) {
     if (in_tensors.size() <= BIAS_INDEX) {
       MS_LOG(INFO) << "Npu Scale op does not support axis: " << axis_ << ", trying to convert to Mul op.";
       use_mul_ = true;
+      return RET_OK;
     } else {
       MS_LOG(WARNING) << "Npu Scale axis attr only support 1 or channel, now is " << axis_;
       return RET_NOT_SUPPORT;
     }
   }
+  if (input_dims < NPU_SHAPE_SIZE) {
+    need_expand_ = true;
+  }
   return RET_OK;
 }
 
 int ScaleNPUOp::Init(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
                      const std::vector<mindspore::MSTensor> &out_tensors) {
-  if (!use_mul_) {
-    // note that Scale only support the default axis(i.e., 1), setting axis is meaningless.
-    op_ = new (std::nothrow) hiai::op::Scale(name_);
-  } else {
-    op_ = new (std::nothrow) hiai::op::Mul(name_);
-  }
-  if (op_ == nullptr) {
-    MS_LOG(ERROR) << name_ << " op is nullptr";
-    return RET_ERROR;
-  }
-
   auto scale_prim = primitive->value_as_ScaleFusion();
   if (scale_prim == nullptr) {
     MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
     return RET_ERROR;
   }
+
+  if (use_mul_) {
+    mul_ = new (std::nothrow) hiai::op::Mul(name_ + "_mul");
+    if (mul_ == nullptr) {
+      MS_LOG(ERROR) << "New Mul npu operator for op " << name_ << "_mul failed.";
+      return RET_ERROR;
+    }
+    scale_ops_.emplace_back(mul_);
+  } else {
+    // note that Scale only support the default axis(i.e., 1), setting axis is meaningless.
+    scale_ = new (std::nothrow) hiai::op::Scale(name_);
+    if (scale_ == nullptr) {
+      MS_LOG(ERROR) << "New Scale npu operator for op " << name_ << " failed.";
+      return RET_ERROR;
+    }
+    scale_ops_.emplace_back(scale_);
+  }
+
+  if (need_expand_) {
+    out_reshape_ = new (std::nothrow) hiai::op::Reshape(name_ + "_restore");
+    if (out_reshape_ == nullptr) {
+      MS_LOG(ERROR) << "New Reshape npu operator for op " << name_ << "_restore failed.";
+      return RET_ERROR;
+    }
+    scale_ops_.emplace_back(out_reshape_);
+  }
+
   act_type_ = scale_prim->activation_type();
   if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
-    auto ret = SetActivation(op_);
-    if (ret != RET_OK) {
+    act_ = new (std::nothrow) hiai::op::Activation(name_ + "_act");
+    if (act_ == nullptr) {
       MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
-      return ret;
+      return RET_ERROR;
     }
+    scale_ops_.emplace_back(act_);
   }
   return RET_OK;
 }
 
-int ScaleNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
-                             const std::vector<mindspore::MSTensor> &out_tensors,
-                             const std::vector<ge::Operator *> &npu_inputs) {
-  MS_ASSERT(in_tensors.size() > SCALE_INDEX);
-  if (use_mul_) {
-    auto ret = ConvertScaleToMul(npu_inputs, op_, in_tensors);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Convert Scale to Mul failed, op name: " << name_;
-    }
-    return ret;
-  }
-
-  auto scale_op = reinterpret_cast<hiai::op::Scale *>(op_);
-  scale_op->set_input_x(*npu_inputs.at(INPUT_INDEX));
-  scale_op->set_input_scale(*npu_inputs.at(SCALE_INDEX));
-  if (in_tensors.size() > BIAS_INDEX && in_tensors[BIAS_INDEX] != nullptr) {
-    scale_op->set_input_bias(*npu_inputs.at(BIAS_INDEX));
-  }
-  return RET_OK;
-}
-
 ge::Operator *ScaleNPUOp::GetNPUOp() {
-  if (act_type_ == schema::ActivationType_NO_ACTIVATION) {
-    return op_;
-  } else {
+  if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
     return act_;
+  } else if (use_mul_) {
+    return mul_;
+  } else if (need_expand_) {
+    return out_reshape_;
+  } else {
+    return scale_;
   }
 }
 
-int ScaleNPUOp::SetActivation(const ge::Operator *input) {
-  act_ = new (std::nothrow) hiai::op::Activation(name_ + "_act");
-  if (act_ == nullptr) {
-    MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
-    return RET_ERROR;
+int ScaleNPUOp::SetNPUInputs(const std::vector<mindspore::MSTensor> &in_tensors,
+                             const std::vector<mindspore::MSTensor> &out_tensors,
+                             const std::vector<ge::Operator *> &npu_inputs) {
+  if (use_mul_) {
+    auto ret = ConvertScaleToMul(npu_inputs, in_tensors);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Convert Scale to Mul failed, op name: " << name_;
+      return RET_ERROR;
+    }
+  } else {
+    auto ret = Adopt4DScale(npu_inputs, in_tensors);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Adopt 4D Scale op failed, op name: " << name_;
+      return RET_ERROR;
+    }
   }
-  act_->set_input_x(*input);
+  if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
+    auto ret = SetActivation();
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Set Activation failed, op name: " << name_;
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
+int ScaleNPUOp::SetActivation() {
+  ge::Operator *act_input = nullptr;
+  if (use_mul_) {
+    act_input = mul_;
+  } else if (need_expand_) {
+    act_input = out_reshape_;
+  } else {
+    act_input = scale_;
+  }
+  MS_CHECK_TRUE_MSG(act_input != nullptr, RET_ERROR, "Scale activation input is nullptr.");
+  act_->set_input_x(*act_input);
   auto act_mode = ConverterToNPUActivationMode(act_type_);
   if (act_mode == ACTIVATION_INVALID) {
     MS_LOG(ERROR) << "Unsupported activation type for scale op " << name_;
@ -121,69 +155,138 @@ int ScaleNPUOp::SetActivation(const ge::Operator *input) {
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ScaleNPUOp::ConvertScaleToMul(const std::vector<ge::Operator *> &npu_inputs, ge::Operator *cur_op,
|
int ScaleNPUOp::ConvertScaleToMul(const std::vector<ge::Operator *> &npu_inputs,
|
||||||
const std::vector<mindspore::MSTensor> &in_tensors) {
|
const std::vector<mindspore::MSTensor> &in_tensors) {
|
||||||
auto input_shape = in_tensors[INPUT_INDEX].Shape();
|
auto input_shape = in_tensors.at(INPUT_INDEX).Shape();
|
||||||
auto scale_shape = in_tensors[SCALE_INDEX].Shape();
|
auto scale_shape = in_tensors.at(SCALE_INDEX).Shape();
|
||||||
auto mul_op = reinterpret_cast<hiai::op::Mul *>(cur_op);
|
mul_->set_input_x1(*npu_inputs.at(INPUT_INDEX));
|
||||||
mul_op->set_input_x1(*npu_inputs.at(INPUT_INDEX));
|
|
||||||
if (input_shape.size() == scale_shape.size()) {
|
if (input_shape.size() == scale_shape.size()) {
|
||||||
mul_op->set_input_x2(*npu_inputs.at(SCALE_INDEX));
|
mul_->set_input_x2(*npu_inputs.at(SCALE_INDEX));
|
||||||
} else {
|
} else {
|
||||||
int valid_shape[NPU_SHAPE_SIZE] = {1, 1, 1, 1};
|
int64_t valid_dims = input_shape.size();
|
||||||
|
std::vector<int> valid_shape(valid_dims, 1);
|
||||||
for (size_t i = 0; i < scale_shape.size(); i++) {
|
for (size_t i = 0; i < scale_shape.size(); i++) {
|
||||||
valid_shape[axis_ + i] = static_cast<int>(scale_shape[i]);
|
valid_shape[axis_ + i] = static_cast<int>(scale_shape[i]);
|
||||||
}
|
}
|
||||||
reshape_ = new (std::nothrow) hiai::op::Reshape(name_ + "_reshape");
|
auto reshape = new (std::nothrow) hiai::op::Reshape(name_ + "_mul_reshape");
|
||||||
if (reshape_ == nullptr) {
|
if (reshape == nullptr) {
|
||||||
MS_LOG(ERROR) << "New Reshape npu operator for op " << name_ << "_reshape failed.";
|
MS_LOG(ERROR) << "New Reshape npu operator for op " << name_ << "_mul_reshape failed.";
|
||||||
return RET_ERROR;
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
std::shared_ptr<ge::Tensor> shape_tensor = std::make_shared<ge::Tensor>();
|
scale_ops_.emplace_back(reshape);
|
||||||
if (shape_tensor == nullptr) {
|
auto valid_data_ptr = reinterpret_cast<const uint8_t *>(valid_shape.data());
|
||||||
MS_LOG(ERROR) << "new shape_tensor failed.";
|
auto shape = GetNPUConst<int>(valid_data_ptr, {valid_dims}, ge::DT_INT32, name_ + "_mul_expand_shape");
|
||||||
|
if (shape == nullptr) {
|
||||||
|
MS_LOG(ERROR) << "Get shape const for op " << name_ << "_mul failed.";
|
||||||
return RET_ERROR;
|
return RET_ERROR;
|
||||||
}
|
}
|
||||||
ge::TensorDesc tensor_desc(ge::Shape({NPU_SHAPE_SIZE}), ge::FORMAT_ND, ge::DT_INT32);
|
scale_ops_.emplace_back(shape);
|
||||||
shape_tensor->SetTensorDesc(tensor_desc);
|
reshape->set_input_x(*npu_inputs.at(SCALE_INDEX));
|
||||||
shape_tensor->SetData(reinterpret_cast<const uint8_t *>(valid_shape), NPU_SHAPE_SIZE * sizeof(int));
|
reshape->set_input_shape(*shape);
|
||||||
shape_ = new (std::nothrow) hiai::op::Const(name_ + "_reshape_1");
|
mul_->set_input_x2(*reshape);
|
||||||
if (shape_ == nullptr) {
|
|
||||||
MS_LOG(ERROR) << "New shape const for op " << name_ << " failed.";
|
|
||||||
return RET_ERROR;
|
|
||||||
}
|
|
||||||
shape_->set_attr_value(shape_tensor);
|
|
||||||
reshape_->set_input_x(*npu_inputs.at(SCALE_INDEX));
|
|
||||||
reshape_->set_input_shape(*shape_);
|
|
||||||
mul_op->set_input_x2(*reshape_);
|
|
||||||
}
|
}
|
||||||
return RET_OK;
|
return RET_OK;
|
||||||
}
|
}
|
||||||
|
+int ScaleNPUOp::Adopt4DScale(const std::vector<ge::Operator *> &npu_inputs,
+                             const std::vector<mindspore::MSTensor> &in_tensors) {
+  MS_ASSERT(scale_ != nullptr);
+  // handle input
+  auto org_input_tensor = in_tensors.at(INPUT_INDEX);
+  ge::Operator *actual_input = npu_inputs.at(INPUT_INDEX);
+  std::vector<int64_t> org_input_shape = org_input_tensor.Shape();
+  if (need_expand_) {
+    actual_input = ChangeDims(npu_inputs.at(INPUT_INDEX), org_input_shape, name_ + "_expand_input", true);
+    if (actual_input == nullptr) {
+      MS_LOG(ERROR) << "Change Scale op input dims failed.";
+      return RET_ERROR;
+    }
+  }
+  scale_->set_input_x(*actual_input);
+
+  // handle scale, note that the scale axis can only be 1.
+  auto org_scale_tensor = in_tensors.at(SCALE_INDEX);
+  ge::Operator *actual_scale = npu_inputs.at(SCALE_INDEX);
+  if (org_scale_tensor.Shape().size() == DIMENSION_2D) {
+    std::vector<int64_t> expand_scale_shape = org_scale_tensor.Shape();
+    expand_scale_shape.emplace_back(1);
+    actual_scale = ChangeDims(npu_inputs.at(SCALE_INDEX), expand_scale_shape, name_ + "_expand_scale");
+    if (actual_scale == nullptr) {
+      MS_LOG(ERROR) << "Change Scale op scale dims failed.";
+      return RET_ERROR;
+    }
+  }
+  scale_->set_input_scale(*actual_scale);
+
+  // handle bias
+  if (in_tensors.size() > BIAS_INDEX) {
+    auto org_bias_tensor = in_tensors.at(BIAS_INDEX);
+    ge::Operator *actual_bias = npu_inputs.at(BIAS_INDEX);
+    if (org_bias_tensor.Shape().size() == DIMENSION_2D) {
+      std::vector<int64_t> expand_bias_shape = org_bias_tensor.Shape();
+      expand_bias_shape.emplace_back(1);
+      actual_bias = ChangeDims(npu_inputs.at(BIAS_INDEX), expand_bias_shape, name_ + "_expand_bias");
+      if (actual_bias == nullptr) {
+        MS_LOG(ERROR) << "Change Scale op bias dims failed.";
+        return RET_ERROR;
+      }
+    }
+    scale_->set_input_bias(*actual_bias);
+  }
+
+  // restore to origin input shape
+  if (need_expand_) {
+    int64_t dims = org_input_shape.size();
+    std::vector<int> valid_shape;
+    for (int i = 0; i < dims; i++) {
+      valid_shape.emplace_back(static_cast<int>(org_input_shape.at(i)));
+    }
+    auto valid_data_ptr = reinterpret_cast<const uint8_t *>(valid_shape.data());
+    auto shape = GetNPUConst<int>(valid_data_ptr, {dims}, ge::DT_INT32, name_ + "_restore_shape");
+    if (shape == nullptr) {
+      MS_LOG(ERROR) << "Get NPU Const for shape restoration failed.";
+      return RET_ERROR;
+    }
+    scale_ops_.emplace_back(shape);
+    out_reshape_->set_input_x(*scale_);
+    out_reshape_->set_input_shape(*shape);
+  }
+  return RET_OK;
+}
+
+ge::Operator *ScaleNPUOp::ChangeDims(const ge::Operator *input, std::vector<int64_t> dst_shape, std::string name,
+                                     bool need_expand_4d) {
+  MS_ASSERT(input != nullptr);
+  auto reshape = new (std::nothrow) hiai::op::Reshape(name);
+  if (reshape == nullptr) {
+    MS_LOG(ERROR) << "New Reshape NPU operator failed.";
+    return nullptr;
+  }
+  scale_ops_.emplace_back(reshape);
+  MS_CHECK_LE(dst_shape.size(), NPU_SHAPE_SIZE, nullptr);
+  int64_t actual_dim = need_expand_4d ? NPU_SHAPE_SIZE : dst_shape.size();
+  std::vector<int> valid_shape(actual_dim, 1);
+  for (int i = 0; i < dst_shape.size(); i++) {
+    valid_shape[i] = static_cast<int>(dst_shape.at(i));
+  }
+  auto valid_data_ptr = reinterpret_cast<const uint8_t *>(valid_shape.data());
+  auto shape = GetNPUConst<int>(valid_data_ptr, {actual_dim}, ge::DT_INT32, name_ + "_shape");
+  if (shape == nullptr) {
+    MS_LOG(ERROR) << "Get NPU Const for shape restoration failed.";
+    return nullptr;
+  }
+  scale_ops_.emplace_back(shape);
+  reshape->set_input_x(*input);
+  reshape->set_input_shape(*shape);
+  return reshape;
+}
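Note: Adopt4DScale and ChangeDims together implement one trick: when need_expand_ is set, the runtime shape is padded with trailing 1s up to the NPU's 4-D layout before the Scale operator is wired, and the result is reshaped back to the original rank afterwards. A minimal standalone sketch of that shape arithmetic (hypothetical helper name, not part of the patch):

#include <cstdint>
#include <vector>

// Pad a shape with trailing 1s up to rank 4, mirroring ChangeDims(..., need_expand_4d = true).
// Padded axes have extent 1, so the underlying data layout is unchanged, e.g. {8, 16} -> {8, 16, 1, 1}.
std::vector<int64_t> ExpandTo4D(const std::vector<int64_t> &shape) {
  std::vector<int64_t> expanded(shape);
  while (expanded.size() < 4) {
    expanded.push_back(1);
  }
  return expanded;
}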
 ScaleNPUOp::~ScaleNPUOp() {
-  if (op_ != nullptr) {
-    delete op_;
-    op_ = nullptr;
-  }
-  if (scale_ != nullptr) {
-    delete scale_;
-    scale_ = nullptr;
-  }
-  if (bias_ != nullptr) {
-    delete bias_;
-    bias_ = nullptr;
-  }
-  if (act_ != nullptr) {
-    delete act_;
-    act_ = nullptr;
-  }
-  if (reshape_ != nullptr) {
-    delete reshape_;
-    reshape_ = nullptr;
-  }
-  if (shape_ != nullptr) {
-    delete shape_;
-    shape_ = nullptr;
-  }
+  for (auto op : scale_ops_) {
+    if (op != nullptr) {
+      delete op;
+      op = nullptr;
+    }
+  }
 }
 }  // namespace mindspore
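Note: the destructor now frees every operator through scale_ops_, the single vector in which each allocation in this op is registered, instead of six hand-written delete blocks. The ownership pattern, reduced to a sketch with illustrative types (not the patch's API):

#include <new>
#include <vector>

struct Op {};  // stands in for ge::Operator

struct OwnedOps {
  std::vector<Op *> ops_;
  Op *Create() {
    auto *op = new (std::nothrow) Op();
    if (op != nullptr) {
      ops_.push_back(op);  // register at the allocation site
    }
    return op;
  }
  ~OwnedOps() {
    for (auto *op : ops_) {
      delete op;  // one cleanup path; a newly added member cannot be forgotten
    }
  }
};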
@@ -46,20 +46,25 @@ class ScaleNPUOp : public NPUOp {
   int GetAxis() { return axis_; }

  private:
-  int SetActivation(const ge::Operator *input);
+  int SetActivation();

-  int ConvertScaleToMul(const std::vector<ge::Operator *> &npu_inputs, ge::Operator *cur_op,
+  int ConvertScaleToMul(const std::vector<ge::Operator *> &npu_inputs,
                         const std::vector<mindspore::MSTensor> &in_tensors);

+  int Adopt4DScale(const std::vector<ge::Operator *> &npu_inputs, const std::vector<mindspore::MSTensor> &in_tensors);
+
+  ge::Operator *ChangeDims(const ge::Operator *input, std::vector<int64_t> dst_shape, std::string name,
+                           bool need_expand_4d = false);
+
   int axis_ = 0;
   bool use_mul_ = false;
+  bool need_expand_ = false;
   schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
-  ge::Operator *op_ = nullptr;
-  hiai::op::Reshape *reshape_ = nullptr;
-  hiai::op::Const *scale_ = nullptr;
-  hiai::op::Const *bias_ = nullptr;
-  hiai::op::Const *shape_ = nullptr;
+  hiai::op::Reshape *out_reshape_ = nullptr;
+  hiai::op::Scale *scale_ = nullptr;
+  hiai::op::Mul *mul_ = nullptr;
   hiai::op::Activation *act_ = nullptr;
+  std::vector<ge::Operator *> scale_ops_ = {};
 };
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SCALE_NPU_H_
@@ -32,14 +32,21 @@ int SplitNPUOp::Init(const schema::Primitive *primitive, const std::vector<minds
     return RET_ERROR;
   }
+
+  axis_ = static_cast<int>(split_prim->axis());
+  auto split_dim = in_tensors.at(0).Shape().at(axis_);
   auto sizes_split = split_prim->size_splits();
-  std::vector<int> sizes_split_vec;
-  if (sizes_split != nullptr) {
-    sizes_split_vec = std::vector<int>(sizes_split->begin(), sizes_split->end());
-  } else {
-    return RET_ERROR;
-  }
   int size = split_prim->output_num();
+  std::vector<int> sizes_split_vec;
+  CHECK_NULL_RETURN(sizes_split);
+  for (int i = 0; i < size; ++i) {
+    auto cur_size = sizes_split->Get(i);
+    if (i == size - 1 && cur_size == -1) {
+      sizes_split_vec.emplace_back(split_dim);
+      break;
+    }
+    split_dim -= cur_size;
+    sizes_split_vec.emplace_back(cur_size);
+  }
   ge::TensorDesc size_splits_tensor_desc(ge::Shape({size}), ge::FORMAT_NCHW, ge::DT_INT32);
   ge::TensorPtr size_splits_tensor = std::make_shared<hiai::Tensor>(size_splits_tensor_desc);
   size_splits_tensor->SetData(reinterpret_cast<uint8_t *>(sizes_split_vec.data()), size * sizeof(int));
@@ -50,8 +57,6 @@ int SplitNPUOp::Init(const schema::Primitive *primitive, const std::vector<minds
   }
   size_splits_->set_attr_value(size_splits_tensor);
   split_->set_input_size_splits(*size_splits_);
-
-  axis_ = static_cast<int>(split_prim->axis());
   split_->set_attr_num_split(size);
   split_->create_dynamic_output_y(size);
   return RET_OK;
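Note: the new loop resolves TFLite's convention that a trailing -1 in size_splits means "whatever is left of the split axis". The same computation as a standalone function (hypothetical name, simplified from the flatbuffer accessors above):

#include <vector>

// Resolve a trailing -1 in size_splits against the extent of the split axis.
// e.g. split_dim = 10 with sizes {2, 3, -1} resolves to {2, 3, 5}.
std::vector<int> ResolveSizeSplits(const std::vector<int> &raw_sizes, int split_dim) {
  std::vector<int> resolved;
  for (size_t i = 0; i < raw_sizes.size(); ++i) {
    int cur_size = raw_sizes[i];
    if (i == raw_sizes.size() - 1 && cur_size == -1) {
      resolved.push_back(split_dim);  // the remainder
      break;
    }
    split_dim -= cur_size;
    resolved.push_back(cur_size);
  }
  return resolved;
}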
@@ -22,10 +22,6 @@
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
-
-namespace {
-constexpr int kNumDims = 4;
-}  // namespace
-
 namespace mindspore {
 bool CheckFusion(NPUOp *cur_op, const std::vector<mindspore::MSTensor> &graph_outputs) {
   if (cur_op->in_ops().empty() || cur_op->out_ops().empty()) {
@@ -77,32 +73,32 @@ void NPUFusionPass::RemoveAndFreeOp(NPUOp *cur_op) {
 }

 int NPUFusionPass::UpdatePreOps(NPUOp *cur_op) {
+  auto cur_in_ops = cur_op->in_ops();
   for (auto in_op : cur_op->in_ops()) {
     // graph in op
     if (in_op->in_ops().empty()) {
-      continue;
-    }
-    auto pre_op = in_op->in_ops()[0];
-
-    auto pre_out_ops = pre_op->out_ops();
-    for (size_t i = 0; i < pre_out_ops.size(); i++) {
-      if (pre_out_ops[i] == in_op) {
-        pre_out_ops[i] = cur_op;
-        break;
-      }
-    }
-    pre_op->set_out_ops(pre_out_ops);
-
-    auto cur_in_ops = cur_op->in_ops();
-    for (size_t i = 0; i < cur_in_ops.size(); i++) {
-      if (cur_in_ops[i] == in_op) {
-        cur_in_ops[i] = pre_op;
-        break;
-      }
-    }
-    cur_op->set_in_ops(cur_in_ops);
+      cur_in_ops.erase(find(cur_in_ops.begin(), cur_in_ops.end(), in_op));
+    } else {
+      auto pre_op = in_op->in_ops()[0];
+      auto pre_out_ops = pre_op->out_ops();
+      for (size_t i = 0; i < pre_out_ops.size(); i++) {
+        if (pre_out_ops[i] == in_op) {
+          pre_out_ops[i] = cur_op;
+          break;
+        }
+      }
+      pre_op->set_out_ops(pre_out_ops);
+
+      for (size_t i = 0; i < cur_in_ops.size(); i++) {
+        if (cur_in_ops[i] == in_op) {
+          cur_in_ops[i] = pre_op;
+          break;
+        }
+      }
+    }
     RemoveAndFreeOp(in_op);
   }
+  cur_op->set_in_ops(cur_in_ops);
   return RET_OK;
 }
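Note: UpdatePreOps now distinguishes two cases while it detaches each fused-away in_op: a transpose that is a graph input is simply erased from cur_op's in_ops, while an interior transpose has its pre_op -> in_op -> cur_op edges rewired to pre_op -> cur_op on both sides. A toy sketch of that rewiring on plain adjacency vectors (illustrative Node type; the pass itself replaces only the first match and breaks):

#include <algorithm>
#include <vector>

struct Node {
  std::vector<Node *> in_ops;
  std::vector<Node *> out_ops;
};

// Splice `mid` out of the chain pre -> mid -> cur so the graph reads pre -> cur.
void BypassNode(Node *pre, Node *mid, Node *cur) {
  std::replace(pre->out_ops.begin(), pre->out_ops.end(), mid, cur);
  std::replace(cur->in_ops.begin(), cur->in_ops.end(), mid, pre);
}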
@@ -139,19 +135,26 @@ int NPUFusionPass::UpdatePostOps(NPUOp *cur_op) {
 int UpdatePreTensors(NPUOp *cur_op) {
   auto tensors_vec = NPUPassUtils::GetNonConstInputs(cur_op);
   for (auto in_op : cur_op->in_ops()) {
-    if (in_op->inputs().empty() || in_op->outputs().empty() || in_op->in_ops().empty()) {
-      MS_LOG(ERROR) << "in_tensors/out_tensors/in_ops is empty.";
+    if (in_op->inputs().empty() || in_op->outputs().empty()) {
+      MS_LOG(ERROR) << "in_tensors or out_tensors of input op is empty.";
       return RET_ERROR;
     }
     mindspore::MSTensor cur_tensor;
     auto in_tensor = in_op->inputs()[0];
     auto out_tensor = in_op->outputs()[0];
-    auto pre_op = in_op->in_ops()[0];
-    for (size_t i = 0; i < pre_op->outputs().size(); i++) {
-      if (pre_op->outputs()[i] == in_tensor) {
-        cur_tensor = pre_op->outputs()[i];
-        break;
+    if (!in_op->in_ops().empty()) {
+      auto pre_op = in_op->in_ops()[0];
+      for (size_t i = 0; i < pre_op->outputs().size(); i++) {
+        if (pre_op->outputs()[i] == in_tensor) {
+          cur_tensor = pre_op->outputs()[i];
+          break;
+        }
       }
+    } else {
+      // graph input
+      cur_tensor = in_tensor;
     }

     for (size_t i = 0; i < tensors_vec.size(); i++) {
       if (tensors_vec[i] == out_tensor) {
         tensors_vec[i] = cur_tensor;
@@ -173,56 +176,47 @@ int UpdatePreTensors(NPUOp *cur_op) {
   return RET_OK;
 }

-bool NodeWithNhwc2nchw2nhwcOutput(NPUOp *cur_op) {
-  auto out_ops = cur_op->out_ops();
-  if (out_ops.empty()) {
-    return false;
-  }
-  bool all_out_ops_transpose = std::all_of(out_ops.begin(), out_ops.end(), [](NPUOp *op) {
-    return op->type() == schema::PrimitiveType_Transpose && op->out_ops().size() == 1 &&
-           op->out_ops()[0]->type() == schema::PrimitiveType_Transpose && op->out_ops()[0]->out_ops().empty();
-  });
-  return all_out_ops_transpose;
-}
-
 int UpdatePostTensors(NPUOp *cur_op) {
-  auto tensor = cur_op->outputs()[0];
-
-  // in case: node->nh2nc->nc2nh(graph output) --->>> node->nc2nh, node out_tensor should be put to nc2nh out tensors
-  auto out_ops = cur_op->out_ops();
-  if (NodeWithNhwc2nchw2nhwcOutput(cur_op)) {
-    std::vector<MSTensor> outputs;
-    for (auto i = 0; i < out_ops.size(); ++i) {
-      auto ori_out_tensor = cur_op->outputs()[i];
-      auto nc_tensor = out_ops[i]->outputs()[0];
-      outputs.push_back(nc_tensor);
-      auto post_post_op = out_ops[i]->out_ops()[0];
-      post_post_op->set_inputs({nc_tensor});
-      post_post_op->set_outputs({ori_out_tensor});
-    }
-    cur_op->set_outputs(outputs);
-    return RET_OK;
-  }
-
-  auto nhwc_shape = tensor.Shape();
-  if (nhwc_shape.size() < kNumDims) {
-    MS_LOG(ERROR) << "nhwc_shape < " << kNumDims;
-    return RET_ERROR;
-  }
-  tensor.SetShape({nhwc_shape[NHWC_N], nhwc_shape[NHWC_C], nhwc_shape[NHWC_H], nhwc_shape[NHWC_W]});
+  mindspore::MSTensor new_post_input;
   for (auto out_op : cur_op->out_ops()) {
+    auto in_tensor = out_op->inputs()[0];
     auto out_tensor = out_op->outputs()[0];
-    if (out_op->out_ops().empty()) {
-      cur_op->set_outputs({out_op->outputs()[0]});
-    }
-    for (auto post_op : out_op->out_ops()) {
-      auto tensors_vec = post_op->inputs();
-      for (int i = 0; i < tensors_vec.size(); i++) {
-        if (tensors_vec[i] == out_tensor) {
-          tensors_vec[i] = tensor;
+    auto nhwc_shape = in_tensor.Shape();
+    if (in_tensor.format() == Format::NHWC) {
+      MS_CHECK_TRUE_MSG(nhwc_shape.size() == NPU_SHAPE_SIZE, RET_ERROR, "Invalid transpose dim size!");
+      in_tensor.SetShape({nhwc_shape[NHWC_N], nhwc_shape[NHWC_C], nhwc_shape[NHWC_H], nhwc_shape[NHWC_W]});
+      in_tensor.SetFormat(Format::NCHW);
+    }
+    // out_op is a graph output op
+    if (out_op->out_ops().empty()) {
+      auto out_tensors_vec = cur_op->outputs();
+      for (size_t i = 0; i < out_tensors_vec.size(); i++) {
+        if (out_tensors_vec[i] == in_tensor) {
+          out_tensors_vec[i] = out_op->outputs()[0];
         }
       }
-      post_op->set_inputs(tensors_vec);
+      cur_op->set_outputs(out_tensors_vec);
+      // Other out_ops may use the same tensor as the current out_op; note that such an out_op has likely been
+      // updated already, which means it may no longer be a Transpose op.
+      for (auto other_out_op : cur_op->out_ops()) {
+        auto other_in_tensors_vec = other_out_op->inputs();
+        for (size_t i = 0; i < other_in_tensors_vec.size(); i++) {
+          if (other_in_tensors_vec[i] == in_tensor) {
+            other_in_tensors_vec[i] = out_op->outputs()[0];
+          }
+        }
+        other_out_op->set_inputs(other_in_tensors_vec);
+      }
+    }
+    // out_op is not a graph output op
+    for (auto post_op : out_op->out_ops()) {
+      auto in_tensors_vec = post_op->inputs();
+      for (size_t i = 0; i < in_tensors_vec.size(); i++) {
+        if (in_tensors_vec[i] == out_tensor) {
+          in_tensors_vec[i] = in_tensor;
+        }
+      }
+      post_op->set_inputs(in_tensors_vec);
     }
   }
   return RET_OK;
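Note: UpdatePostTensors now rewrites each transpose input tensor in place, permuting the NHWC shape into NCHW and flipping the tensor's format tag, instead of going through the removed kNumDims path. The index permutation on its own (the enum mirrors the NHWC_N/H/W/C indices referenced in the diff):

#include <cstdint>
#include <vector>

// NHWC index positions, matching the constants used above.
enum { NHWC_N = 0, NHWC_H = 1, NHWC_W = 2, NHWC_C = 3 };

// {1, 224, 224, 3} (NHWC) -> {1, 3, 224, 224} (NCHW)
std::vector<int64_t> Nhwc2NchwShape(const std::vector<int64_t> &nhwc) {
  return {nhwc[NHWC_N], nhwc[NHWC_C], nhwc[NHWC_H], nhwc[NHWC_W]};
}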
@@ -1,69 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "src/delegate/npu/pass/npu_infer_format_pass.h"
-#include <vector>
-#include <queue>
-#include <map>
-#include "src/delegate/npu/pass/npu_pass_utils.h"
-#include "src/delegate/npu/npu_converter_utils.h"
-#include "src/tensor.h"
-#include "src/cxx_api/tensor/tensor_impl.h"
-
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-
-namespace mindspore {
-int NPUInferFormatPass::Run(NPUGraph *subgraph) {
-  CHECK_NULL_RETURN(subgraph);
-  all_ops_ = subgraph->GetOps();
-  all_tensors_ = subgraph->GetInsertTensors();
-  auto graph_inputs = subgraph->inputs();
-  std::queue<NPUOp *> infer_ops;
-  std::map<tensor::MSTensor *, bool> is_inferred;
-  // initialization
-  for (auto op : *all_ops_) {
-    infer_ops.push(op);
-  }
-  for (auto tensor : *all_tensors_) {
-    is_inferred[tensor->impl()->lite_tensor()] = false;
-  }
-  for (auto input_tensor : graph_inputs) {
-    is_inferred[input_tensor.impl()->lite_tensor()] = true;
-  }
-  while (!infer_ops.empty()) {
-    auto cur_op = infer_ops.front();
-    infer_ops.pop();
-    bool input_inferred = std::all_of(cur_op->inputs().begin(), cur_op->inputs().end(), [&](auto in_tensor) {
-      return is_inferred[in_tensor.impl()->lite_tensor()] == true || in_tensor.IsConst();
-    });
-    if (input_inferred) {
-      auto dst_format = cur_op->inputs().at(0).format();
-      if (NPUPassUtils::IsNhwc2Nchw(cur_op) && dst_format == Format::NHWC) {
-        dst_format = Format::NCHW;
-      } else if (NPUPassUtils::IsNchw2Nhwc(cur_op) && dst_format == Format::NCHW) {
-        dst_format = Format::NHWC;
-      }
-      for (auto &out_tensor : cur_op->outputs()) {
-        const_cast<mindspore::MSTensor &>(out_tensor).SetFormat(dst_format);
-        is_inferred[out_tensor.impl()->lite_tensor()] = true;
-      }
-    } else {
-      infer_ops.push(cur_op);
-    }
-  }
-  return RET_OK;
-}
-}  // namespace mindspore
@@ -1,38 +0,0 @@
-/**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_INFER_FORMAT_PASS_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_INFER_FORMAT_PASS_H_
-
-#include <set>
-#include <vector>
-#include "src/delegate/npu/op/npu_op.h"
-#include "src/delegate/npu/pass/npu_base_pass.h"
-#include "src/common/log_util.h"
-
-namespace mindspore {
-class NPUInferFormatPass : public NPUBasePass {
- public:
-  NPUInferFormatPass() { name_ = "NPUInferFormatPass"; }
-
-  int Run(NPUGraph *subgraph) override;
-
- private:
-  std::vector<NPUOp *> *all_ops_;
-  std::vector<mindspore::MSTensor *> *all_tensors_;
-};
-}  // namespace mindspore
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_INFER_FORMAT_PASS_H_
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <set>
 #include <string>
 #include "src/delegate/npu/pass/npu_pass_utils.h"
+#include "src/delegate/npu/npu_converter_utils.h"

 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
@@ -52,9 +53,9 @@ int NPUInsertTransformPass::GetInsertState(NPUOp *op) {
   if (insert_nodes.find(op->type()) == insert_nodes.end()) {
     return InsertNone;
   }

   // current op is target op
-  // use out ops to count how many out lines from current op
+  // Use out ops to count the out lines from current op, since a single tensor can be used by multiple out ops.
+  // Besides, a tensor can be used by out ops and the graph output at the same time; that adds one more line.
   std::vector<mindspore::MSTensor> inputs = NPUPassUtils::GetNonConstInputs(op);
   size_t in_out_tensor_num =
     inputs.size() + std::max(std::max(op->out_ops().size(), static_cast<size_t>(1)), op->outputs().size());
@@ -76,13 +77,19 @@ int NPUInsertTransformPass::GetInsertState(NPUOp *op) {
       graph_input_num++;
     }
   }
-  if (op->out_ops().empty()) {
-    need_post_insert = true;
-  }
-  if (op->outputs().size() > op->out_ops().size()) {
-    graph_output_num = op->outputs().size() - op->out_ops().size();
+  auto graph_output = subgraph_->outputs();
+  for (auto output : op->outputs()) {
+    if (std::find(graph_output.begin(), graph_output.end(), output) != graph_output.end()) {
+      graph_output_num++;
+      need_post_insert = true;
+    }
   }
   for (const auto out_op : op->out_ops()) {
+    for (auto out_op_input : out_op->inputs()) {
+      if (std::find(graph_output.begin(), graph_output.end(), out_op_input) != graph_output.end()) {
+        in_out_tensor_num++;
+      }
+    }
     if (NPUPassUtils::IsNhwc2Nchw(out_op)) {
       transpose_output_num++;
     } else {
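Note: the counting change means an output tensor that is a graph output now contributes its own "line" and forces post insertion, rather than being inferred from op->outputs().size() - op->out_ops().size(). The rule in isolation (a sketch over plain ints standing in for tensors):

#include <algorithm>
#include <vector>

// Count outputs of an op that are also graph outputs; each one needs a post transpose line.
int CountGraphOutputLines(const std::vector<int> &op_outputs, const std::vector<int> &graph_outputs) {
  int graph_output_num = 0;
  for (int out : op_outputs) {
    if (std::find(graph_outputs.begin(), graph_outputs.end(), out) != graph_outputs.end()) {
      ++graph_output_num;
    }
  }
  return graph_output_num;
}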
@@ -99,105 +106,81 @@ int NPUInsertTransformPass::GetInsertState(NPUOp *op) {
       transpose_tensor_num == in_out_tensor_num) {
     return InsertNone;
   }
-  InsertState ret =
-    (need_pre_insert && need_post_insert)
-      ? BothInsert
-      : ((need_pre_insert && !need_post_insert) ? PreInsert
-                                                : ((!need_pre_insert && need_post_insert) ? PostInsert : InsertNone));
+  InsertState ret = (need_pre_insert && need_post_insert)
+                      ? BothInsert
+                      : (need_pre_insert ? PreInsert : (need_post_insert ? PostInsert : InsertNone));

   return ret;
 }

-int NPUInsertTransformPass::InsertNode(NPUOp *op, NPUOp *post_op, size_t post_input_index,
-                                       std::vector<NPUOp *> *trans_ops) {
-  // Op and post_op can't be nullptr at the same time.
-  std::string op_name;
-  std::vector<mindspore::MSTensor> in_tensors;
-  std::vector<NPUOp *> out_ops;
-  // If post_op equals nullptr, op is the output of whole graph.
-  if (post_op != nullptr) {
-    out_ops.push_back(post_op);
-    op_name = post_op->name() + "_pre";
-    in_tensors.push_back(post_op->inputs().at(post_input_index));
-  }
-  std::vector<NPUOp *> in_ops;
-  // If op equals nullptr, post_op is the input of whole graph.
-  if (op != nullptr && !op->outputs().empty()) {
-    in_ops.push_back(op);
-    op_name = op->name() + "_post";
-    in_tensors.resize(op->outputs().size());
-    std::copy(op->outputs().begin(), op->outputs().end(), in_tensors.begin());
-  }
-  for (auto i = 0; i < in_tensors.size(); ++i) {
-    auto in_tensor = in_tensors[i];
-    auto nhwc_shape = in_tensor.Shape();
-    if (nhwc_shape.size() == 0) {
-      continue;
-    } else if (nhwc_shape.size() < 4) {
-      MS_LOG(ERROR) << "nhwc_shape size < " << 4;
-      return RET_ERROR;
-    }
-    std::vector<int64_t> nchw_shape = {nhwc_shape[0], nhwc_shape[3], nhwc_shape[1], nhwc_shape[2]};
-
-    auto nh2nc_name = op_name + "_nh2nc_" + std::to_string(total++);
-    auto nh2nc_tensor =
-      mindspore::MSTensor::CreateTensor(nh2nc_name + "/output0", in_tensor.DataType(), nchw_shape, nullptr, 0);
-    if (nh2nc_tensor == nullptr) {
-      MS_LOG(ERROR) << "New nchw tensor failed when inserting nchw2nhwc op.";
-      return RET_ERROR;
-    }
-    nh2nc_tensor->SetTensorName(nh2nc_name + "/output0");
-    std::vector<mindspore::MSTensor> nh2nc_tensors = {*nh2nc_tensor};
-    all_tensors_->push_back(nh2nc_tensor);
-
-    auto nc2nh_name = op_name + "_nc2nh_" + std::to_string(total++);
-    auto nc2nh_tensor =
-      mindspore::MSTensor::CreateTensor(nc2nh_name + "/output0", in_tensor.DataType(), nhwc_shape, nullptr, 0);
-    if (nc2nh_tensor == nullptr) {
-      MS_LOG(ERROR) << "New nhwc tensor failed when inserting nhwc2nchw op.";
-      return RET_ERROR;
-    }
-    std::vector<mindspore::MSTensor> nc2nh_tensors = {*nc2nh_tensor};
-    all_tensors_->push_back(nc2nh_tensor);
-
-    auto *nh2nc_op = NPUPassUtils::CreateNhwc2NchwOp({in_tensor}, nh2nc_tensors, nh2nc_name);
-    trans_ops->push_back(nh2nc_op);
-
-    auto *nc2nh_op = NPUPassUtils::CreateNchw2NhwcOp(nh2nc_tensors, nc2nh_tensors, nc2nh_name);
-    trans_ops->push_back(nc2nh_op);
-
-    NPUPassUtils::UpdateOp(nh2nc_op, in_ops, {nc2nh_op}, {in_tensor}, nh2nc_tensors);
-    NPUPassUtils::UpdateOp(nc2nh_op, {nh2nc_op}, out_ops, {nh2nc_tensors[0]}, nc2nh_tensors);
-    if (op != nullptr) {
-      NPUPassUtils::UpdateNH2NCTransNodePreOp(op, nh2nc_op, post_op);
-    }
-    if (post_op != nullptr) {
-      NPUPassUtils::UpdateNC2NHTransNodePostOp(op, nc2nh_op, post_op);
-    } else {
-      // post_op nullptr means output; we keep the graph output tensor name unchanged
-      auto graph_output_name = in_tensor.Name();
-      nc2nh_tensor->SetTensorName(graph_output_name + "_after_" + name_);
-    }
-  }
+int NPUInsertTransformPass::InsertTransNode(NPUOp *op, NPUOp *post_op, const mindspore::MSTensor &trans_in_tensor,
+                                            std::vector<NPUOp *> *trans_ops) {
+  MS_ASSERT(op != nullptr || post_op != nullptr);
+  std::string op_name;
+  std::vector<NPUOp *> in_ops;
+  std::vector<NPUOp *> out_ops;
+  if (op != nullptr) {
+    op_name = op->name() + "_post";
+    in_ops.emplace_back(op);
+  }
+  if (post_op != nullptr) {
+    op_name = post_op->name() + "_pre";
+    out_ops.emplace_back(post_op);
+  }
+  auto nhwc_shape = trans_in_tensor.Shape();
+  std::vector<int64_t> nchw_shape = {nhwc_shape[NHWC_N], nhwc_shape[NHWC_C], nhwc_shape[NHWC_H], nhwc_shape[NHWC_W]};

+  auto nh2nc_name = op_name + "_nh2nc_" + std::to_string(total++);
+  auto nh2nc_tensor =
+    mindspore::MSTensor::CreateTensor(nh2nc_name + "/output0", trans_in_tensor.DataType(), nchw_shape, nullptr, 0);
+  if (nh2nc_tensor == nullptr) {
+    MS_LOG(ERROR) << "New nchw tensor failed when inserting nchw2nhwc op.";
+    return RET_ERROR;
+  }
+  nh2nc_tensor->SetFormat(Format::NCHW);
+  std::vector<mindspore::MSTensor> nh2nc_tensors = {*nh2nc_tensor};
+  all_tensors_->push_back(nh2nc_tensor);

+  auto nc2nh_name = op_name + "_nc2nh_" + std::to_string(total++);
+  auto nc2nh_tensor =
+    mindspore::MSTensor::CreateTensor(nc2nh_name + "/output0", trans_in_tensor.DataType(), nhwc_shape, nullptr, 0);
+  if (nc2nh_tensor == nullptr) {
+    MS_LOG(ERROR) << "New nhwc tensor failed when inserting nhwc2nchw op.";
+    return RET_ERROR;
+  }
+  nc2nh_tensor->SetFormat(Format::NHWC);
+  std::vector<mindspore::MSTensor> nc2nh_tensors = {*nc2nh_tensor};
+  all_tensors_->push_back(nc2nh_tensor);

+  auto *nh2nc_op = NPUPassUtils::CreateNhwc2NchwOp({trans_in_tensor}, nh2nc_tensors, nh2nc_name);
+  trans_ops->push_back(nh2nc_op);

+  auto *nc2nh_op = NPUPassUtils::CreateNchw2NhwcOp(nh2nc_tensors, nc2nh_tensors, nc2nh_name);
+  trans_ops->push_back(nc2nh_op);

+  NPUPassUtils::UpdateOp(nh2nc_op, in_ops, {nc2nh_op}, {trans_in_tensor}, nh2nc_tensors);
+  NPUPassUtils::UpdateOp(nc2nh_op, {nh2nc_op}, out_ops, {nh2nc_tensors[0]}, nc2nh_tensors);
+  if (op != nullptr) {
+    NPUPassUtils::UpdateNH2NCTransNodePreOp(op, nh2nc_op, post_op);
+  }
+  if (post_op != nullptr) {
+    NPUPassUtils::UpdateNC2NHTransNodePostOp(op, nc2nh_op, post_op);
+  } else {
+    // post_op == nullptr means graph output; keep the graph output tensor name unchanged
+    auto graph_output_name = trans_in_tensor.Name();
+    nc2nh_tensor->SetTensorName(graph_output_name + "_after_" + name_);
+  }
   return RET_OK;
 }

-int NPUInsertTransformPass::InsertForInputTensor(NPUOp *op, size_t in_tensor_index, NPUOp *pre_op,
-                                                 std::vector<NPUOp *> *trans_ops) {
-  // insert transpose nodes before target ops
-  return InsertNode(pre_op, op, in_tensor_index, trans_ops);
-}
-
-int NPUInsertTransformPass::InsertForOutputTensor(NPUOp *op, NPUOp *post_op, size_t post_in_tensor_index,
-                                                  std::vector<NPUOp *> *trans_ops) {
-  // insert transpose nodes after target ops
-  return InsertNode(op, post_op, post_in_tensor_index, trans_ops);
-}
-
 int NPUInsertTransformPass::InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops) {
   int ret = RET_OK;
   auto inputs = NPUPassUtils::GetNonConstInputs(op);
   for (auto tensor : inputs) {
+    if (tensor.Shape().size() < NPU_SHAPE_SIZE) {
+      continue;
+    }
+    // the input tensor can only come from a single op
     auto pre_op = NPUPassUtils::OpInputFromOp(op, tensor);
     if (NPUPassUtils::IsNchw2Nhwc(pre_op)) {
       continue;
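Note: the rewritten nested ternary near the top of this hunk relies on the fact that once (need_pre_insert && need_post_insert) is false, testing need_pre_insert alone is enough. Written out as a plain function it is equivalent to the following sketch (the real enum lives in the pass):

enum InsertState { InsertNone, PreInsert, PostInsert, BothInsert };

InsertState DecideInsertState(bool need_pre_insert, bool need_post_insert) {
  if (need_pre_insert && need_post_insert) return BothInsert;
  if (need_pre_insert) return PreInsert;
  if (need_post_insert) return PostInsert;
  return InsertNone;
}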
@@ -209,7 +192,7 @@ int NPUInsertTransformPass::InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *tran
       return RET_ERROR;
     }
     size_t index = it - op->inputs().begin();
-    ret = InsertForInputTensor(op, index, pre_op, trans_ops);
+    ret = InsertTransNode(pre_op, op, op->inputs().at(index), trans_ops);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op before op " << op->name() << " failed.";
       return ret;
@@ -220,38 +203,58 @@ int NPUInsertTransformPass::InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *tran

 int NPUInsertTransformPass::InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops) {
   int ret = RET_OK;
-  for (const auto post_op : op->out_ops()) {
-    if (NPUPassUtils::IsNhwc2Nchw(post_op)) {
-      continue;
-    }
-    auto post_op_in_tensors = post_op->inputs();
-    // op's out tensor is one of post_op's input tensors
-    auto it = std::find(post_op_in_tensors.begin(), post_op_in_tensors.end(), op->outputs().at(0));
-    if (it == post_op_in_tensors.end()) {
-      return RET_ERROR;
-    }
-    size_t input_index = it - post_op_in_tensors.begin();
-    ret = InsertForOutputTensor(op, post_op, input_index, trans_ops);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op after op " << op->name() << " failed.";
-      return ret;
-    }
-  }
-  if (op->outputs().size() > op->out_ops().size()) {
-    // op out is graph output
-    ret = InsertForOutputTensor(op, nullptr, 0, trans_ops);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op after op " << op->name() << " failed.";
-      return ret;
+  for (size_t idx = 0; idx < op->outputs().size(); idx++) {
+    auto out_tensor = op->outputs().at(idx);
+    if (out_tensor.Shape().size() < NPU_SHAPE_SIZE) {
+      continue;
+    }
+    if (std::find(subgraph_->outputs().begin(), subgraph_->outputs().end(), out_tensor) !=
+        subgraph_->outputs().end()) {
+      // the case that op's out tensor is a graph output
+      ret = InsertTransNode(op, nullptr, op->outputs().at(idx), trans_ops);
+      if (ret != RET_OK) {
+        MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op after op " << op->name() << " failed.";
+        return RET_ERROR;
+      }
+      // Use the origin output as the last trans op's output, to avoid losing the output tensor after transpose
+      // fusion. The input of the cur_op's out_op will be updated in the loop below.
+      auto last_trans = trans_ops->back();
+      auto trans_output = last_trans->outputs();
+      auto cur_outputs = op->outputs();
+      cur_outputs[idx] = last_trans->outputs()[0];
+      trans_output[0] = op->outputs()[idx];
+      last_trans->set_outputs(trans_output);
+      op->set_outputs(cur_outputs);
+    }
+
+    // Besides being graph outputs, the output tensors can also be connected to multiple ops.
+    for (auto post_op : op->out_ops()) {
+      auto post_op_input = post_op->inputs();
+      auto it = std::find(post_op_input.begin(), post_op_input.end(), out_tensor);
+      if (it == post_op_input.end()) {
+        continue;
+      }
+      auto related_idx = it - post_op_input.begin();
+      post_op_input[related_idx] = op->outputs().at(idx);
+      post_op->set_inputs(post_op_input);
+
+      if (NPUPassUtils::IsNhwc2Nchw(post_op)) {
+        continue;
+      }
+      // the case that op's out tensor is one of post_op's input tensors
+      ret = InsertTransNode(op, post_op, op->outputs().at(idx), trans_ops);
+      if (ret != RET_OK) {
+        MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op after op " << op->name() << " failed.";
+        return ret;
+      }
     }
   }
   return ret;
 }

 int NPUInsertTransformPass::Run(NPUGraph *subgraph) {
-  all_ops_ = subgraph->GetOps();
-  all_tensors_ = subgraph->GetInsertTensors();
+  subgraph_ = subgraph;
+  all_ops_ = subgraph_->GetOps();
+  all_tensors_ = subgraph_->GetInsertTensors();
   std::vector<NPUOp *> insert_ops;
   for (int j = 0; j < 2; ++j) {
     for (size_t i = 0; i < all_ops_->size(); i++) {
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,14 +30,14 @@ class NPUInsertTransformPass : public NPUBasePass {
   int GetInsertState(NPUOp *op);
   int InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops);
   int InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops);
-  int InsertNode(NPUOp *op, NPUOp *post_op, size_t post_input_index, std::vector<NPUOp *> *trans_ops);
-  int InsertForInputTensor(NPUOp *op, size_t in_tensor_index, NPUOp *pre_op, std::vector<NPUOp *> *trans_ops);
-  int InsertForOutputTensor(NPUOp *op, NPUOp *post_op, size_t post_in_tensor_index, std::vector<NPUOp *> *trans_ops);
+  int InsertTransNode(NPUOp *op, NPUOp *post_op, const mindspore::MSTensor &trans_in_tensor,
+                      std::vector<NPUOp *> *trans_ops);

  private:
   int total = 0;
-  std::vector<NPUOp *> *all_ops_;
-  std::vector<mindspore::MSTensor *> *all_tensors_;
+  NPUGraph *subgraph_ = nullptr;
+  std::vector<NPUOp *> *all_ops_ = nullptr;
+  std::vector<mindspore::MSTensor *> *all_tensors_ = nullptr;
 };
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_INSERT_TRANSFORM_PASS_H_
@@ -60,16 +60,16 @@ void NPUPassUtils::UpdateOp(NPUOp *op, const std::vector<NPUOp *> &in_ops, const
 void NPUPassUtils::UpdateNH2NCTransNodePreOp(NPUOp *pre_op, NPUOp *trans_op, NPUOp *op) {
   // For op before trans, update the out_ops; the output tensor of op is the input tensor of trans.
   std::vector<NPUOp *> out_ops = pre_op->out_ops();
-  size_t i = 0;
-  for (; i < out_ops.size(); i++) {
-    if (out_ops[i] == op) {
-      out_ops[i] = trans_op;
-      break;
+  if (op == nullptr) {
+    out_ops.emplace_back(trans_op);
+  } else {
+    for (size_t i = 0; i < out_ops.size(); i++) {
+      if (out_ops[i] == op) {
+        out_ops[i] = trans_op;
+        break;
+      }
     }
   }
-  if (i == out_ops.size()) {
-    out_ops.push_back(trans_op);
-  }
   pre_op->set_out_ops(out_ops);
 }
@@ -177,8 +177,8 @@ NPUOp *NPUPassUtils::OpInputFromOp(NPUOp *op, mindspore::MSTensor in_tensor) {
     return nullptr;
   }
   auto in_ops = op->in_ops();
-  auto output_contain = [in_tensor](NPUOp *op) {
-    auto outputs = op->outputs();
+  auto output_contain = [in_tensor](NPUOp *in_op) {
+    auto outputs = in_op->outputs();
     return std::find(outputs.begin(), outputs.end(), in_tensor) != outputs.end();
   };
   auto it = std::find_if(in_ops.begin(), in_ops.end(), output_contain);
@@ -37,10 +37,6 @@ int NPUTransformPass::InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops)
     MS_LOG(ERROR) << "NPU Transform pass does not find in op with 4d output";
     return RET_ERROR;
   }
-  if (op->inputs().front().format() == Format::NCHW) {
-    // input format is already NCHW, no need to insert transpose.
-    return RET_OK;
-  }
   if (is_input_op || nchw_nodes.find((*it)->type()) == nchw_nodes.end()) {
     NPUOp *pre_op = nullptr;
     if (!is_input_op) {
@@ -57,6 +53,7 @@ int NPUTransformPass::InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops)
       MS_LOG(ERROR) << "New nchw tensor failed when inserting pre nhwc2nchw op.";
       return RET_ERROR;
     }
+    tensor->SetFormat(Format::NCHW);
     std::vector<mindspore::MSTensor> pre_trans_outputs = {*tensor};
     all_tensors_->push_back(tensor);
@@ -83,11 +80,10 @@ int NPUTransformPass::InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops)
   return RET_OK;
 }

-int NPUTransformPass::InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops,
-                                      std::vector<mindspore::MSTensor> graph_outputs) {
+int NPUTransformPass::InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops) {
   bool is_output_op = false;
   if (op->out_ops().empty() ||
-      find(graph_outputs.begin(), graph_outputs.end(), op->outputs()[0]) != graph_outputs.end()) {
+      find(subgraph_->outputs().begin(), subgraph_->outputs().end(), op->outputs()[0]) != subgraph_->outputs().end()) {
     is_output_op = true;
   }
   // Get the post op that need insert trans op.
@@ -116,6 +112,7 @@ int NPUTransformPass::InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops
       MS_LOG(ERROR) << "New nchw tensor failed when inserting post nchw2nhwc op.";
       return RET_ERROR;
     }
+    nc2nh_tensor->SetFormat(Format::NCHW);
     all_tensors_->push_back(nc2nh_tensor);

     if (is_output_op) {
@@ -145,6 +142,7 @@ int NPUTransformPass::InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops
       MS_LOG(ERROR) << "New nhwc tensor failed when inserting post nchw2nhwc op.";
       return RET_ERROR;
     }
+    out_tensor->SetFormat(Format::NHWC);
     all_tensors_->push_back(out_tensor);
     nc2nh_outputs.push_back(*out_tensor);
@@ -173,9 +171,9 @@ int NPUTransformPass::InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops
 }

 int NPUTransformPass::Run(NPUGraph *subgraph) {
-  all_ops_ = subgraph->GetOps();
-  all_tensors_ = subgraph->GetInsertTensors();
-  auto graph_outputs = subgraph->outputs();
+  subgraph_ = subgraph;
+  all_ops_ = subgraph_->GetOps();
+  all_tensors_ = subgraph_->GetInsertTensors();
   for (size_t i = 0; i < all_ops_->size();) {
     auto op = (*all_ops_)[i];
     if (nchw_nodes.find(op->type()) == nchw_nodes.end()) {
@@ -204,7 +202,7 @@ int NPUTransformPass::Run(NPUGraph *subgraph) {
     // insert post_ops after op in vector
     // modify loop index add post_ops.size() to the next op in the origin vector
     std::vector<NPUOp *> post_ops;
-    ret = InsertPostNodes(op, &post_ops, graph_outputs);
+    ret = InsertPostNodes(op, &post_ops);
     if (ret != RET_OK) {
       MS_LOG(ERROR) << "Insert nchw2nhwc op after op " << op->name() << " failed.";
       return RET_ERROR;
@@ -32,12 +32,13 @@ class NPUTransformPass : public NPUBasePass {
  private:
   int InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops);

-  int InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops, std::vector<mindspore::MSTensor> graph_outputs);
+  int InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops);

  private:
   int total = 0;
-  std::vector<NPUOp *> *all_ops_;
-  std::vector<mindspore::MSTensor *> *all_tensors_;
+  NPUGraph *subgraph_ = nullptr;
+  std::vector<NPUOp *> *all_ops_ = nullptr;
+  std::vector<mindspore::MSTensor *> *all_tensors_ = nullptr;
 };
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_TRANSFORM_PASS_H_
@@ -493,6 +493,7 @@ STATUS TfliteModelParser::ConvertGraphOutputs(const std::unique_ptr<tflite::SubG
   auto make_tuple_cnode = func_graph->NewCNode(make_tuple_inputs);
   MSLITE_CHECK_PTR(make_tuple_cnode);
   make_tuple_cnode->set_fullname_with_scope("return_tuple");

   auto return_prim_ptr = std::make_shared<ops::Return>();
   if (return_prim_ptr == nullptr) {
     MS_LOG(ERROR) << "new Return failed";