[MSLITE][Develop] add npu delegate
This commit is contained in:
parent
e7152ea01f
commit
16e5f61830
|
@ -28,18 +28,37 @@ namespace mindspore {
|
|||
using KernelIter = std::vector<kernel::Kernel *>::iterator;
|
||||
class DelegateModel {
|
||||
public:
|
||||
/// \brief Constructor of MindSpore Lite DelegateModel.
|
||||
DelegateModel(std::vector<kernel::Kernel *> *kernels,
|
||||
const std::map<kernel::Kernel *, const schema::Primitive *> primitives)
|
||||
: kernels_(kernels), primitives_(primitives) {}
|
||||
|
||||
/// \brief Destructor of MindSpore Lite DelegateModel.
|
||||
~DelegateModel() = default;
|
||||
|
||||
/// \brief Get Primitive of kernel::Kernel.
|
||||
///
|
||||
/// \param[in] a kernel in DelegateModel kernels vector.
|
||||
///
|
||||
/// \return The schema::Primitive of The kernel.
|
||||
const schema::Primitive *GetPrimitive(kernel::Kernel *kernel) const;
|
||||
|
||||
/// \brief Get the begin iterator of the DelegateModel kernels vector.
|
||||
///
|
||||
/// \return The begin iterator of the DelegateModel kernels vector.
|
||||
KernelIter BeginKernelIterator();
|
||||
|
||||
/// \brief Get the end iterator of the DelegateModel kernels vector.
|
||||
///
|
||||
/// \return The end iterator of the DelegateModel kernels vector.
|
||||
KernelIter EndKernelIterator();
|
||||
|
||||
/// \brief Replace the continuous kernel supported by the delegate with a delegate graph kernel.
|
||||
///
|
||||
/// \param[in] from Define the begin iterator of continuous kernel supported by the delegate.
|
||||
/// \param[in] end Define the end iterator of continuous kernel supported by the delegate.
|
||||
///
|
||||
/// \return The next iterator after graph_kernel, point to the next kernel that is not visited.
|
||||
KernelIter Replace(KernelIter from, KernelIter end, kernel::Kernel *graph_kernel);
|
||||
|
||||
protected:
|
||||
|
@ -51,12 +70,22 @@ typedef void (*DelegateHook)(std::shared_ptr<Delegate> delegate);
|
|||
static void HookNullFuc(std::shared_ptr<Delegate> delegate) {}
|
||||
class Delegate {
|
||||
public:
|
||||
/// \brief Constructor of MindSpore Lite Delegate.
|
||||
Delegate() = default;
|
||||
|
||||
/// \brief Destructor of MindSpore Lite Delegate.
|
||||
virtual ~Delegate() = default;
|
||||
|
||||
/// \brief Init delegate.
|
||||
///
|
||||
/// \note Init willed be called in CreateSession.
|
||||
virtual int Init() = 0;
|
||||
|
||||
/// \brief Build delegate graph for MindSpore Lite model.
|
||||
///
|
||||
/// \note Build willed be called in LiteSession::CompileGraph.
|
||||
///
|
||||
/// \param[in] model Define the delegate model to be built.
|
||||
virtual int Build(DelegateModel *model) = 0;
|
||||
|
||||
DelegateHook init_hook_ = HookNullFuc;
|
||||
|
|
|
@ -212,7 +212,7 @@ else()
|
|||
target_link_libraries(mindspore-lite_static cpu_kernel_mid nnacl_mid cpu_ops_mid)
|
||||
endif()
|
||||
if(SUPPORT_NPU)
|
||||
add_subdirectory(runtime/agent/npu)
|
||||
add_subdirectory(delegate/npu)
|
||||
target_link_libraries(mindspore-lite npu_kernel_mid)
|
||||
target_link_libraries(mindspore-lite_static npu_kernel_mid)
|
||||
endif()
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
include_directories(${DDK_PATH})
|
||||
file(GLOB_RECURSE NPU_RUNTIME_SRC
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/*.cc
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../kernel/npu/*.cc
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/optimizer/*.cc
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/op/*.cc
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/pass/*.cc
|
||||
)
|
||||
add_library(hiai SHARED IMPORTED)
|
||||
set_target_properties(hiai PROPERTIES IMPORTED_LOCATION
|
|
@ -14,8 +14,9 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/agent/npu/npu_converter_utils.h"
|
||||
namespace mindspore::lite {
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
#include "src/common/log_adapter.h"
|
||||
namespace mindspore {
|
||||
ge::Shape ConverterToNPUShape(const std::vector<int> &src_shape) {
|
||||
vector<int64_t> shapes;
|
||||
shapes.reserve(src_shape.size());
|
||||
|
@ -25,14 +26,14 @@ ge::Shape ConverterToNPUShape(const std::vector<int> &src_shape) {
|
|||
return ge::Shape({shapes});
|
||||
}
|
||||
|
||||
ge::Format ConverterToNPUFormat(mindspore::Format format) {
|
||||
ge::Format ConverterToNPUFormat(schema::Format format) {
|
||||
ge::Format ge_format;
|
||||
switch (format) {
|
||||
case mindspore::NCHW:
|
||||
case schema::Format_NCHW:
|
||||
ge_format = ge::FORMAT_NCHW;
|
||||
break;
|
||||
case mindspore::NHWC:
|
||||
case mindspore::KHWC:
|
||||
case schema::Format_NHWC:
|
||||
case schema::Format_KHWC:
|
||||
ge_format = ge::FORMAT_NHWC;
|
||||
break;
|
||||
default:
|
||||
|
@ -76,31 +77,31 @@ ge::DataType ConverterToNPUDataType(TypeId type_id) {
|
|||
return data_type;
|
||||
}
|
||||
|
||||
hiai::op::Data *ConverterToNPUData(Tensor *src, const std::string &name) {
|
||||
hiai::op::Data *ConverterToNPUData(tensor::MSTensor *src, const std::string &name) {
|
||||
auto data = new (std::nothrow) hiai::op::Data(name);
|
||||
if (data == nullptr) {
|
||||
MS_LOG(ERROR) << "new data failed.";
|
||||
return data;
|
||||
}
|
||||
ge::TensorDesc tensor_desc(ConverterToNPUShape(src->shape()), ConverterToNPUFormat(src->format()),
|
||||
ge::TensorDesc tensor_desc(ConverterToNPUShape(src->shape()), ge::FORMAT_NCHW,
|
||||
ConverterToNPUDataType(src->data_type()));
|
||||
data->update_input_desc_x(tensor_desc);
|
||||
return data;
|
||||
}
|
||||
|
||||
std::shared_ptr<ge::Tensor> ConverterToNPUTensor(Tensor *src) {
|
||||
std::shared_ptr<ge::Tensor> ConverterToNPUTensor(tensor::MSTensor *src) {
|
||||
std::shared_ptr<ge::Tensor> ge_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
|
||||
if (ge_tensor == nullptr) {
|
||||
MS_LOG(ERROR) << "new ge_tensor failed.";
|
||||
return ge_tensor;
|
||||
}
|
||||
ge::TensorDesc tensor_desc(ConverterToNPUShape(src->shape()), ConverterToNPUFormat(src->format()),
|
||||
ge::TensorDesc tensor_desc(ConverterToNPUShape(src->shape()), ge::FORMAT_NCHW,
|
||||
ConverterToNPUDataType(src->data_type()));
|
||||
|
||||
ge_tensor->SetTensorDesc(tensor_desc);
|
||||
|
||||
if (src->data_c() != nullptr) {
|
||||
ge_tensor->SetData(reinterpret_cast<const uint8_t *>(src->data_c()), src->Size());
|
||||
if (src->data() != nullptr) {
|
||||
ge_tensor->SetData(reinterpret_cast<const uint8_t *>(src->data()), src->Size());
|
||||
}
|
||||
return ge_tensor;
|
||||
}
|
||||
|
@ -123,4 +124,24 @@ int ConverterToNPUEltwiseMode(schema::EltwiseMode mode) {
|
|||
}
|
||||
return mode_num;
|
||||
}
|
||||
} // namespace mindspore::lite
|
||||
|
||||
int TransFormAxis(int axis) {
|
||||
switch (axis) {
|
||||
case 0:
|
||||
return 0;
|
||||
case 1:
|
||||
return 2;
|
||||
case 2:
|
||||
return 3;
|
||||
case 3:
|
||||
case -1:
|
||||
return 1;
|
||||
default:
|
||||
return -2;
|
||||
}
|
||||
}
|
||||
|
||||
bool IsContainMSTensor(const std::vector<tensor::MSTensor *> &tensor_vec, const tensor::MSTensor *tensor) {
|
||||
return find(tensor_vec.begin(), tensor_vec.end(), tensor) != tensor_vec.end();
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -14,23 +14,23 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_CONVERTER_UITLS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_CONVERTER_UITLS_H_
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_CONVERTER_UITLS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_CONVERTER_UITLS_H_
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include "schema/ops_generated.h"
|
||||
#include "include/graph/tensor.h"
|
||||
#include "include/graph/op/array_defs.h"
|
||||
#include "src/tensor.h"
|
||||
#include "include/ms_tensor.h"
|
||||
|
||||
namespace mindspore::lite {
|
||||
namespace mindspore {
|
||||
|
||||
std::shared_ptr<ge::Tensor> ConverterToNPUTensor(Tensor *src);
|
||||
std::shared_ptr<ge::Tensor> ConverterToNPUTensor(tensor::MSTensor *src);
|
||||
|
||||
hiai::op::Data *ConverterToNPUData(Tensor *src, const std::string &name);
|
||||
hiai::op::Data *ConverterToNPUData(tensor::MSTensor *src, const std::string &name);
|
||||
|
||||
ge::Format ConverterToNPUFormat(mindspore::Format format);
|
||||
ge::Format ConverterToNPUFormat(schema::Format format);
|
||||
|
||||
ge::DataType ConverterToNPUDataType(TypeId type_id);
|
||||
|
||||
|
@ -40,5 +40,8 @@ int ConverterToNPUActMode(schema::ActivationType type);
|
|||
|
||||
int ConverterToNPUEltwiseMode(schema::EltwiseMode mode);
|
||||
|
||||
} // namespace mindspore::lite
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_CONVERTER_UITLS_H_
|
||||
int TransFormAxis(int axis);
|
||||
|
||||
bool IsContainMSTensor(const std::vector<tensor::MSTensor *> &tensor_vec, const tensor::MSTensor *tensor);
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_CONVERTER_UITLS_H_
|
|
@ -0,0 +1,303 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/npu_delegate.h"
|
||||
#include <queue>
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
#include "src/delegate/npu/op/activation_npu.h"
|
||||
#include "src/delegate/npu/op/argmax_npu.h"
|
||||
#include "src/delegate/npu/op/arithmetic_npu.h"
|
||||
#include "src/delegate/npu/op/arithmetic_self_npu.h"
|
||||
#include "src/delegate/npu/op/avg_pooling_npu.h"
|
||||
#include "src/delegate/npu/op/batchnorm_npu.h"
|
||||
#include "src/delegate/npu/op/cast_npu.h"
|
||||
#include "src/delegate/npu/op/concat_npu.h"
|
||||
#include "src/delegate/npu/op/convolution_npu.h"
|
||||
#include "src/delegate/npu/op/crop_and_resize_npu.h"
|
||||
#include "src/delegate/npu/op/deconvolution_npu.h"
|
||||
#include "src/delegate/npu/op/eltwise_npu.h"
|
||||
#include "src/delegate/npu/op/expand_dims_npu.h"
|
||||
#include "src/delegate/npu/op/fullconnection_npu.h"
|
||||
#include "src/delegate/npu/op/gather_npu.h"
|
||||
#include "src/delegate/npu/op/instance_norm_npu.h"
|
||||
#include "src/delegate/npu/op/matmul_npu.h"
|
||||
#include "src/delegate/npu/op/max_pooling_npu.h"
|
||||
#include "src/delegate/npu/op/pad_npu.h"
|
||||
#include "src/delegate/npu/op/reduce_npu.h"
|
||||
#include "src/delegate/npu/op/reshape_npu.h"
|
||||
#include "src/delegate/npu/op/resize_npu.h"
|
||||
#include "src/delegate/npu/op/scale_npu.h"
|
||||
#include "src/delegate/npu/op/slice_npu.h"
|
||||
#include "src/delegate/npu/op/softmax_npu.h"
|
||||
#include "src/delegate/npu/op/split_npu.h"
|
||||
#include "src/delegate/npu/op/squeeze_npu.h"
|
||||
#include "src/delegate/npu/op/strided_slice_npu.h"
|
||||
#include "src/delegate/npu/op/tile_npu.h"
|
||||
#include "src/delegate/npu/op/transpose_npu.h"
|
||||
#include "src/delegate/npu/op/unsqueeze_npu.h"
|
||||
#include "src/delegate/npu/npu_graph.h"
|
||||
#include "src/delegate/npu/npu_graph_utils.h"
|
||||
#include "src/delegate/npu/pass/npu_transform_pass.h"
|
||||
#include "src/delegate/npu/pass/npu_insert_transform_pass.h"
|
||||
#include "src/delegate/npu/pass/npu_fusion_pass.h"
|
||||
|
||||
namespace mindspore {
|
||||
NPUDelegate::~NPUDelegate() {
|
||||
if (npu_manager_ != nullptr) {
|
||||
npu_manager_->Reset();
|
||||
delete npu_manager_;
|
||||
npu_manager_ = nullptr;
|
||||
}
|
||||
if (pass_manager_ != nullptr) {
|
||||
pass_manager_->Clear();
|
||||
delete pass_manager_;
|
||||
pass_manager_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int NPUDelegate::Init() {
|
||||
npu_manager_ = new (std::nothrow) NPUManager();
|
||||
if (npu_manager_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New npu manager failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (!npu_manager_->IsSupportNPU()) {
|
||||
MS_LOG(DEBUG) << "Checking that npu is unsupported.";
|
||||
free(npu_manager_);
|
||||
return RET_NOT_SUPPORT;
|
||||
}
|
||||
pass_manager_ = new (std::nothrow) NPUPassManager();
|
||||
if (pass_manager_ == nullptr) {
|
||||
free(npu_manager_);
|
||||
MS_LOG(ERROR) << "New npu pass manager failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto transform_pass = new (std::nothrow) NPUTransformPass();
|
||||
pass_manager_->AddPass(transform_pass);
|
||||
auto insert_transform_pass = new (std::nothrow) NPUInsertTransformPass();
|
||||
pass_manager_->AddPass(insert_transform_pass);
|
||||
auto fusion_pass = new (std::nothrow) NPUFusionPass();
|
||||
pass_manager_->AddPass(fusion_pass);
|
||||
|
||||
op_func_lists_.clear();
|
||||
op_func_lists_ = {
|
||||
{schema::PrimitiveType_Activation, GetNPUOp<ActivationNPUOp>},
|
||||
{schema::PrimitiveType_ArgMaxFusion, GetNPUOp<ArgmaxNPUOp>},
|
||||
{schema::PrimitiveType_MulFusion, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_AddFusion, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_SubFusion, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_DivFusion, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_FloorMod, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_FloorDiv, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_LogicalAnd, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_LogicalOr, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_Maximum, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_Minimum, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_NotEqual, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_Equal, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_Less, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_LessEqual, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_Greater, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_GreaterEqual, GetNPUOp<ArithmeticNPUOp>},
|
||||
{schema::PrimitiveType_Ceil, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Cos, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Floor, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Log, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_LogicalNot, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Neg, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Reciprocal, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Round, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Rsqrt, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Sin, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Sqrt, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_Square, GetNPUOp<ArithmeticSelfNPUOp>},
|
||||
{schema::PrimitiveType_AvgPoolFusion, GetNPUOp<AvgPoolingNPUOp>},
|
||||
{schema::PrimitiveType_MaxPoolFusion, GetNPUOp<MaxPoolingNPUOp>},
|
||||
{schema::PrimitiveType_FusedBatchNorm, GetNPUOp<BatchnormNPUOp>},
|
||||
{schema::PrimitiveType_Cast, GetNPUOp<CastNPUOp>},
|
||||
{schema::PrimitiveType_Concat, GetNPUOp<ConcatNPUOp>},
|
||||
{schema::PrimitiveType_Conv2dTransposeFusion, GetNPUOp<DeconvolutionNPUOp>},
|
||||
{schema::PrimitiveType_CropAndResize, GetNPUOp<CropAndResizeNPUOp>},
|
||||
{schema::PrimitiveType_Eltwise, GetNPUOp<EltwiseNPUOp>},
|
||||
{schema::PrimitiveType_ExpandDims, GetNPUOp<ExpandDimsNPUOp>},
|
||||
{schema::PrimitiveType_FullConnection, GetNPUOp<FullconnectionNPUOp>},
|
||||
{schema::PrimitiveType_Gather, GetNPUOp<GatherNPUOp>},
|
||||
{schema::PrimitiveType_InstanceNorm, GetNPUOp<InstanceNormNPUOp>},
|
||||
{schema::PrimitiveType_MatMul, GetNPUOp<MatMulNPUOp>},
|
||||
{schema::PrimitiveType_PadFusion, GetNPUOp<PadNPUOp>},
|
||||
{schema::PrimitiveType_ReduceFusion, GetNPUOp<ReduceNPUOp>},
|
||||
{schema::PrimitiveType_Reshape, GetNPUOp<ReshapeNPUOp>},
|
||||
{schema::PrimitiveType_Resize, GetNPUOp<ResizeNPUOp>},
|
||||
{schema::PrimitiveType_ScaleFusion, GetNPUOp<ScaleNPUOp>},
|
||||
{schema::PrimitiveType_SliceFusion, GetNPUOp<SliceNPUOp>},
|
||||
{schema::PrimitiveType_Softmax, GetNPUOp<SoftmaxNPUOp>},
|
||||
{schema::PrimitiveType_Split, GetNPUOp<SplitNPUOp>},
|
||||
{schema::PrimitiveType_Squeeze, GetNPUOp<SqueezeNPUOp>},
|
||||
{schema::PrimitiveType_StridedSlice, GetNPUOp<StridedSliceNPUOp>},
|
||||
{schema::PrimitiveType_TileFusion, GetNPUOp<TileNPUOp>},
|
||||
{schema::PrimitiveType_Transpose, GetNPUOp<TransposeNPUOp>},
|
||||
{schema::PrimitiveType_Unsqueeze, GetNPUOp<UnsqueezeNPUOp>},
|
||||
};
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int NPUDelegate::Build(DelegateModel *model) {
|
||||
KernelIter from, end;
|
||||
std::vector<NPUOp *> npu_ops;
|
||||
int graph_index = 0;
|
||||
for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) {
|
||||
kernel::Kernel *kernel = *iter;
|
||||
auto npu_op = GetOP(kernel, model->GetPrimitive(kernel));
|
||||
if (npu_op != nullptr) {
|
||||
// If npu_op does not equal nullptr, this kernel can be supported by delegate
|
||||
if (npu_ops.size() == 0) {
|
||||
from = iter;
|
||||
}
|
||||
npu_ops.push_back(npu_op);
|
||||
end = iter;
|
||||
} else {
|
||||
if (npu_ops.size() > 0) {
|
||||
auto npu_graph_kernel = CreateNPUGraph(npu_ops, model, from, end);
|
||||
if (npu_graph_kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "Create NPU Graph failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
npu_graph_kernel->set_name("NpuGraph" + std::to_string(graph_index++));
|
||||
iter = model->Replace(from, end + 1, npu_graph_kernel);
|
||||
npu_ops.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (npu_ops.size() > 0) {
|
||||
auto npu_graph_kernel = CreateNPUGraph(npu_ops, model, from, end);
|
||||
if (npu_graph_kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "Create NPU Graph failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
npu_graph_kernel->set_name("NpuGraph" + std::to_string(graph_index++));
|
||||
model->Replace(from, end + 1, npu_graph_kernel);
|
||||
npu_ops.clear();
|
||||
}
|
||||
auto ret = npu_manager_->LoadOMModel();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "NPU client load model failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
NPUOp *NPUDelegate::GetOP(kernel::Kernel *kernel, const schema::Primitive *primitive) {
|
||||
auto in_tensors = kernel->inputs();
|
||||
auto out_tensors = kernel->outputs();
|
||||
auto name = kernel->name();
|
||||
NPUOp *npu_op = nullptr;
|
||||
auto node_type = primitive->value_type();
|
||||
if (node_type == schema::PrimitiveType_Conv2DFusion) {
|
||||
npu_op = GetNPUConvOp(primitive, in_tensors, out_tensors, name);
|
||||
} else {
|
||||
if (op_func_lists_.find(node_type) != op_func_lists_.end()) {
|
||||
npu_op = op_func_lists_[node_type](primitive, in_tensors, out_tensors, name);
|
||||
} else {
|
||||
MS_LOG(DEBUG) << "Unsupported op type for NPU.";
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
return npu_op;
|
||||
}
|
||||
|
||||
std::vector<tensor::MSTensor *> GraphInTensors(const std::vector<NPUOp *> &ops, DelegateModel *model, KernelIter from,
|
||||
KernelIter end) {
|
||||
auto in_tensors = NPUGraphUtils::GetGraphInTensors(ops);
|
||||
std::vector<tensor::MSTensor *> all_in_tensors;
|
||||
for (auto op : ops) {
|
||||
for (auto in_tensor : op->inputs()) {
|
||||
if (in_tensor->data() != nullptr && find(in_tensors.begin(), in_tensors.end(), in_tensor) == in_tensors.end()) {
|
||||
all_in_tensors.push_back(in_tensor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) {
|
||||
if (iter >= from && iter <= end) {
|
||||
continue;
|
||||
}
|
||||
// The output of other kernels is the input of the current subgraph kernel.
|
||||
for (auto out_tensor : (*iter)->outputs()) {
|
||||
if (find(all_in_tensors.begin(), all_in_tensors.end(), out_tensor) != all_in_tensors.end()) {
|
||||
in_tensors.push_back(out_tensor);
|
||||
}
|
||||
}
|
||||
}
|
||||
return in_tensors;
|
||||
}
|
||||
|
||||
std::vector<tensor::MSTensor *> GraphOutTensors(const std::vector<NPUOp *> &ops, DelegateModel *model, KernelIter from,
|
||||
KernelIter end) {
|
||||
auto out_tensors = NPUGraphUtils::GetGraphOutTensors(ops);
|
||||
std::vector<tensor::MSTensor *> all_out_tensors;
|
||||
for (auto op : ops) {
|
||||
for (auto out_tensor : op->outputs()) {
|
||||
if (find(out_tensors.begin(), out_tensors.end(), out_tensor) == out_tensors.end()) {
|
||||
all_out_tensors.push_back(out_tensor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) {
|
||||
if (iter >= from && iter <= end) {
|
||||
continue;
|
||||
}
|
||||
// The input of other kernels is the output of the current subgraph kernel.
|
||||
for (auto in_tensor : (*iter)->inputs()) {
|
||||
if (find(all_out_tensors.begin(), all_out_tensors.end(), in_tensor) != all_out_tensors.end()) {
|
||||
out_tensors.push_back(in_tensor);
|
||||
}
|
||||
}
|
||||
}
|
||||
return out_tensors;
|
||||
}
|
||||
|
||||
kernel::Kernel *NPUDelegate::CreateNPUGraph(const std::vector<NPUOp *> &ops, DelegateModel *model, KernelIter from,
|
||||
KernelIter end) {
|
||||
auto in_tensors = GraphInTensors(ops, model, from, end);
|
||||
auto out_tensors = GraphOutTensors(ops, model, from, end);
|
||||
auto graph_kernel = new (std::nothrow) NPUGraph(ops, npu_manager_, in_tensors, out_tensors);
|
||||
if (graph_kernel == nullptr) {
|
||||
MS_LOG(DEBUG) << "New NPU Graph failed.";
|
||||
return nullptr;
|
||||
}
|
||||
// 1. For every op, find pre and next ops
|
||||
auto ret = graph_kernel->FindPreNextOps();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(DEBUG) << "NPU Graph find input and output ops for every op failed.";
|
||||
return nullptr;
|
||||
}
|
||||
// 2. Pass
|
||||
ret = pass_manager_->RunPass(graph_kernel);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(DEBUG) << "NPU Graph run pass failed. This function mainly solves the problem that the format is "
|
||||
"inconsistent and requires interpolation transpose operators.";
|
||||
return nullptr;
|
||||
}
|
||||
// 3. NPUGraph init, create subgraph_kernel and transpose_kernel
|
||||
ret = graph_kernel->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(DEBUG) << "NPU subgraph Init failed.";
|
||||
return nullptr;
|
||||
}
|
||||
return graph_kernel;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,57 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_DELEGATE_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_DELEGATE_H_
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include "include/delegate.h"
|
||||
#include "src/delegate/npu/npu_manager.h"
|
||||
#include "src/delegate/npu/pass/npu_pass_manager.h"
|
||||
#include "src/delegate/npu/op//npu_op.h"
|
||||
#include "include/context.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/common/log_adapter.h"
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace mindspore {
|
||||
class NPUDelegate : public Delegate {
|
||||
public:
|
||||
explicit NPUDelegate(lite::NpuDeviceInfo device_info) : Delegate() { frequency_ = device_info.frequency_; }
|
||||
|
||||
~NPUDelegate() override;
|
||||
|
||||
int Init() override;
|
||||
|
||||
int Build(DelegateModel *model) override;
|
||||
|
||||
protected:
|
||||
NPUOp *GetOP(kernel::Kernel *kernel, const schema::Primitive *primitive);
|
||||
|
||||
kernel::Kernel *CreateNPUGraph(const std::vector<NPUOp *> &ops, DelegateModel *model, KernelIter from,
|
||||
KernelIter end);
|
||||
|
||||
NPUManager *npu_manager_ = nullptr;
|
||||
NPUPassManager *pass_manager_ = nullptr;
|
||||
std::map<schema::PrimitiveType, NPUGetOp> op_func_lists_;
|
||||
int frequency_ = 0;
|
||||
};
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_DELEGATE_H_
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -14,12 +14,13 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/agent/npu/npu_executor.h"
|
||||
#include "src/delegate/npu/npu_executor.h"
|
||||
#include <unordered_map>
|
||||
#include "include/errorcode.h"
|
||||
#include "src/runtime/agent/npu/npu_manager.h"
|
||||
#include "nnacl/pack.h"
|
||||
namespace mindspore::lite {
|
||||
#include "src/delegate/npu/npu_manager.h"
|
||||
#include "src/common/log_adapter.h"
|
||||
|
||||
namespace mindspore {
|
||||
NPUExecutor::~NPUExecutor() {
|
||||
client_.reset();
|
||||
for (auto t : npu_input_tensors_) {
|
||||
|
@ -32,8 +33,7 @@ NPUExecutor::~NPUExecutor() {
|
|||
npu_output_tensors_.clear();
|
||||
}
|
||||
|
||||
int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels, const std::vector<Tensor *> &inputs,
|
||||
const std::vector<Tensor *> &outputs, const lite::InnerContext *ctx) {
|
||||
int NPUExecutor::Prepare() {
|
||||
MS_ASSERT(npu_manager_ != nullptr);
|
||||
this->client_ = npu_manager_->GetClient(model_name_);
|
||||
if (this->client_ == nullptr) {
|
||||
|
@ -41,7 +41,7 @@ int NPUExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels, const
|
|||
return RET_ERROR;
|
||||
}
|
||||
if (GetIOTensorVec() != RET_OK) {
|
||||
MS_LOG(ERROR) << "Load model failed.";
|
||||
MS_LOG(ERROR) << "NPUExecutor GetIOTensorVec failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
|
@ -75,25 +75,7 @@ std::vector<int> ExpandShapeTo4d(const std::vector<int> &shape) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
bool IsSameShapeInTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) {
|
||||
if (tensor->shape().size() > 4) {
|
||||
MS_LOG(ERROR) << "Npu does not support input tensor dims greater than 4";
|
||||
return false;
|
||||
}
|
||||
if (tensor->shape().size() == 4) {
|
||||
return tensor->Batch() == npu_tensor->GetTensorDimension().GetNumber() &&
|
||||
tensor->Channel() == npu_tensor->GetTensorDimension().GetChannel() &&
|
||||
tensor->Height() == npu_tensor->GetTensorDimension().GetHeight() &&
|
||||
tensor->Width() == npu_tensor->GetTensorDimension().GetWidth();
|
||||
}
|
||||
std::vector<int> npu_shape{static_cast<int>(npu_tensor->GetTensorDimension().GetNumber()),
|
||||
static_cast<int>(npu_tensor->GetTensorDimension().GetChannel()),
|
||||
static_cast<int>(npu_tensor->GetTensorDimension().GetHeight()),
|
||||
static_cast<int>(npu_tensor->GetTensorDimension().GetWidth())};
|
||||
return ExpandShapeTo4d(tensor->shape()) == npu_shape;
|
||||
}
|
||||
|
||||
bool IsSameShapeOutTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) {
|
||||
bool IsSameShapeTensor(tensor::MSTensor *tensor, std::shared_ptr<hiai::AiTensor> npu_tensor) {
|
||||
if (tensor->shape().size() > 4) {
|
||||
MS_LOG(ERROR) << "Npu does not support output tensor dims greater than 4";
|
||||
return false;
|
||||
|
@ -101,39 +83,37 @@ bool IsSameShapeOutTensor(Tensor *tensor, std::shared_ptr<hiai::AiTensor> npu_te
|
|||
return GetNpuTensorShape(tensor->shape().size(), npu_tensor) == tensor->shape();
|
||||
}
|
||||
|
||||
int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
|
||||
const std::vector<kernel::LiteKernel *> &in_kernels,
|
||||
const std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
|
||||
const KernelCallBack &before, const KernelCallBack &after) {
|
||||
int NPUExecutor::Run(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, const std::vector<NPUOp *> &in_ops) {
|
||||
hiai::AiContext context;
|
||||
std::unordered_map<lite::Tensor *, int> tensor_uses;
|
||||
for (const auto ker : in_kernels) {
|
||||
for (const auto ker_input : ker->in_tensors()) {
|
||||
if (tensor_uses.find(ker_input) == tensor_uses.end()) {
|
||||
tensor_uses.insert({ker_input, 1});
|
||||
std::unordered_map<tensor::MSTensor *, int> tensor_uses;
|
||||
for (const auto op : in_ops) {
|
||||
for (const auto op_input : op->inputs()) {
|
||||
if (tensor_uses.find(op_input) == tensor_uses.end()) {
|
||||
tensor_uses.insert({op_input, 1});
|
||||
} else {
|
||||
tensor_uses[ker_input]++;
|
||||
tensor_uses[op_input]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < npu_input_tensors_.size(); ++i) {
|
||||
int index = 0;
|
||||
for (; index < in_tensors.size(); index++) {
|
||||
if (tensor_uses[in_tensors[index]] > 0 && IsSameShapeInTensor(in_tensors[index], npu_input_tensors_[i])) {
|
||||
void *data = in_tensors[index]->data_c();
|
||||
if (tensor_uses[in_tensors[index]] > 0 && IsSameShapeTensor(in_tensors[index], npu_input_tensors_[i])) {
|
||||
void *data = in_tensors[index]->data();
|
||||
if (data == nullptr) {
|
||||
MS_LOG(ERROR) << "For " << model_name_ << ", the " << i << "th input data is nullptr";
|
||||
MS_LOG(ERROR) << "For " << model_name_ << ", the input tensor " << in_tensors[index]->tensor_name()
|
||||
<< " data is nullptr";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
memcpy(npu_input_tensors_[i]->GetBuffer(), data, in_tensors[index]->Size());
|
||||
tensor_uses[in_tensors[index]]--;
|
||||
in_tensors[index]->DecRefCount();
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (index == in_tensors.size()) {
|
||||
MS_LOG(ERROR) << "Can't find corresponding ms lite tensor of " << i << " input tensor for npu executor "
|
||||
MS_LOG(ERROR) << "Can't find corresponding ms lite tensor of the " << i << "th input tensor for npu executor "
|
||||
<< model_name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -154,21 +134,21 @@ int NPUExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<
|
|||
for (int i = 0; i < npu_output_tensors_.size(); ++i) {
|
||||
int index = 0;
|
||||
for (; index < out_tensors.size(); index++) {
|
||||
if (!outputs_visited[index] && IsSameShapeOutTensor(out_tensors[index], npu_output_tensors_[i])) {
|
||||
void *data = out_tensors[index]->MutableData();
|
||||
if (!outputs_visited[index] && IsSameShapeTensor(out_tensors[index], npu_output_tensors_[i])) {
|
||||
void *data = out_tensors[index]->data();
|
||||
if (data == nullptr) {
|
||||
MS_LOG(ERROR) << "For " << model_name_ << ", the " << i << "th output data is nullptr";
|
||||
MS_LOG(ERROR) << "For " << model_name_ << ", the output tensor " << in_tensors[index]->tensor_name()
|
||||
<< " data is nullptr";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
memcpy(data, npu_output_tensors_[i]->GetBuffer(), npu_output_tensors_[i]->GetSize());
|
||||
out_tensors[index]->ResetRefCount();
|
||||
outputs_visited[index] = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (index == out_tensors.size()) {
|
||||
MS_LOG(ERROR) << "Can't find corresponding ms lite tensor of " << i << " output tensor for npu executor "
|
||||
MS_LOG(ERROR) << "Can't find corresponding ms lite tensor of the " << i << "th output tensor for npu executor "
|
||||
<< model_name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -246,4 +226,4 @@ int NPUExecutor::UpdateOutputTensorVec(const std::vector<hiai::TensorDimension>
|
|||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::lite
|
||||
} // namespace mindspore
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -14,32 +14,27 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_EXECUTOR_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_EXECUTOR_H_
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_EXECUTOR_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_EXECUTOR_H_
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include "src/executor.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "include/HiAiModelManagerService.h"
|
||||
#ifdef SUPPORT_NPU
|
||||
#include "src/runtime/agent/npu/npu_manager.h"
|
||||
#endif
|
||||
#include "src/delegate/npu/npu_manager.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore::lite {
|
||||
class NPUExecutor : public Executor {
|
||||
namespace mindspore {
|
||||
class NPUExecutor {
|
||||
public:
|
||||
explicit NPUExecutor(const std::string &model_name, NPUManager *npu_manager = nullptr)
|
||||
: model_name_(model_name), npu_manager_(npu_manager) {}
|
||||
~NPUExecutor() override;
|
||||
int Prepare(const std::vector<kernel::LiteKernel *> &kernels, const std::vector<Tensor *> &inputs,
|
||||
const std::vector<Tensor *> &outputs, const lite::InnerContext *ctx) override;
|
||||
~NPUExecutor();
|
||||
int Prepare();
|
||||
|
||||
int Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
|
||||
const std::vector<kernel::LiteKernel *> &in_kernels, const std::vector<kernel::LiteKernel *> &kernels,
|
||||
Allocator *allocator = nullptr, const KernelCallBack &before = nullptr,
|
||||
const KernelCallBack &after = nullptr);
|
||||
int Run(const std::vector<tensor::MSTensor *> &in_tensors, const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<NPUOp *> &in_ops);
|
||||
|
||||
private:
|
||||
int GetIOTensorVec();
|
||||
|
@ -55,5 +50,5 @@ class NPUExecutor : public Executor {
|
|||
std::vector<std::shared_ptr<hiai::AiTensor>> npu_input_tensors_;
|
||||
std::vector<std::shared_ptr<hiai::AiTensor>> npu_output_tensors_;
|
||||
};
|
||||
} // namespace mindspore::lite
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_EXECUTOR_H_
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_EXECUTOR_H_
|
|
@ -0,0 +1,226 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/npu_graph.h"
|
||||
#include <queue>
|
||||
#include "src/delegate/npu/npu_subgraph.h"
|
||||
#include "src/delegate/npu/npu_graph_utils.h"
|
||||
#include "src/delegate/npu/op/transpose_npu.h"
|
||||
#include "src/delegate/npu/transpose_kernel.h"
|
||||
namespace mindspore {
|
||||
NPUGraph::~NPUGraph() {
|
||||
for (auto *kernel : all_kernels_) {
|
||||
delete kernel;
|
||||
}
|
||||
for (auto *op : npu_ops_) {
|
||||
delete op;
|
||||
}
|
||||
for (auto *tensor : insert_tensors_) {
|
||||
delete tensor;
|
||||
}
|
||||
}
|
||||
|
||||
void NPUGraph::set_input(tensor::MSTensor *in_tensor, int index) {
|
||||
MS_ASSERT(index < inputs_.size());
|
||||
auto origin_tensor = this->inputs_[index];
|
||||
for (auto kernel : all_kernels_) {
|
||||
for (size_t i = 0; i < kernel->inputs().size(); i++) {
|
||||
if (kernel->inputs()[i] == origin_tensor) {
|
||||
kernel->set_input(in_tensor, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
this->inputs_[index] = in_tensor;
|
||||
}
|
||||
|
||||
void NPUGraph::set_output(tensor::MSTensor *out_tensor, int index) {
|
||||
MS_ASSERT(index < outputs_.size());
|
||||
auto origin_tensor = this->outputs_[index];
|
||||
for (auto kernel : all_kernels_) {
|
||||
for (size_t i = 0; i < kernel->outputs().size(); i++) {
|
||||
if (kernel->outputs()[i] == origin_tensor) {
|
||||
kernel->set_output(out_tensor, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
this->outputs_[index] = out_tensor;
|
||||
}
|
||||
|
||||
int NPUGraph::Init() {
|
||||
all_kernels_.clear();
|
||||
std::map<const NPUOp *, bool> is_visited;
|
||||
for (auto op : npu_ops_) {
|
||||
is_visited[op] = false;
|
||||
}
|
||||
|
||||
while (npu_ops_.size() > 0) {
|
||||
auto head_op_iter = std::find_if(npu_ops_.begin(), npu_ops_.end(), [&](const NPUOp *op) {
|
||||
if (is_visited[op]) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
if (head_op_iter == npu_ops_.end()) {
|
||||
break;
|
||||
}
|
||||
auto head_op = *head_op_iter;
|
||||
if (head_op->type() != schema::PrimitiveType_Transpose) {
|
||||
// If npu_kernel does not equal nullptr, this kernel can be supported by delegate
|
||||
auto npu_ops = FindSubgraphOps(head_op, &is_visited);
|
||||
auto subgraph_kernel = CreateNPUSubgraphKernel(npu_ops);
|
||||
if (subgraph_kernel == nullptr) {
|
||||
MS_LOG(DEBUG) << "Create NPU subgraph kernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_kernels_.push_back(subgraph_kernel);
|
||||
} else {
|
||||
auto transpose_kernel = CreateNPUTransposeKernel(head_op);
|
||||
if (transpose_kernel == nullptr) {
|
||||
MS_LOG(DEBUG) << "New NPU transpose kernel failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_kernels_.push_back(transpose_kernel);
|
||||
is_visited[head_op] = true;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
std::vector<NPUOp *> NPUGraph::FindPreOps(NPUOp *cur_op) {
|
||||
std::vector<NPUOp *> in_ops;
|
||||
for (auto in_tensor : cur_op->inputs()) {
|
||||
for (auto op : npu_ops_) {
|
||||
if (find(op->outputs().begin(), op->outputs().end(), in_tensor) != op->outputs().end()) {
|
||||
in_ops.push_back(op);
|
||||
}
|
||||
}
|
||||
}
|
||||
return in_ops;
|
||||
}
|
||||
|
||||
std::vector<NPUOp *> NPUGraph::FindNextOps(NPUOp *cur_op) {
|
||||
std::vector<NPUOp *> out_ops;
|
||||
for (auto out_tensor : cur_op->outputs()) {
|
||||
for (auto op : npu_ops_) {
|
||||
if (find(op->inputs().begin(), op->inputs().end(), out_tensor) != op->inputs().end()) {
|
||||
out_ops.push_back(op);
|
||||
}
|
||||
}
|
||||
}
|
||||
return out_ops;
|
||||
}
|
||||
|
||||
int NPUGraph::FindPreNextOps() {
|
||||
for (auto op : npu_ops_) {
|
||||
auto in_ops = FindPreOps(op);
|
||||
op->set_in_ops(in_ops);
|
||||
auto out_ops = FindNextOps(op);
|
||||
op->set_out_ops(out_ops);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
std::vector<NPUOp *> NPUGraph::FindSubgraphOps(NPUOp *head_op, std::map<const NPUOp *, bool> *is_visited) {
|
||||
std::vector<NPUOp *> subgraph_ops;
|
||||
subgraph_ops.push_back(head_op);
|
||||
(*is_visited)[head_op] = true;
|
||||
std::queue<NPUOp *> op_queue;
|
||||
op_queue.emplace(head_op);
|
||||
while (!op_queue.empty()) {
|
||||
auto cur_op = op_queue.front();
|
||||
op_queue.pop();
|
||||
auto out_ops = cur_op->out_ops();
|
||||
for (auto out_op : out_ops) {
|
||||
if ((*is_visited)[out_op] == true) {
|
||||
continue;
|
||||
}
|
||||
auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
|
||||
[&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
|
||||
if (input_ready && out_op->type() != schema::PrimitiveType_Transpose) {
|
||||
subgraph_ops.push_back(out_op);
|
||||
(*is_visited)[out_op] = true;
|
||||
op_queue.push(out_op);
|
||||
}
|
||||
}
|
||||
}
|
||||
return subgraph_ops;
|
||||
}
|
||||
|
||||
kernel::Kernel *NPUGraph::CreateNPUSubgraphKernel(std::vector<NPUOp *> npu_ops) {
|
||||
auto subgraph = new (std::nothrow) NPUSubGraph(npu_ops, npu_manager_);
|
||||
if (subgraph == nullptr) {
|
||||
MS_LOG(ERROR) << "New NPU Subgraph failed.";
|
||||
return nullptr;
|
||||
}
|
||||
subgraph->set_inputs(NPUGraphUtils::GetGraphInTensors(npu_ops));
|
||||
subgraph->set_outputs(NPUGraphUtils::GetGraphOutTensors(npu_ops));
|
||||
auto ret = subgraph->Init();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "NPU Subgraph Init failed.";
|
||||
return nullptr;
|
||||
}
|
||||
return subgraph;
|
||||
}
|
||||
|
||||
kernel::Kernel *NPUGraph::CreateNPUTransposeKernel(NPUOp *op) {
|
||||
if (op->type() != schema::PrimitiveType_Transpose) {
|
||||
MS_LOG(ERROR) << "Check npu transpose op failed.";
|
||||
return nullptr;
|
||||
}
|
||||
auto transpose_op = static_cast<TransposeNPUOp *>(op);
|
||||
auto transpose_kernel = new (std::nothrow)
|
||||
TransposeNPUKernel(transpose_op->inputs(), transpose_op->outputs(), transpose_op->GetPerm(), transpose_op->name());
|
||||
if (transpose_kernel == nullptr) {
|
||||
MS_LOG(ERROR) << "New npu transpose kernel failed.";
|
||||
return nullptr;
|
||||
}
|
||||
return transpose_kernel;
|
||||
}
|
||||
|
||||
int NPUGraph::Prepare() {
|
||||
for (int i = 0; i < all_kernels_.size(); i++) {
|
||||
auto ret = all_kernels_[i]->Prepare();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "NPU Subgraph " << all_kernels_[i]->name() << " prepare failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
for (auto output : all_kernels_[i]->outputs()) {
|
||||
if (find(outputs_.begin(), outputs_.end(), output) == outputs_.end()) {
|
||||
output->MutableData();
|
||||
}
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int NPUGraph::Execute() {
|
||||
for (int i = 0; i < all_kernels_.size(); i++) {
|
||||
// 1. malloc graph output data
|
||||
for (auto output : all_kernels_[i]->outputs()) {
|
||||
if (find(outputs_.begin(), outputs_.end(), output) != outputs_.end()) {
|
||||
output->MutableData();
|
||||
}
|
||||
}
|
||||
// 2. execute
|
||||
auto ret = all_kernels_[i]->Execute();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "NPU Subgraph " << all_kernels_[i]->name() << " execute failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,79 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_GRAPH_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_GRAPH_H_
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include "include/kernel.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
#include "src/delegate/npu/npu_executor.h"
|
||||
|
||||
namespace mindspore {
|
||||
class NPUGraph : public kernel::Kernel {
|
||||
public:
|
||||
NPUGraph(std::vector<NPUOp *> npu_ops, NPUManager *npu_manager, const std::vector<tensor::MSTensor *> &inputs,
|
||||
const std::vector<tensor::MSTensor *> &outputs)
|
||||
: kernel::Kernel(inputs, outputs, nullptr, nullptr), npu_ops_(std::move(npu_ops)), npu_manager_(npu_manager) {}
|
||||
|
||||
~NPUGraph() override;
|
||||
|
||||
int Init();
|
||||
|
||||
int Prepare() override;
|
||||
|
||||
int Execute() override;
|
||||
|
||||
int ReSize() override {
|
||||
MS_LOG(ERROR) << "NPU does not support the resize function temporarily.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
|
||||
void set_input(tensor::MSTensor *in_tensor, int index) override;
|
||||
|
||||
void set_output(tensor::MSTensor *out_tensor, int index) override;
|
||||
|
||||
int FindPreNextOps();
|
||||
|
||||
std::vector<NPUOp *> *GetOps() { return &npu_ops_; }
|
||||
|
||||
std::vector<tensor::MSTensor *> *GetInsertTensors() { return &insert_tensors_; }
|
||||
|
||||
protected:
|
||||
std::vector<NPUOp *> FindPreOps(NPUOp *cur_op);
|
||||
|
||||
std::vector<NPUOp *> FindNextOps(NPUOp *cur_op);
|
||||
|
||||
std::vector<NPUOp *> FindSubgraphOps(NPUOp *head_op, std::map<const NPUOp *, bool> *is_visited);
|
||||
|
||||
kernel::Kernel *CreateNPUSubgraphKernel(std::vector<NPUOp *> ops);
|
||||
|
||||
kernel::Kernel *CreateNPUTransposeKernel(NPUOp *op);
|
||||
|
||||
std::vector<NPUOp *> npu_ops_{};
|
||||
|
||||
std::vector<kernel::Kernel *> all_kernels_{};
|
||||
|
||||
std::vector<tensor::MSTensor *> insert_tensors_;
|
||||
|
||||
NPUManager *npu_manager_ = nullptr;
|
||||
};
|
||||
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_GRAPH_H_
|
|
@ -0,0 +1,77 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/npu_graph_utils.h"
|
||||
namespace mindspore {
|
||||
std::vector<mindspore::tensor::MSTensor *> NPUGraphUtils::GetGraphInTensors(std::vector<NPUOp *> ops) {
|
||||
std::vector<mindspore::tensor::MSTensor *> inputs;
|
||||
auto is_op_output = [&](tensor::MSTensor *tensor) -> bool {
|
||||
for (auto op : ops) {
|
||||
auto out_tensors = op->outputs();
|
||||
if (find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
for (auto op : ops) {
|
||||
for (auto in_tensor : op->inputs()) {
|
||||
if (in_tensor->data() == nullptr && !is_op_output(in_tensor)) {
|
||||
inputs.push_back(in_tensor);
|
||||
}
|
||||
}
|
||||
}
|
||||
return inputs;
|
||||
}
|
||||
|
||||
std::vector<mindspore::tensor::MSTensor *> NPUGraphUtils::GetGraphOutTensors(std::vector<NPUOp *> ops) {
|
||||
std::vector<mindspore::tensor::MSTensor *> outputs;
|
||||
auto is_op_input = [&](const tensor::MSTensor *tensor) -> bool {
|
||||
for (auto op : ops) {
|
||||
auto in_tensors = op->inputs();
|
||||
if (find(in_tensors.begin(), in_tensors.end(), tensor) != in_tensors.end()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
for (auto op : ops) {
|
||||
for (auto out_tensor : op->outputs()) {
|
||||
if (!is_op_input(out_tensor)) {
|
||||
outputs.push_back(out_tensor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto op : ops) {
|
||||
for (auto out_op : op->out_ops()) {
|
||||
if (find(ops.begin(), ops.end(), out_op) == ops.end()) {
|
||||
// visit the out op that is not in the subgraph
|
||||
for (auto tensor : op->outputs()) {
|
||||
if (find(out_op->inputs().begin(), out_op->inputs().end(), tensor) != out_op->inputs().end()) {
|
||||
// find the connected tensor
|
||||
outputs.push_back(tensor);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return outputs;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,32 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_GRAPH_UTILS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_GRAPH_UTILS_H_
|
||||
|
||||
#include <vector>
|
||||
#include "include/ms_tensor.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
namespace mindspore {
|
||||
class NPUGraphUtils {
|
||||
public:
|
||||
static std::vector<mindspore::tensor::MSTensor *> GetGraphInTensors(std::vector<NPUOp *> ops);
|
||||
|
||||
static std::vector<mindspore::tensor::MSTensor *> GetGraphOutTensors(std::vector<NPUOp *> ops);
|
||||
};
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_GRAPH_UTILS_H_
|
|
@ -14,17 +14,16 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/agent/npu/npu_manager.h"
|
||||
#include "src/delegate/npu/npu_manager.h"
|
||||
#include <sys/system_properties.h>
|
||||
#include <sys/fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include "include/hiai_ir_build.h"
|
||||
#include "include/HiAiModelManagerService.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/common/file_utils.h"
|
||||
|
||||
namespace mindspore::lite {
|
||||
const int max_model_num = 20;
|
||||
namespace mindspore {
|
||||
#define MAX_MODEL_NUM 20
|
||||
int NPUManager::CompareVersion(const string &version1, const string &version2) {
|
||||
std::istringstream iss1(version1);
|
||||
std::istringstream iss2(version2);
|
||||
|
@ -62,8 +61,7 @@ void NPUManager::Reset() {
|
|||
client.reset();
|
||||
}
|
||||
clients_.clear();
|
||||
|
||||
index_ = 0;
|
||||
subgraph_index_ = 0;
|
||||
domi::HiaiIrBuild ir_build;
|
||||
for (const auto &model_map : models_) {
|
||||
auto model = model_map.second;
|
||||
|
@ -90,6 +88,7 @@ bool NPUManager::CheckDDKVersion() {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool NPUManager::IsSupportNPU() {
|
||||
// Avoid multiple checks
|
||||
if (!is_check_version_) {
|
||||
|
@ -143,11 +142,11 @@ bool NPUManager::IsKirinChip() {
|
|||
|
||||
int NPUManager::AddModel(std::shared_ptr<domi::ModelBufferData> model_buffer_data, const std::string &model_name,
|
||||
int frequency) {
|
||||
auto model = std::make_shared<SubGraphModel>(index_, model_name, model_buffer_data);
|
||||
auto model = std::make_shared<SubGraphModel>(subgraph_index_, model_name, model_buffer_data);
|
||||
auto desc = std::make_shared<hiai::AiModelDescription>(model_name, frequency, 0, 0, 0);
|
||||
model->desc_ = desc;
|
||||
models_.insert({model_name, model});
|
||||
index_++;
|
||||
subgraph_index_++;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
|
@ -172,7 +171,7 @@ int NPUManager::LoadOMModel() {
|
|||
std::unordered_map<std::shared_ptr<hiai::AiModelBuilder>, hiai::MemBuffer *> builder_buffer_map;
|
||||
int total = 0;
|
||||
for (const auto &model_map : models_) {
|
||||
if (total % max_model_num == 0) {
|
||||
if (total % MAX_MODEL_NUM == 0) {
|
||||
client = CreateAiModelMngerClient();
|
||||
if (client == nullptr) {
|
||||
MS_LOG(ERROR) << "Create Client failed.";
|
||||
|
@ -198,7 +197,7 @@ int NPUManager::LoadOMModel() {
|
|||
}
|
||||
builder_buffer_map.insert({mc_builder, buffer});
|
||||
model->desc_->SetModelBuffer(buffer->GetMemBufferData(), buffer->GetMemBufferSize());
|
||||
if (models_desc.size() == max_model_num) {
|
||||
if (models_desc.size() == MAX_MODEL_NUM) {
|
||||
auto ret = LoadModel(client, models_desc);
|
||||
if (ret != RET_ERROR) {
|
||||
MS_LOG(ERROR) << "Client load model failed.";
|
||||
|
@ -231,8 +230,6 @@ std::shared_ptr<hiai::AiModelMngerClient> NPUManager::GetClient(const std::strin
|
|||
return models_[model_name]->client_;
|
||||
}
|
||||
|
||||
int NPUManager::index() const { return index_; }
|
||||
|
||||
int NPUManager::LoadModel(const std::shared_ptr<hiai::AiModelMngerClient> &client,
|
||||
std::vector<std::shared_ptr<hiai::AiModelDescription>> desc_list) {
|
||||
auto ret = client->Load(desc_list);
|
||||
|
@ -250,4 +247,4 @@ int NPUManager::LoadModel(const std::shared_ptr<hiai::AiModelMngerClient> &clien
|
|||
this->clients_.push_back(client);
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::lite
|
||||
} // namespace mindspore
|
|
@ -14,8 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_MANAGER_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_MANAGER_H_
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
@ -24,9 +24,13 @@
|
|||
#include <set>
|
||||
#include "include/hiai_ir_build.h"
|
||||
#include "schema/model_generated.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "include/HiAiModelManagerService.h"
|
||||
|
||||
namespace mindspore::lite {
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace mindspore {
|
||||
|
||||
struct SubGraphModel {
|
||||
public:
|
||||
|
@ -41,6 +45,7 @@ struct SubGraphModel {
|
|||
std::shared_ptr<hiai::AiModelMngerClient> client_ = nullptr;
|
||||
std::shared_ptr<hiai::AiModelDescription> desc_ = nullptr;
|
||||
};
|
||||
|
||||
class NPUManager {
|
||||
public:
|
||||
NPUManager() = default;
|
||||
|
@ -58,7 +63,7 @@ class NPUManager {
|
|||
// provide to executor.
|
||||
std::shared_ptr<hiai::AiModelMngerClient> GetClient(const std::string &model_name);
|
||||
|
||||
int index() const;
|
||||
int SubGraphIndex() const { return subgraph_index_; }
|
||||
|
||||
void Reset();
|
||||
|
||||
|
@ -77,12 +82,12 @@ class NPUManager {
|
|||
std::shared_ptr<hiai::AiModelMngerClient> CreateAiModelMngerClient();
|
||||
|
||||
private:
|
||||
int index_ = 0;
|
||||
int subgraph_index_ = 0;
|
||||
bool is_check_version_ = false;
|
||||
bool is_support_ = false;
|
||||
std::unordered_map<std::string, std::shared_ptr<SubGraphModel>> models_;
|
||||
std::vector<std::shared_ptr<hiai::AiModelMngerClient>> clients_;
|
||||
};
|
||||
|
||||
} // namespace mindspore::lite
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_NPU_MANAGER_H_
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_MANAGER_H_
|
|
@ -0,0 +1,298 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/npu_subgraph.h"
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include "include/errorcode.h"
|
||||
#include "include/graph/operator.h"
|
||||
#include "include/graph/graph.h"
|
||||
#include "include/graph/op/const_defs.h"
|
||||
#include "include/graph/model.h"
|
||||
#include "include/hiai_ir_build.h"
|
||||
#include "include/version.h"
|
||||
#include "src/common/utils.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
#include "src/delegate/npu/npu_graph_utils.h"
|
||||
namespace mindspore {
|
||||
static std::set<mindspore::schema::PrimitiveType> npu_specific_weight_nodes = {
|
||||
schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_Conv2dTransposeFusion,
|
||||
schema::PrimitiveType_ScaleFusion, schema::PrimitiveType_BatchNorm,
|
||||
schema::PrimitiveType_FullConnection, schema::PrimitiveType_InstanceNorm,
|
||||
schema::PrimitiveType_TileFusion, schema::PrimitiveType_PadFusion};
|
||||
|
||||
NPUSubGraph::~NPUSubGraph() {
|
||||
subgraph_input_ops_.clear();
|
||||
subgraph_output_ops_.clear();
|
||||
out_tensor_sorted_.clear();
|
||||
for (auto op : op_buffer_) {
|
||||
delete op;
|
||||
}
|
||||
if (executor_ != nullptr) {
|
||||
delete executor_;
|
||||
}
|
||||
op_buffer_.clear();
|
||||
}
|
||||
|
||||
void NPUSubGraph::set_input(tensor::MSTensor *in_tensor, int index) {
|
||||
MS_ASSERT(index < inputs_.size());
|
||||
auto origin_tensor = inputs_[index];
|
||||
// only in_ops_ input tensors list used in execute function
|
||||
for (auto op : in_ops_) {
|
||||
for (size_t i = 0; i < op->inputs().size(); i++) {
|
||||
if (op->inputs()[i] == origin_tensor) {
|
||||
op->set_input(in_tensor, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
this->inputs_[index] = in_tensor;
|
||||
}
|
||||
|
||||
void NPUSubGraph::set_output(tensor::MSTensor *out_tensor, int index) {
|
||||
MS_ASSERT(index < out_tensor_sorted_.size());
|
||||
auto origin_tensor = outputs_[index];
|
||||
for (size_t i = 0; i < out_tensor_sorted_.size(); i++) {
|
||||
if (out_tensor_sorted_[i] == origin_tensor) {
|
||||
out_tensor_sorted_[i] = out_tensor;
|
||||
}
|
||||
}
|
||||
outputs_[index] = out_tensor;
|
||||
}
|
||||
|
||||
int NPUSubGraph::GetGraphInOutOps() {
|
||||
for (auto in_tensor : this->inputs()) {
|
||||
for (auto op : npu_ops_) {
|
||||
if (find(op->inputs().begin(), op->inputs().end(), in_tensor) != op->inputs().end() &&
|
||||
find(in_ops_.begin(), in_ops_.end(), op) == in_ops_.end()) {
|
||||
in_ops_.push_back(op);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (in_ops_.empty()) {
|
||||
MS_LOG(ERROR) << "Can't find the input ops for npu sub graph.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
for (auto out_tensor : this->outputs()) {
|
||||
for (auto op : npu_ops_) {
|
||||
if (find(op->outputs().begin(), op->outputs().end(), out_tensor) != op->outputs().end() &&
|
||||
find(out_ops_.begin(), out_ops_.end(), op) == out_ops_.end()) {
|
||||
out_ops_.push_back(op);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (out_ops_.empty()) {
|
||||
MS_LOG(ERROR) << "Can't find the output ops for npu sub graph.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
std::vector<NPUOp *> NPUSubGraph::FindPreOps(NPUOp *cur_op) {
|
||||
std::vector<NPUOp *> in_ops;
|
||||
for (auto in_tensor : cur_op->inputs()) {
|
||||
for (auto op : npu_ops_) {
|
||||
if (find(op->outputs().begin(), op->outputs().end(), in_tensor) != op->outputs().end()) {
|
||||
in_ops.push_back(op);
|
||||
}
|
||||
}
|
||||
}
|
||||
return in_ops;
|
||||
}
|
||||
|
||||
std::shared_ptr<domi::ModelBufferData> NPUSubGraph::BuildIRModel() {
|
||||
ge::Graph graph("NPUGraph");
|
||||
|
||||
auto ret = BuildNPUInputOp();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Build NPU input operator failed.";
|
||||
return nullptr;
|
||||
}
|
||||
ret = BuildNPUOutputOp();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Build NPU output operator failed.";
|
||||
return nullptr;
|
||||
}
|
||||
graph.SetInputs(subgraph_input_ops_).SetOutputs(subgraph_output_ops_);
|
||||
ge::Model model(GetOMModelName(), mindspore::lite::Version());
|
||||
model.SetGraph(graph);
|
||||
domi::HiaiIrBuild ir_build;
|
||||
auto om_model_buff = std::make_shared<domi::ModelBufferData>();
|
||||
if (om_model_buff == nullptr) {
|
||||
MS_LOG(ERROR) << "OM model buffer is nullptr.";
|
||||
return nullptr;
|
||||
}
|
||||
if (!ir_build.CreateModelBuff(model, *om_model_buff)) {
|
||||
MS_LOG(ERROR) << "Create model buffer failed.";
|
||||
return nullptr;
|
||||
}
|
||||
if (!ir_build.BuildIRModel(model, *om_model_buff)) {
|
||||
MS_LOG(ERROR) << "Build IR model failed.";
|
||||
ir_build.ReleaseModelBuff(*om_model_buff);
|
||||
return nullptr;
|
||||
}
|
||||
return om_model_buff;
|
||||
}
|
||||
|
||||
int NPUSubGraph::Execute() { return executor_->Run(inputs(), out_tensor_sorted_, in_ops_); }
|
||||
|
||||
int NPUSubGraph::BuildNPUInputOp() {
|
||||
int count = 0;
|
||||
subgraph_input_ops_.clear();
|
||||
op_buffer_.clear();
|
||||
for (auto op : this->npu_ops_) {
|
||||
std::vector<ge::Operator *> input_ops;
|
||||
std::unordered_map<int, std::pair<ge::Operator *, int>> index2_multi_out_index;
|
||||
for (int i = 0; i < op->inputs().size(); ++i) {
|
||||
auto in_tensor = op->inputs()[i];
|
||||
if (IsSubGraphInputTensor(in_tensor)) {
|
||||
auto tensor_name = op->name() + "_" + std::to_string(count++);
|
||||
hiai::op::Data *data;
|
||||
data = ConverterToNPUData(in_tensor, tensor_name);
|
||||
subgraph_input_ops_.push_back(*data);
|
||||
input_ops.push_back(data);
|
||||
op_buffer_.push_back(data);
|
||||
continue;
|
||||
}
|
||||
|
||||
bool is_weight_tensor = true;
|
||||
// todo cpu in_ops can't be found
|
||||
auto pre_ops = FindPreOps(op);
|
||||
for (auto pre_op : pre_ops) {
|
||||
if (find(pre_op->outputs().begin(), pre_op->outputs().end(), in_tensor) != pre_op->outputs().end()) {
|
||||
// input come from npu
|
||||
auto npu_op = reinterpret_cast<NPUOp *>(pre_op)->GetNPUOp();
|
||||
if (npu_op == nullptr) {
|
||||
MS_LOG(ERROR) << pre_op->name() << "'s npu operator is nullptr.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
input_ops.push_back(npu_op);
|
||||
if (pre_op->outputs().size() != 1) { // in_op has multi output, we record which output we want.
|
||||
int out_index =
|
||||
std::find(pre_op->outputs().begin(), pre_op->outputs().end(), in_tensor) - pre_op->outputs().begin();
|
||||
index2_multi_out_index[i] = {npu_op, out_index};
|
||||
}
|
||||
is_weight_tensor = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// weight tensor
|
||||
if (is_weight_tensor) {
|
||||
if (npu_specific_weight_nodes.find(op->type()) == npu_specific_weight_nodes.end()) {
|
||||
auto name = op->name() + "_" + std::to_string(count++);
|
||||
auto weight_const = new (std::nothrow) hiai::op::Const(op->name() + "_" + std::to_string(count++));
|
||||
if (weight_const == nullptr) {
|
||||
MS_LOG(ERROR) << "New weight const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto weight_tensor = ConverterToNPUTensor(in_tensor);
|
||||
weight_const->set_attr_value(weight_tensor);
|
||||
input_ops.push_back(weight_const);
|
||||
op_buffer_.push_back(weight_const);
|
||||
}
|
||||
}
|
||||
}
|
||||
// set input to NPU
|
||||
int ret =
|
||||
reinterpret_cast<NPUOp *>(op)->SetNPUInputs(op->inputs(), op->outputs(), input_ops, index2_multi_out_index);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << op->name() << " set npu inputs failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
bool NPUSubGraph::IsSubGraphInputTensor(tensor::MSTensor *input) {
|
||||
if (find(this->inputs().begin(), this->inputs().end(), input) != this->inputs().end()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int NPUSubGraph::GetNPUOperators(const vector<NPUOp *> &ops) {
|
||||
subgraph_output_ops_.reserve(ops.size());
|
||||
for (int i = 0; i < ops.size(); i++) {
|
||||
auto npu_op = reinterpret_cast<NPUOp *>(ops[i])->GetNPUOp();
|
||||
if (npu_op == nullptr) {
|
||||
MS_LOG(ERROR) << "Get NPU operator for " << ops[i]->name() << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
subgraph_output_ops_.push_back(*npu_op);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int NPUSubGraph::BuildNPUOutputOp() {
|
||||
subgraph_output_ops_.clear();
|
||||
auto ret = GetNPUOperators(out_ops_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Get NPU operators failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
out_tensor_sorted_.resize(outputs().size());
|
||||
int i = 0;
|
||||
for (auto node : out_ops_) {
|
||||
for (auto tensor : node->outputs()) {
|
||||
if (std::find(outputs().begin(), outputs().end(), tensor) != outputs().end())
|
||||
this->out_tensor_sorted_[i++] = tensor;
|
||||
}
|
||||
}
|
||||
if (subgraph_output_ops_.empty()) {
|
||||
MS_LOG(ERROR) << "NPU subgraph output op is empty.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
std::string NPUSubGraph::GetOMModelName() { return this->name_ + ".om"; }
|
||||
|
||||
int NPUSubGraph::Init() {
|
||||
auto ret = GetGraphInOutOps();
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Get NPU subgraph input and output ops failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
name_ = "kNpuSubGraph" + std::to_string(npu_manager_->SubGraphIndex());
|
||||
auto model_buffer_data = BuildIRModel();
|
||||
if (model_buffer_data == nullptr) {
|
||||
MS_LOG(ERROR) << "Build IR model failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
MS_ASSERT(npu_manager_ != nullptr);
|
||||
// todo y00520784, get frequency
|
||||
npu_manager_->AddModel(model_buffer_data, GetOMModelName(), 3);
|
||||
|
||||
executor_ = new (std::nothrow) NPUExecutor(GetOMModelName(), npu_manager_);
|
||||
|
||||
if (executor_ == nullptr) {
|
||||
MS_LOG(ERROR) << "Create NPUExecutor failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int NPUSubGraph::Prepare() {
|
||||
if (executor_->Prepare() != RET_OK) {
|
||||
MS_LOG(ERROR) << "NPU executor prepare failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,89 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_SUBGRAPH_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_SUBGRAPH_H_
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/kernel.h"
|
||||
#include "src/delegate/npu/npu_executor.h"
|
||||
|
||||
namespace mindspore {
|
||||
// Delegate kernel that wraps a run of consecutive NPU-supported ops and
// executes them as a single HiAI offline (OM) model on the NPU device.
class NPUSubGraph : public kernel::Kernel {
 public:
  // Takes a non-owning pointer to the manager; npu_ops_ are the ops fused
  // into this subgraph.
  NPUSubGraph(const std::vector<NPUOp *> &npu_ops, NPUManager *npu_manager)
      : npu_ops_(npu_ops), npu_manager_(npu_manager) {}

  ~NPUSubGraph() override;

  // Builds the HiAI IR model from npu_ops_ and registers it with npu_manager_.
  int Init();

  int Prepare() override;

  int Execute() override;

  // Resize is not supported by the NPU backend; always fails.
  int ReSize() override {
    MS_LOG(ERROR) << "NPU does not support the resize function temporarily.";
    return lite::RET_ERROR;
  }

  void set_input(tensor::MSTensor *in_tensor, int index) override;

  void set_output(tensor::MSTensor *out_tensor, int index) override;

  // Determines in_ops_ / out_ops_ (the entry and exit ops of the subgraph).
  int GetGraphInOutOps();

  // Returns the ops inside this subgraph that produce inputs of cur_op.
  std::vector<NPUOp *> FindPreOps(NPUOp *cur_op);

 private:
  std::shared_ptr<domi::ModelBufferData> BuildIRModel();

  int BuildNPUInputOp();

  int BuildNPUOutputOp();

  // Collects the HiAI operators of `ops` into subgraph_output_ops_.
  int GetNPUOperators(const std::vector<NPUOp *> &ops);

  // True if `input` is one of this subgraph's input tensors.
  bool IsSubGraphInputTensor(tensor::MSTensor *input);

  // File name of the compiled offline model.
  std::string GetOMModelName();

  bool is_compiled_ = false;

  std::vector<ge::Operator> subgraph_input_ops_;

  std::vector<ge::Operator> subgraph_output_ops_;

  // Output tensors ordered to match the exit ops' emission order.
  std::vector<tensor::MSTensor *> out_tensor_sorted_;

  // Auxiliary operators (e.g. weight Consts) owned by this subgraph.
  std::vector<ge::Operator *> op_buffer_;

  std::vector<NPUOp *> npu_ops_{};
  // entry nodes in nodes
  std::vector<NPUOp *> in_ops_{};
  // exit nodes in nodes
  std::vector<NPUOp *> out_ops_{};

  NPUExecutor *executor_ = nullptr;

  NPUManager *npu_manager_ = nullptr;
};
|
||||
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_NPU_SUBGRAPH_H_
|
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/activation_npu.h"
|
||||
namespace mindspore {
|
||||
int ActivationNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                               const std::vector<tensor::MSTensor *> &out_tensors) {
  auto act_prim = primitive->value_as_Activation();
  if (act_prim == nullptr) {
    // Fixed log punctuation (was "for op ." followed by the name).
    MS_LOG(ERROR) << "Get null primitive value for op " << name_ << ".";
    return RET_ERROR;
  }
  act_type_ = act_prim->activation_type();
  // Only these activation kinds map onto HiAI's Activation operator modes.
  if (act_type_ != schema::ActivationType_RELU && act_type_ != schema::ActivationType_RELU6 &&
      act_type_ != schema::ActivationType_SIGMOID && act_type_ != schema::ActivationType_TANH &&
      act_type_ != schema::ActivationType_HSIGMOID && act_type_ != schema::ActivationType_LEAKY_RELU) {
    // Fixed missing space before "when" in the original message.
    MS_LOG(WARNING) << "Unsupported activation type for activation op " << name_ << " when running npu";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
int ActivationNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                          const std::vector<tensor::MSTensor *> &out_tensors) {
  // attr_mode codes of hiai::op::Activation — named instead of magic numbers.
  // NOTE(review): values taken verbatim from the original switch; confirm
  // against the HiAI DDK operator reference.
  constexpr int kModeSigmoid = 0;
  constexpr int kModeRelu = 1;
  constexpr int kModeTanh = 2;
  constexpr int kModeLeakyRelu = 5;
  constexpr int kModeHardSigmoid = 10;
  constexpr int kModeRelu6 = 14;
  act_ = new (std::nothrow) hiai::op::Activation(name_);
  if (act_ == nullptr) {
    MS_LOG(ERROR) << "New activation npu operator for activation op " << name_ << " failed.";
    return RET_ERROR;
  }
  auto act_prim = primitive->value_as_Activation();
  if (act_prim == nullptr) {
    // Fixed log punctuation (was "for op ." followed by the name).
    MS_LOG(ERROR) << "Get null primitive value for op " << name_ << ".";
    return RET_ERROR;
  }
  switch (act_type_) {
    case schema::ActivationType_SIGMOID:
      act_->set_attr_mode(kModeSigmoid);
      break;
    case schema::ActivationType_RELU:
      act_->set_attr_mode(kModeRelu);
      break;
    case schema::ActivationType_TANH:
      act_->set_attr_mode(kModeTanh);
      break;
    case schema::ActivationType_LEAKY_RELU:
      act_->set_attr_mode(kModeLeakyRelu);
      // Leaky ReLU additionally carries the negative-slope coefficient.
      act_->set_attr_negative_slope(act_prim->alpha());
      break;
    case schema::ActivationType_HSIGMOID:
      act_->set_attr_mode(kModeHardSigmoid);
      break;
    case schema::ActivationType_RELU6:
      act_->set_attr_mode(kModeRelu6);
      break;
    default:
      // Fixed missing space before "when" in the original message.
      MS_LOG(ERROR) << "Unsupported activation type for activation op " << name_ << " when running npu";
      return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
int ActivationNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                                  const std::vector<tensor::MSTensor *> &out_tensors,
                                  const std::vector<ge::Operator *> &npu_inputs) {
  // Activation consumes exactly one operand: wire the first NPU input.
  auto *input_op = npu_inputs[0];
  act_->set_input_x(*input_op);
  return RET_OK;
}
|
||||
|
||||
// The activation operator itself is the node exposed to the HiAI graph.
ge::Operator *ActivationNPUOp::GetNPUOp() { return this->act_; }
|
||||
|
||||
ActivationNPUOp::~ActivationNPUOp() {
  // Release the owned HiAI operator; `delete nullptr` is a harmless no-op.
  delete act_;
  act_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ACTIVATION_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ACTIVATION_NPU_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "include/graph/compatible/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
namespace mindspore {
|
||||
// NPU delegate op that maps a MindSpore Lite Activation primitive onto the
// HiAI hiai::op::Activation operator.
class ActivationNPUOp : public NPUOp {
 public:
  ActivationNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                  const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~ActivationNPUOp() override;

  // Checks whether the primitive's activation type has a HiAI mode mapping;
  // also records it in act_type_ for Init().
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  // Creates the HiAI Activation operator and sets its mode attributes.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  // Wires the single NPU input operator into the activation.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  ge::Operator *GetNPUOp() override;

 private:
  // Recorded by IsSupport(), consumed by Init().
  schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
  // Owned HiAI operator; released in the destructor.
  hiai::op::Activation *act_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ACTIVATION_NPU_H_
|
|
@ -14,63 +14,63 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/npu/argmax_npu.h"
|
||||
#include "src/delegate/npu/op/argmax_npu.h"
|
||||
#include <memory>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/agent/npu/npu_converter_utils.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kNPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::schema::PrimitiveType_ArgMaxFusion;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
int ArgmaxNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
|
||||
OpParameter *opParameter) {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ArgmaxNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
op_ = new (std::nothrow) hiai::op::ArgMaxExt2(name_);
|
||||
if (op_ == nullptr) {
|
||||
namespace mindspore {
|
||||
int ArgmaxNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
argmax_ = new (std::nothrow) hiai::op::ArgMaxExt2(name_);
|
||||
if (argmax_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New argmax npu operator for " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
op_->set_input_x(*npu_inputs[0]);
|
||||
auto argmax_prim = primitive->value_as_ArgMaxFusion();
|
||||
if (argmax_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
auto axis_const_ = new (std::nothrow) hiai::op::Const(name_ + "_axis");
|
||||
if (axis_const_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New weight const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<int> axis = {static_cast<int>(argmax_prim->axis())};
|
||||
ge::TensorDesc tensor_desc(ge::Shape({1}), ge::FORMAT_NCHW, ge::DT_INT32);
|
||||
std::shared_ptr<ge::Tensor> ge_tensor =
|
||||
std::make_shared<ge::Tensor>(tensor_desc, reinterpret_cast<const uint8_t *>(&(param_->axis_)), sizeof(int));
|
||||
std::make_shared<ge::Tensor>(tensor_desc, reinterpret_cast<const uint8_t *>(axis.data()), sizeof(int));
|
||||
if (ge_tensor == nullptr) {
|
||||
MS_LOG(ERROR) << "new ge_tensor failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
axis_const_->set_attr_value(ge_tensor);
|
||||
op_->set_input_axis(*axis_const_);
|
||||
op_->set_attr_keep_dims(param_->keep_dims_);
|
||||
op_->set_attr_outmaxval(param_->out_value_);
|
||||
op_->set_attr_topk(param_->topk_);
|
||||
argmax_->set_input_axis(*axis_const_);
|
||||
|
||||
argmax_->set_attr_keep_dims(argmax_prim->keep_dims());
|
||||
argmax_->set_attr_outmaxval(argmax_prim->out_max_value());
|
||||
argmax_->set_attr_topk(argmax_prim->top_k());
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *mindspore::kernel::ArgmaxNPUKernel::GetNPUOp() { return op_; }
|
||||
int ArgmaxNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
argmax_->set_input_x(*npu_inputs[0]);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ArgmaxNPUKernel::~ArgmaxNPUKernel() {
|
||||
if (op_ != nullptr) {
|
||||
delete op_;
|
||||
op_ = nullptr;
|
||||
ge::Operator *ArgmaxNPUOp::GetNPUOp() { return argmax_; }
|
||||
|
||||
ArgmaxNPUOp::~ArgmaxNPUOp() {
|
||||
if (argmax_ != nullptr) {
|
||||
delete argmax_;
|
||||
argmax_ = nullptr;
|
||||
}
|
||||
if (axis_const_ != nullptr) {
|
||||
delete axis_const_;
|
||||
axis_const_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
REG_KERNEL(kNPU, kNumberTypeFloat32, PrimitiveType_ArgMaxFusion, NPUKernelCreator<ArgmaxNPUKernel>)
|
||||
} // namespace mindspore::kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,55 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARGMAX_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARGMAX_NPU_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "include/graph/compatible/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
|
||||
// NPU delegate op that maps an ArgMaxFusion primitive onto HiAI's
// ArgMaxExt2 operator, with the axis supplied as a Const operand.
class ArgmaxNPUOp : public NPUOp {
 public:
  ArgmaxNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
              const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~ArgmaxNPUOp() override;

  // ArgMax has no NPU-specific restrictions, so it is always supported.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override {
    return RET_OK;
  }

  // Creates the ArgMaxExt2 operator, the axis Const, and the attributes
  // (keep_dims / outmaxval / topk) from the primitive.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  // Wires the single NPU input operator into the argmax.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  ge::Operator *GetNPUOp() override;

 private:
  // Owned HiAI operators; released in the destructor.
  hiai::op::ArgMaxExt2 *argmax_ = nullptr;
  hiai::op::Const *axis_const_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARGMAX_NPU_H_
|
|
@ -0,0 +1,241 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/arithmetic_npu.h"
|
||||
#include "include/graph/op/all_ops.h"
|
||||
namespace mindspore {
|
||||
constexpr int RELU_MODE = 1;
|
||||
constexpr int RELU6_MODE = 14;
|
||||
int ArithmeticNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                               const std::vector<tensor::MSTensor *> &out_tensors) {
  // NPU binary arithmetic requires both operands to have identical shapes.
  const auto &shape0 = in_tensors[0]->shape();
  const auto &shape1 = in_tensors[1]->shape();
  if (shape0 != shape1) {
    MS_LOG(WARNING) << name_ << " for the two inputs, the corresponding dimensions must have the same value."
                    << " shape 1 is:" << shape0 << " shape 2 is:" << shape1;
    return RET_NOT_SUPPORT;
  }
  auto type = primitive->value_type();
  // Known HiAI limitations for specific comparison ops and low-rank inputs.
  if (type == mindspore::schema::PrimitiveType_Less && shape0.size() == 1) {
    MS_LOG(WARNING) << name_ << " not support input 1d";
    return RET_NOT_SUPPORT;
  }
  if (type == mindspore::schema::PrimitiveType_Equal && shape0.size() == 2) {
    MS_LOG(WARNING) << name_ << " not support input 2d";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
template <typename T>
|
||||
ge::Operator *CreateOperator(const std::string &name) {
|
||||
auto op = new (std::nothrow) T(name);
|
||||
if (op == nullptr) {
|
||||
MS_LOG(ERROR) << name << " op is nullptr";
|
||||
return nullptr;
|
||||
}
|
||||
return op;
|
||||
}
|
||||
|
||||
int ArithmeticNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                          const std::vector<tensor::MSTensor *> &out_tensors) {
  // Create the HiAI operator matching the primitive type. Fused primitives
  // (Mul/Add/Sub/Div) also record the fused activation for SetActivation().
  switch (type_) {
    case schema::PrimitiveType_MulFusion:
      op_ = CreateOperator<hiai::op::Mul>(name_);
      act_type_ = primitive->value_as_MulFusion()->activation_type();
      break;
    case schema::PrimitiveType_AddFusion:
      op_ = CreateOperator<hiai::op::Add>(name_);
      act_type_ = primitive->value_as_AddFusion()->activation_type();
      break;
    case schema::PrimitiveType_SubFusion:
      op_ = CreateOperator<hiai::op::Sub>(name_);
      act_type_ = primitive->value_as_SubFusion()->activation_type();
      break;
    case schema::PrimitiveType_DivFusion:
      op_ = CreateOperator<hiai::op::RealDiv>(name_);
      act_type_ = primitive->value_as_DivFusion()->activation_type();
      break;
    case schema::PrimitiveType_FloorMod:
      op_ = CreateOperator<hiai::op::FloorMod>(name_);
      break;
    case schema::PrimitiveType_FloorDiv:
      op_ = CreateOperator<hiai::op::FloorDiv>(name_);
      break;
    case schema::PrimitiveType_LogicalAnd:
      op_ = CreateOperator<hiai::op::LogicalAnd>(name_);
      break;
    case schema::PrimitiveType_LogicalOr:
      op_ = CreateOperator<hiai::op::LogicalOr>(name_);
      break;
    case schema::PrimitiveType_Maximum:
      op_ = CreateOperator<hiai::op::Maximum>(name_);
      break;
    case schema::PrimitiveType_Minimum:
      op_ = CreateOperator<hiai::op::Minimum>(name_);
      break;
    case schema::PrimitiveType_NotEqual:
      op_ = CreateOperator<hiai::op::NotEqual>(name_);
      break;
    case schema::PrimitiveType_Equal:
      op_ = CreateOperator<hiai::op::Equal>(name_);
      break;
    case schema::PrimitiveType_Less:
      op_ = CreateOperator<hiai::op::Less>(name_);
      break;
    case schema::PrimitiveType_LessEqual:
      op_ = CreateOperator<hiai::op::LessEqual>(name_);
      break;
    case schema::PrimitiveType_Greater:
      op_ = CreateOperator<hiai::op::Greater>(name_);
      break;
    case schema::PrimitiveType_GreaterEqual:
      op_ = CreateOperator<hiai::op::GreaterEqual>(name_);
      break;
    default:
      MS_LOG(ERROR) << "Unsupported primitive type: " << schema::EnumNamePrimitiveType(type_);
      return RET_ERROR;
  }
  // CreateOperator returns nullptr on allocation failure; the original never
  // checked, so a failed allocation crashed later. This mirrors the check in
  // ArithmeticSelfNPUOp::Init.
  if (op_ == nullptr) {
    MS_LOG(ERROR) << "Arithmetic create operator return nullptr.";
    return RET_ERROR;
  }
  auto ret = SetActivation();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Arithmetic npu op set activation failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
int ArithmeticNPUOp::SetActivation() {
|
||||
if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
|
||||
act_ = new (std::nothrow) hiai::op::Activation(name_ + "_act");
|
||||
if (act_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (act_type_ == schema::ActivationType_RELU) {
|
||||
act_->set_attr_mode(RELU_MODE);
|
||||
} else if (act_type_ == schema::ActivationType_RELU6) {
|
||||
act_->set_attr_mode(RELU6_MODE);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Unsupported activation type for op " << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
act_->set_input_x(*op_);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void SetInputs(const std::vector<ge::Operator *> &npu_inputs, ge::Operator *op) {
|
||||
auto cur_op = reinterpret_cast<T *>(op);
|
||||
cur_op->set_input_x1(*npu_inputs[0]);
|
||||
cur_op->set_input_x2(*npu_inputs[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
int ArithmeticNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                                  const std::vector<tensor::MSTensor *> &out_tensors,
                                  const std::vector<ge::Operator *> &npu_inputs) {
  // Wire the two operands into the concrete HiAI operator for type_.
  switch (type_) {
    case schema::PrimitiveType_MulFusion:
      SetInputs<hiai::op::Mul>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_AddFusion:
      SetInputs<hiai::op::Add>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_SubFusion:
      SetInputs<hiai::op::Sub>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_DivFusion:
      SetInputs<hiai::op::RealDiv>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_FloorMod:
      SetInputs<hiai::op::FloorMod>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_FloorDiv:
      // Fixed copy-paste bug: the original called
      // `op_ = CreateOperator<hiai::op::FloorDiv>(name_)` here, leaking the
      // operator created in Init() and never wiring the inputs.
      SetInputs<hiai::op::FloorDiv>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_LogicalAnd:
      SetInputs<hiai::op::LogicalAnd>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_LogicalOr:
      SetInputs<hiai::op::LogicalOr>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_Maximum:
      SetInputs<hiai::op::Maximum>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_Minimum:
      SetInputs<hiai::op::Minimum>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_NotEqual:
      SetInputs<hiai::op::NotEqual>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_Equal:
      SetInputs<hiai::op::Equal>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_Less:
      SetInputs<hiai::op::Less>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_LessEqual:
      SetInputs<hiai::op::LessEqual>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_Greater:
      SetInputs<hiai::op::Greater>(npu_inputs, op_);
      break;
    case schema::PrimitiveType_GreaterEqual:
      SetInputs<hiai::op::GreaterEqual>(npu_inputs, op_);
      break;
    default:
      MS_LOG(ERROR) << "SetInputs for npu op " << name_ << " failed.";
      return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
int ArithmeticNPUOp::SetNPUInputs(
|
||||
const std::vector<tensor::MSTensor *> &in_tensors, const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs,
|
||||
const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) {
|
||||
auto ret = SetNPUInputs(in_tensors, out_tensors, npu_inputs);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "ArithmeticNPUOp SetNPUInputs failed";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (index2_multi_out_index.empty()) {
|
||||
return RET_OK;
|
||||
}
|
||||
for (auto it : index2_multi_out_index) {
|
||||
MS_LOG(INFO) << name_ << "set input " << it.first << " from " << it.second.first << " output " << it.second.second;
|
||||
op_->SetInput(it.first, *it.second.first, it.second.second);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *ArithmeticNPUOp::GetNPUOp() {
  // With a fused activation, the activation node is the subgraph-visible
  // output; otherwise the arithmetic operator itself is.
  return act_type_ == schema::ActivationType_NO_ACTIVATION ? op_ : act_;
}
|
||||
|
||||
ArithmeticNPUOp::~ArithmeticNPUOp() {
  // Release both owned HiAI operators; `delete nullptr` is a no-op.
  delete op_;
  op_ = nullptr;
  delete act_;
  act_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,57 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARITHMETIC_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARITHMETIC_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <unordered_map>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
namespace mindspore {
|
||||
class ArithmeticNPUOp : public NPUOp {
|
||||
public:
|
||||
ArithmeticNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~ArithmeticNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, const std::vector<ge::Operator *> &npu_inputs,
|
||||
const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
int SetActivation();
|
||||
int act_type_;
|
||||
ge::Operator *op_ = nullptr;
|
||||
hiai::op::Activation *act_ = nullptr;
|
||||
}; // namespace mindspore
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARITHMETIC_NPU_H_
|
|
@ -0,0 +1,144 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/arithmetic_self_npu.h"
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
|
||||
namespace mindspore {
|
||||
template <typename T>
|
||||
ge::Operator *CreateOperator(const std::string &name) {
|
||||
auto op = new (std::nothrow) T(name);
|
||||
if (op == nullptr) {
|
||||
MS_LOG(ERROR) << name << " op is nullptr";
|
||||
return nullptr;
|
||||
}
|
||||
return op;
|
||||
}
|
||||
|
||||
// Creates the single-input HiAI operator that corresponds to this unary
// arithmetic primitive type.
int ArithmeticSelfNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                              const std::vector<tensor::MSTensor *> &out_tensors) {
  switch (type_) {
    case schema::PrimitiveType_Cos:
      op_ = CreateOperator<hiai::op::Cos>(name_);
      break;
    case schema::PrimitiveType_Log:
      op_ = CreateOperator<hiai::op::Log>(name_);
      break;
    case schema::PrimitiveType_Square:
      op_ = CreateOperator<hiai::op::Square>(name_);
      break;
    case schema::PrimitiveType_Sqrt:
      op_ = CreateOperator<hiai::op::Sqrt>(name_);
      break;
    case schema::PrimitiveType_Rsqrt:
      op_ = CreateOperator<hiai::op::Rsqrt>(name_);
      break;
    case schema::PrimitiveType_Sin:
      op_ = CreateOperator<hiai::op::Sin>(name_);
      break;
    case schema::PrimitiveType_LogicalNot:
      op_ = CreateOperator<hiai::op::LogicalNot>(name_);
      break;
    case schema::PrimitiveType_Floor:
      op_ = CreateOperator<hiai::op::Floor>(name_);
      break;
    case schema::PrimitiveType_Ceil:
      op_ = CreateOperator<hiai::op::Ceil>(name_);
      break;
    case schema::PrimitiveType_Round:
      op_ = CreateOperator<hiai::op::Round>(name_);
      break;
    case schema::PrimitiveType_Neg:
      op_ = CreateOperator<hiai::op::Neg>(name_);
      break;
    case schema::PrimitiveType_Reciprocal:
      op_ = CreateOperator<hiai::op::Reciprocal>(name_);
      break;
    default:
      // Primitive types not listed above cannot run on the NPU.
      MS_LOG(ERROR) << "Unsupported primitive type: " << schema::EnumNamePrimitiveType(type_);
      return RET_ERROR;
  }
  // CreateOperator logs and returns nullptr on allocation failure.
  if (op_ == nullptr) {
    MS_LOG(ERROR) << "Arithmetic self create operator return nullptr.";
    return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
template <typename T>
|
||||
void SetInputs(const std::vector<ge::Operator *> &npu_inputs, ge::Operator *op) {
|
||||
auto cur_op = reinterpret_cast<T *>(op);
|
||||
cur_op->set_input_x(*npu_inputs[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
int ArithmeticSelfNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
switch (type_) {
|
||||
case schema::PrimitiveType_Cos:
|
||||
SetInputs<hiai::op::Cos>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Log:
|
||||
SetInputs<hiai::op::Log>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Square:
|
||||
SetInputs<hiai::op::Square>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Sqrt:
|
||||
SetInputs<hiai::op::Sqrt>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Rsqrt:
|
||||
SetInputs<hiai::op::Rsqrt>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Sin:
|
||||
SetInputs<hiai::op::Sin>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_LogicalNot:
|
||||
SetInputs<hiai::op::LogicalNot>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Floor:
|
||||
SetInputs<hiai::op::Floor>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Ceil:
|
||||
SetInputs<hiai::op::Ceil>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Round:
|
||||
SetInputs<hiai::op::Round>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Neg:
|
||||
SetInputs<hiai::op::Neg>(npu_inputs, op_);
|
||||
break;
|
||||
case schema::PrimitiveType_Reciprocal:
|
||||
SetInputs<hiai::op::Reciprocal>(npu_inputs, op_);
|
||||
break;
|
||||
default:
|
||||
MS_LOG(ERROR) << "SetInputs for npu op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
// Expose the underlying HiAI operator for graph construction.
ge::Operator *ArithmeticSelfNPUOp::GetNPUOp() { return op_; }
|
||||
|
||||
ArithmeticSelfNPUOp::~ArithmeticSelfNPUOp() {
  // delete on a null pointer is a no-op, so no explicit check is needed.
  delete op_;
  op_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,51 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARITHMETICSELF_NPU_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARITHMETICSELF_NPU_H_
#include <vector>
#include <string>
#include "include/graph/op/math_defs.h"
#include "src/delegate/npu/op/npu_op.h"

namespace mindspore {
/// NPU delegate op for single-input arithmetic primitives
/// (Cos, Log, Square, Sqrt, Rsqrt, Sin, LogicalNot, Floor, Ceil, Round, Neg, Reciprocal).
class ArithmeticSelfNPUOp : public NPUOp {
 public:
  ArithmeticSelfNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                      const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~ArithmeticSelfNPUOp() override;

  /// All arithmetic-self primitives are accepted unconditionally.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override {
    return RET_OK;
  }

  /// Creates the concrete HiAI operator matching the primitive type.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Binds the first npu_input as input x of the created operator.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  ge::Operator *GetNPUOp() override;

 private:
  // Owned; concrete type depends on the primitive type and is fixed in Init.
  ge::Operator *op_ = nullptr;
};
}  // namespace mindspore
// NOTE: comment fixed to match the macro actually defined above
// (was MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_ARITHMETICSELF_NPU_H_).
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ARITHMETICSELF_NPU_H_
|
|
@ -0,0 +1,123 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/avg_pooling_npu.h"
|
||||
namespace mindspore {
|
||||
int AvgPoolingNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                               const std::vector<tensor::MSTensor *> &out_tensors) {
  auto pooling_prim = primitive->value_as_AvgPoolFusion();
  if (pooling_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  // Flatbuffer vector fields are optional and may be absent; guard before
  // dereferencing begin() to avoid a null-pointer crash.
  if (pooling_prim->strides() == nullptr || pooling_prim->strides()->size() < 2) {
    MS_LOG(WARNING) << "Pooling strides are missing for op " << name_;
    return RET_NOT_SUPPORT;
  }
  auto stride_h = static_cast<int>(*(pooling_prim->strides()->begin()));
  auto stride_w = static_cast<int>(*(pooling_prim->strides()->begin() + 1));
  if (pooling_prim->pad() == nullptr || pooling_prim->pad()->size() < 4) {
    // No explicit padding means pad 0, which can never exceed the stride.
    return RET_OK;
  }
  auto pad_u = static_cast<int>(*(pooling_prim->pad()->begin()));
  auto pad_l = static_cast<int>(*(pooling_prim->pad()->begin() + 2));
  // The NPU pooling operator cannot handle padding larger than the stride.
  if (pad_u > stride_h || pad_l > stride_w) {
    MS_LOG(WARNING) << "Npu pooling does not support pad > stride.";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
// Translate the flatbuffer AvgPoolFusion attributes onto the HiAI PoolingD op.
// Returns RET_ERROR when a required flatbuffer field is absent.
int AvgPoolingNPUOp::SetPoolingParam(const schema::AvgPoolFusion *pooling_prim) {
  pooling_->set_attr_mode(1);  // mode 1: average pooling
  if (pooling_prim->global()) {
    pooling_->set_attr_global_pooling(pooling_prim->global());
  } else {
    // kernel_size is only required for non-global pooling; it is an optional
    // flatbuffer field, so guard before dereferencing.
    if (pooling_prim->kernel_size() == nullptr || pooling_prim->kernel_size()->size() < 2) {
      MS_LOG(ERROR) << "Pooling kernel size is missing for op " << name_;
      return RET_ERROR;
    }
    auto window_h = static_cast<int>(*(pooling_prim->kernel_size()->begin()));
    auto window_w = static_cast<int>(*(pooling_prim->kernel_size()->begin() + 1));
    pooling_->set_attr_window(ge::AttrValue::LIST_INT({window_h, window_w}));
  }
  if (pooling_prim->strides() == nullptr || pooling_prim->strides()->size() < 2) {
    MS_LOG(ERROR) << "Pooling strides are missing for op " << name_;
    return RET_ERROR;
  }
  auto stride_h = static_cast<int>(*(pooling_prim->strides()->begin()));
  auto stride_w = static_cast<int>(*(pooling_prim->strides()->begin() + 1));
  pooling_->set_attr_stride(ge::AttrValue::LIST_INT({stride_h, stride_w}));
  if (pooling_prim->pad_mode() == schema::PadMode_SAME) {
    pooling_->set_attr_pad_mode(6);  // 6: SAME padding
    pooling_->set_attr_pad({0, 0, 0, 0});
  } else if (pooling_prim->pad_mode() == schema::PadMode_VALID) {
    pooling_->set_attr_pad_mode(5);  // 5: VALID padding
    pooling_->set_attr_pad({0, 0, 0, 0});
  } else {
    pooling_->set_attr_pad_mode(0);  // 0: explicit padding
    if (pooling_prim->pad() == nullptr || pooling_prim->pad()->size() < 4) {
      MS_LOG(ERROR) << "Pooling pad is missing for op " << name_;
      return RET_ERROR;
    }
    auto pad_u = static_cast<int>(*(pooling_prim->pad()->begin()));
    auto pad_d = static_cast<int>(*(pooling_prim->pad()->begin() + 1));
    auto pad_l = static_cast<int>(*(pooling_prim->pad()->begin() + 2));
    auto pad_r = static_cast<int>(*(pooling_prim->pad()->begin() + 3));
    pooling_->set_attr_pad(ge::AttrValue::LIST_INT({pad_u, pad_d, pad_l, pad_r}));
  }

  if (pooling_prim->round_mode() == schema::RoundMode_FLOOR) {  // no use in cpu
    pooling_->set_attr_ceil_mode(0);
    pooling_->set_attr_data_mode(1);
  } else {
    pooling_->set_attr_ceil_mode(1);
    pooling_->set_attr_data_mode(0);
  }
  return RET_OK;
}
|
||||
|
||||
int AvgPoolingNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                          const std::vector<tensor::MSTensor *> &out_tensors) {
  pooling_ = new (std::nothrow) hiai::op::PoolingD(name_ + "_pooling");
  if (pooling_ == nullptr) {
    MS_LOG(ERROR) << "New pooling npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }
  auto pooling_prim = primitive->value_as_AvgPoolFusion();
  if (pooling_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  auto ret = SetPoolingParam(pooling_prim);
  if (ret != RET_OK) {
    // Message fixed: this is the average pooling op, not a convolution op.
    MS_LOG(ERROR) << "Set npu op parameter for avg pooling op " << name_ << " failed.";
    return RET_ERROR;
  }
  act_type_ = pooling_prim->activation_type();
  // A fused activation becomes a separate HiAI Activation op appended after pooling.
  if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
    ret = SetActivation(pooling_, act_type_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
      return RET_ERROR;
    }
  }
  return RET_OK;
}
|
||||
|
||||
int AvgPoolingNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
pooling_->set_input_x(*npu_inputs[0]);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
// The graph output is the fused activation when present, otherwise the pooling op.
ge::Operator *AvgPoolingNPUOp::GetNPUOp() {
  return act_type_ == schema::ActivationType_NO_ACTIVATION ? static_cast<ge::Operator *>(pooling_)
                                                           : static_cast<ge::Operator *>(act_);
}
|
||||
|
||||
AvgPoolingNPUOp::~AvgPoolingNPUOp() {
  // delete on a null pointer is a no-op; act_ cleanup lives in the base class dtor.
  delete pooling_;
  pooling_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_AVG_POOLING_NPU_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_AVG_POOLING_NPU_H_

#include <vector>
#include <string>
#include "include/graph/op/all_ops.h"
#include "src/delegate/npu/op/convolution_base_npu.h"
namespace mindspore {
/// NPU delegate op for AvgPoolFusion. Derives from the convolution base op to
/// reuse its fused-activation handling (act_, SetActivation).
class AvgPoolingNPUOp : public ConvolutionBaseNPUOp {
 public:
  AvgPoolingNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                  const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : ConvolutionBaseNPUOp(primitive, in_tensors, out_tensors, name) {}

  ~AvgPoolingNPUOp() override;

  /// Rejects primitives whose padding exceeds the stride (NPU limitation).
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Creates the PoolingD operator and configures it from the primitive.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// Returns the fused activation op when one exists, otherwise the pooling op.
  ge::Operator *GetNPUOp() override;

 private:
  // Copies window/stride/pad/round-mode attributes onto pooling_.
  int SetPoolingParam(const schema::AvgPoolFusion *pooling_prim);
  schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
  hiai::op::PoolingD *pooling_ = nullptr;  // owned
};
}  // namespace mindspore
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_AVG_POOLING_NPU_H_
|
|
@ -14,37 +14,38 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/npu/batchnorm_npu.h"
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/agent/npu/npu_converter_utils.h"
|
||||
#include "src/delegate/npu/op/batchnorm_npu.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
using mindspore::kernel::KERNEL_ARCH::kNPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::schema::PrimitiveType_FusedBatchNorm;
|
||||
|
||||
namespace mindspore::kernel {
|
||||
int BatchnormNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
|
||||
OpParameter *opParameter) {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchnormNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
namespace mindspore {
|
||||
int BatchnormNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
batchnorm_ = new (std::nothrow) ge::op::BatchNormExt2(name_);
|
||||
if (batchnorm_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New batchnorm npu operator for batchnorm op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
batchnorm_->set_input_x(*npu_inputs[0]);
|
||||
auto batchnorm_prim = primitive->value_as_FusedBatchNorm();
|
||||
if (batchnorm_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
batchnorm_->set_attr_epsilon(batchnorm_prim->epsilon());
|
||||
batchnorm_->set_attr_momentum(batchnorm_prim->momentum());
|
||||
batchnorm_->set_attr_mode(1);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int BatchnormNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
batchnorm_->set_input_x(*npu_inputs[0]);
|
||||
auto scale = new (std::nothrow) hiai::op::Const(name_ + "_scale");
|
||||
if (scale == nullptr) {
|
||||
MS_LOG(ERROR) << "New scale const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto scale_tensor = mindspore::lite::ConverterToNPUTensor(inputs[1]);
|
||||
auto scale_tensor = mindspore::ConverterToNPUTensor(in_tensors[1]);
|
||||
scale->set_attr_value(scale_tensor);
|
||||
batchnorm_->set_input_scale(*scale);
|
||||
|
||||
|
@ -53,7 +54,7 @@ int BatchnormNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
|
|||
MS_LOG(ERROR) << "New offset const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto offset_tensor = mindspore::lite::ConverterToNPUTensor(inputs[1]);
|
||||
auto offset_tensor = mindspore::ConverterToNPUTensor(in_tensors[2]);
|
||||
offset->set_attr_value(offset_tensor);
|
||||
batchnorm_->set_input_offset(*offset);
|
||||
|
||||
|
@ -62,7 +63,7 @@ int BatchnormNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
|
|||
MS_LOG(ERROR) << "New mean const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto mean_tensor = mindspore::lite::ConverterToNPUTensor(inputs[1]);
|
||||
auto mean_tensor = mindspore::ConverterToNPUTensor(in_tensors[3]);
|
||||
mean->set_attr_value(mean_tensor);
|
||||
batchnorm_->set_input_mean(*mean);
|
||||
|
||||
|
@ -71,24 +72,18 @@ int BatchnormNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
|
|||
MS_LOG(ERROR) << "New variance const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto variance_tensor = mindspore::lite::ConverterToNPUTensor(inputs[1]);
|
||||
auto variance_tensor = mindspore::ConverterToNPUTensor(in_tensors[4]);
|
||||
variance->set_attr_value(variance_tensor);
|
||||
batchnorm_->set_input_variance(*variance);
|
||||
|
||||
batchnorm_->set_attr_epsilon(batchnorm_param_->epsilon_);
|
||||
batchnorm_->set_attr_momentum(batchnorm_param_->momentum_);
|
||||
batchnorm_->set_attr_mode(1);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *mindspore::kernel::BatchnormNPUKernel::GetNPUOp() { return batchnorm_; }
|
||||
ge::Operator *BatchnormNPUOp::GetNPUOp() { return batchnorm_; }
|
||||
|
||||
BatchnormNPUKernel::~BatchnormNPUKernel() {
|
||||
BatchnormNPUOp::~BatchnormNPUOp() {
|
||||
if (batchnorm_ != nullptr) {
|
||||
delete batchnorm_;
|
||||
batchnorm_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
REG_KERNEL(kNPU, kNumberTypeFloat32, PrimitiveType_FusedBatchNorm, NPUKernelCreator<BatchnormNPUKernel>)
|
||||
} // namespace mindspore::kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_BATCHNORM_NPU_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_BATCHNORM_NPU_H_

#include <vector>
#include <string>
#include "include/graph/op/all_ops.h"
#include "include/graph/compatible/all_ops.h"
#include "src/delegate/npu/op/npu_op.h"

namespace mindspore {
/// NPU delegate op for FusedBatchNorm, mapped onto ge::op::BatchNormExt2.
class BatchnormNPUOp : public NPUOp {
 public:
  BatchnormNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                 const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~BatchnormNPUOp() override;

  /// Batchnorm is supported unconditionally.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override {
    return RET_OK;
  }

  /// Creates BatchNormExt2 and sets epsilon/momentum/mode from the primitive.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Wires data input plus scale/offset/mean/variance const inputs.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  ge::Operator *GetNPUOp() override;

 private:
  ge::op::BatchNormExt2 *batchnorm_ = nullptr;  // owned
};
}  // namespace mindspore
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_BATCHNORM_NPU_H_
|
|
@ -0,0 +1,59 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/cast_npu.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
int CastNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                         const std::vector<tensor::MSTensor *> &out_tensors) {
  // The destination dtype must be available as a one-element const tensor;
  // a dtype carried only as an attribute is not supported here.
  if (in_tensors.size() >= 2 && in_tensors[1]->ElementsNum() == 1) {
    auto dst_dtype_data = in_tensors[1]->data();
    // Guard the raw data pointer: a tensor without backing data would crash
    // the unchecked dereference.
    if (dst_dtype_data == nullptr) {
      MS_LOG(WARNING) << "NPU dst dtype tensor data is null.";
      return RET_NOT_SUPPORT;
    }
    // Assumes the dtype tensor stores a 32-bit integer TypeId — TODO confirm
    // against the converter's Cast output.
    dst_type_ = static_cast<int *>(dst_dtype_data)[0];
  } else {
    MS_LOG(WARNING) << "NPU dst dtype is attribute.";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
int CastNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                    const std::vector<tensor::MSTensor *> &out_tensors) {
  cast_ = new (std::nothrow) hiai::op::CastT(name_);
  if (cast_ == nullptr) {
    MS_LOG(ERROR) << name_ << " op is nullptr";
    return RET_ERROR;
  }
  // dst_type_ was extracted from the second input tensor in IsSupport;
  // the source dtype comes from the data input itself.
  cast_->set_attr_dst_dtype(ConverterToNPUDataType(static_cast<TypeId>(dst_type_)));
  cast_->set_attr_src_dtype(ConverterToNPUDataType(static_cast<TypeId>(in_tensors[0]->data_type())));
  return RET_OK;
}
|
||||
|
||||
int CastNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
cast_->set_input_x(*npu_inputs[0]);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
// Expose the underlying CastT operator for graph construction.
ge::Operator *CastNPUOp::GetNPUOp() { return cast_; }
|
||||
|
||||
CastNPUOp::~CastNPUOp() {
  // delete on a null pointer is a no-op.
  delete cast_;
  cast_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CAST_NPU_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CAST_NPU_H_
#include <vector>
#include <string>
#include "include/graph/op/all_ops.h"
#include "src/delegate/npu/op/npu_op.h"

namespace mindspore {
/// NPU delegate op for Cast, mapped onto hiai::op::CastT.
class CastNPUOp : public NPUOp {
 public:
  CastNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
            const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~CastNPUOp() override;

  /// Supported only when the destination dtype is supplied as a
  /// one-element const input tensor; extracts it into dst_type_.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  ge::Operator *GetNPUOp() override;

 private:
  hiai::op::CastT *cast_ = nullptr;  // owned
  // TypeId of the cast destination; default-initialized to 0 (was left
  // uninitialized, which is UB if Init runs before IsSupport sets it).
  int dst_type_ = 0;
};
}  // namespace mindspore
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CAST_NPU_H_
|
|
@ -0,0 +1,62 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/concat_npu.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
int ConcatNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                      const std::vector<tensor::MSTensor *> &out_tensors) {
  concat_ = new (std::nothrow) hiai::op::ConcatD(name_);
  if (concat_ == nullptr) {
    MS_LOG(ERROR) << name_ << " op is nullptr";
    return RET_ERROR;
  }
  auto concat_prim = primitive->value_as_Concat();
  if (concat_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  // Remember the concat axis; it is applied to the op in SetNPUInputs and
  // may be remapped later via HandleAxis.
  axis_ = concat_prim->axis();
  return RET_OK;
}
|
||||
|
||||
int ConcatNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
concat_->set_attr_concat_dim(axis_);
|
||||
concat_->set_attr_N(npu_inputs.size());
|
||||
concat_->create_dynamic_input_x(npu_inputs.size());
|
||||
for (int i = 0; i < npu_inputs.size(); ++i) {
|
||||
concat_->set_dynamic_input_x(i + 1, *npu_inputs[i]);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
// Expose the underlying ConcatD operator for graph construction.
ge::Operator *ConcatNPUOp::GetNPUOp() { return concat_; }
|
||||
|
||||
int ConcatNPUOp::HandleAxis() {
|
||||
axis_ = TransFormAxis(axis_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ConcatNPUOp::~ConcatNPUOp() {
  // delete on a null pointer is a no-op.
  delete concat_;
  concat_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONCAT_NPU_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONCAT_NPU_H_
#include <vector>
#include <string>
#include "include/graph/op/all_ops.h"
#include "src/delegate/npu/op/npu_op.h"
namespace mindspore {
/// NPU delegate op for Concat, mapped onto hiai::op::ConcatD with
/// dynamic inputs.
class ConcatNPUOp : public NPUOp {
 public:
  ConcatNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
              const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~ConcatNPUOp() override;

  /// Concat is supported unconditionally.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override {
    return RET_OK;
  }

  /// Creates the ConcatD operator and records the concat axis.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Wires every npu_input as a dynamic input of ConcatD.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  ge::Operator *GetNPUOp() override;

  /// Remaps axis_ through TransFormAxis for the NPU tensor layout.
  int HandleAxis();

 private:
  hiai::op::ConcatD *concat_ = nullptr;  // owned
  int axis_ = 0;
};
}  // namespace mindspore
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONCAT_NPU_H_
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -14,22 +14,11 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/npu/convolution_base_npu.h"
|
||||
#include "src/runtime/agent/npu/npu_converter_utils.h"
|
||||
#include "nnacl/pack.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
namespace {
|
||||
constexpr int BATCH_INDEX = 0;
|
||||
constexpr int HEIGHT_INDEX = 1;
|
||||
constexpr int WIDTH_INDEX = 2;
|
||||
constexpr int CHANNEL_INDEX = 3;
|
||||
constexpr size_t WITH_BIAS_SIZE = 3;
|
||||
constexpr int BIAS_INDEX = 2;
|
||||
constexpr int RELU_MODE = 1;
|
||||
constexpr int RELU6_MODE = 14;
|
||||
} // namespace
|
||||
ConvolutionBaseNPUKernel::~ConvolutionBaseNPUKernel() {
|
||||
#include "src/delegate/npu/op/convolution_base_npu.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
#include "src/delegate/npu/transpose_kernel.h"
|
||||
namespace mindspore {
|
||||
ConvolutionBaseNPUOp::~ConvolutionBaseNPUOp() {
|
||||
if (act_ != nullptr) {
|
||||
delete act_;
|
||||
act_ = nullptr;
|
||||
|
@ -44,30 +33,28 @@ ConvolutionBaseNPUKernel::~ConvolutionBaseNPUKernel() {
|
|||
}
|
||||
}
|
||||
|
||||
int ConvolutionBaseNPUKernel::InitWeightConst(const std::vector<lite::Tensor *> &inputs) {
|
||||
int ConvolutionBaseNPUOp::InitWeightConst(const std::vector<tensor::MSTensor *> &inputs) {
|
||||
weight_ = new (std::nothrow) hiai::op::Const(name_ + "_w");
|
||||
if (weight_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New weight const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto w_shape = inputs[1]->shape();
|
||||
auto nhwc_data = inputs[1]->data_c();
|
||||
auto nhwc_data = inputs[1]->data();
|
||||
auto nchw_data = reinterpret_cast<float *>(malloc(inputs[1]->ElementsNum() * sizeof(float)));
|
||||
if (nchw_data == nullptr) {
|
||||
MS_LOG(ERROR) << "Malloc buffer failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[BATCH_INDEX], w_shape[HEIGHT_INDEX] * w_shape[WIDTH_INDEX],
|
||||
w_shape[CHANNEL_INDEX], 0, 0);
|
||||
PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3]);
|
||||
|
||||
std::shared_ptr<ge::Tensor> weight_tensor = std::make_shared<ge::Tensor>();
|
||||
std::shared_ptr<ge::Tensor> weight_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
|
||||
if (weight_tensor == nullptr) {
|
||||
MS_LOG(ERROR) << "new weight_tensor failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ge::TensorDesc tensor_desc(lite::ConverterToNPUShape({w_shape[BATCH_INDEX], w_shape[CHANNEL_INDEX],
|
||||
w_shape[HEIGHT_INDEX], w_shape[WIDTH_INDEX]}),
|
||||
ge::FORMAT_NCHW, lite::ConverterToNPUDataType(inputs[1]->data_type()));
|
||||
ge::TensorDesc tensor_desc(ConverterToNPUShape({w_shape[0], w_shape[3], w_shape[1], w_shape[2]}), ge::FORMAT_NCHW,
|
||||
ConverterToNPUDataType(inputs[1]->data_type()));
|
||||
weight_tensor->SetTensorDesc(tensor_desc);
|
||||
weight_tensor->SetData(reinterpret_cast<const uint8_t *>(nchw_data), inputs[1]->Size());
|
||||
|
||||
|
@ -76,36 +63,42 @@ int ConvolutionBaseNPUKernel::InitWeightConst(const std::vector<lite::Tensor *>
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionBaseNPUKernel::InitBiasConst(const std::vector<lite::Tensor *> &inputs) {
|
||||
if (inputs.size() >= WITH_BIAS_SIZE) {
|
||||
int ConvolutionBaseNPUOp::InitBiasConst(const std::vector<tensor::MSTensor *> &inputs) {
|
||||
if (inputs.size() >= 3) {
|
||||
bias_ = new (std::nothrow) hiai::op::Const(name_ + "_b");
|
||||
if (bias_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New bias const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
inputs[BIAS_INDEX]->set_format(mindspore::NCHW);
|
||||
auto bias_tensor = mindspore::lite::ConverterToNPUTensor(inputs[BIAS_INDEX]);
|
||||
std::shared_ptr<ge::Tensor> bias_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
|
||||
if (bias_tensor == nullptr) {
|
||||
MS_LOG(ERROR) << "new bias_tensor failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ge::TensorDesc tensor_desc(ConverterToNPUShape({inputs[2]->shape()[0]}), ge::FORMAT_NCHW,
|
||||
ConverterToNPUDataType(inputs[2]->data_type()));
|
||||
bias_tensor->SetTensorDesc(tensor_desc);
|
||||
bias_tensor->SetData(reinterpret_cast<const uint8_t *>(inputs[2]->data()), inputs[2]->Size());
|
||||
bias_->set_attr_value(bias_tensor);
|
||||
inputs[BIAS_INDEX]->set_format(mindspore::NHWC);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionBaseNPUKernel::SetActivation(const ge::Operator *input, ActType act_type) {
|
||||
int ConvolutionBaseNPUOp::SetActivation(const ge::Operator *input, schema::ActivationType act_type) {
|
||||
act_ = new (std::nothrow) hiai::op::Activation(name_ + "_act");
|
||||
if (act_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
act_->set_input_x(*input);
|
||||
if (act_type == ActType_Relu) {
|
||||
act_->set_attr_mode(RELU_MODE);
|
||||
} else if (act_type == ActType_Relu6) {
|
||||
act_->set_attr_mode(RELU6_MODE);
|
||||
if (act_type == schema::ActivationType_RELU) {
|
||||
act_->set_attr_mode(1);
|
||||
} else if (act_type == schema::ActivationType_RELU6) {
|
||||
act_->set_attr_mode(14);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "Unsupported activation type for convolution.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_BASE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_BASE_NPU_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
namespace mindspore {
|
||||
class ConvolutionBaseNPUOp : public NPUOp {
|
||||
public:
|
||||
ConvolutionBaseNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~ConvolutionBaseNPUOp() override;
|
||||
|
||||
protected:
|
||||
int InitWeightConst(const std::vector<tensor::MSTensor *> &inputs);
|
||||
int InitBiasConst(const std::vector<tensor::MSTensor *> &inputs);
|
||||
int SetActivation(const ge::Operator *input, schema::ActivationType act_type);
|
||||
hiai::op::Activation *act_ = nullptr;
|
||||
hiai::op::Const *weight_ = nullptr;
|
||||
hiai::op::Const *bias_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_BASE_NPU_H_
|
|
@ -0,0 +1,109 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/convolution_depthwise_npu.h"
|
||||
namespace mindspore {
|
||||
int ConvolutionDepthwiseNPUOp::SetConvDwParam(const schema::Conv2DFusion *conv_prim) {
|
||||
auto stride_h = static_cast<int>(*(conv_prim->stride()->begin()));
|
||||
auto stride_w = static_cast<int>(*(conv_prim->stride()->begin() + 1));
|
||||
auto dilation_h = static_cast<int>(*(conv_prim->dilation()->begin()));
|
||||
auto dilation_w = static_cast<int>(*(conv_prim->dilation()->begin() + 1));
|
||||
conv_dw_->set_attr_strides(ge::AttrValue::LIST_INT({stride_h, stride_w}));
|
||||
conv_dw_->set_attr_dilations(ge::AttrValue::LIST_INT({dilation_h, dilation_w}));
|
||||
|
||||
if (conv_prim->pad_mode() == schema::PadMode_SAME) {
|
||||
conv_dw_->set_attr_pad_mode(ge::AttrValue::STR{"SAME"});
|
||||
conv_dw_->set_attr_pads(ge::AttrValue::LIST_INT({0, 0, 0, 0}));
|
||||
} else if (conv_prim->pad_mode() == schema::PadMode_VALID) {
|
||||
conv_dw_->set_attr_pad_mode(ge::AttrValue::STR{"VALID"});
|
||||
conv_dw_->set_attr_pads(ge::AttrValue::LIST_INT({0, 0, 0, 0}));
|
||||
} else {
|
||||
conv_dw_->set_attr_pad_mode(ge::AttrValue::STR{"VALID"});
|
||||
auto pad_u = static_cast<int>(*(conv_prim->pad_list()->begin()));
|
||||
auto pad_d = static_cast<int>(*(conv_prim->pad_list()->begin() + 1));
|
||||
auto pad_l = static_cast<int>(*(conv_prim->pad_list()->begin() + 2));
|
||||
auto pad_r = static_cast<int>(*(conv_prim->pad_list()->begin() + 3));
|
||||
conv_dw_->set_attr_pads(ge::AttrValue::LIST_INT({pad_u, pad_d, pad_l, pad_r}));
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionDepthwiseNPUOp::Init(const schema::Primitive *primitive,
|
||||
const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
conv_dw_ = new (std::nothrow) hiai::op::ConvolutionDepthwise(name_ + "_conv_depthwise");
|
||||
if (conv_dw_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New convolution depthwise operator for op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto conv_prim = primitive->value_as_Conv2DFusion();
|
||||
if (conv_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto ret = SetConvDwParam(conv_prim);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Set npu op parameter for convolution depthwise op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
act_type_ = conv_prim->activation_type();
|
||||
if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
|
||||
ret = SetActivation(conv_dw_, conv_prim->activation_type());
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionDepthwiseNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
auto ret = InitWeightConst(in_tensors);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Set weight and bias for convolution depthwise op " << name_ << " failed when running npu";
|
||||
return RET_ERROR;
|
||||
}
|
||||
conv_dw_->set_input_filter(*weight_);
|
||||
|
||||
if (in_tensors.size() == 3) {
|
||||
ret = InitBiasConst(in_tensors);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Set bias for convolution depthwise op " << name_ << " failed when running npu";
|
||||
return RET_ERROR;
|
||||
}
|
||||
conv_dw_->set_input_bias(*bias_);
|
||||
}
|
||||
conv_dw_->set_input_x(*npu_inputs[0]);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *ConvolutionDepthwiseNPUOp::GetNPUOp() {
|
||||
if (act_type_ == schema::ActivationType_NO_ACTIVATION) {
|
||||
return conv_dw_;
|
||||
} else {
|
||||
return act_;
|
||||
}
|
||||
}
|
||||
|
||||
ConvolutionDepthwiseNPUOp::~ConvolutionDepthwiseNPUOp() {
|
||||
if (conv_dw_ != nullptr) {
|
||||
delete conv_dw_;
|
||||
conv_dw_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_DEPTHWISE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_DEPTHWISE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "include/graph/compatible/all_ops.h"
|
||||
#include "src/delegate/npu/op/convolution_base_npu.h"
|
||||
namespace mindspore {
|
||||
class ConvolutionDepthwiseNPUOp : public ConvolutionBaseNPUOp {
|
||||
public:
|
||||
ConvolutionDepthwiseNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: ConvolutionBaseNPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~ConvolutionDepthwiseNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
int SetConvDwParam(const schema::Conv2DFusion *conv_prim);
|
||||
schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
|
||||
hiai::op::ConvolutionDepthwise *conv_dw_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_DEPTHWISE_NPU_H_
|
|
@ -0,0 +1,166 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/convolution_npu.h"
|
||||
#include "src/delegate/npu/op/convolution_depthwise_npu.h"
|
||||
namespace mindspore {
|
||||
int ConvolutionNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
auto conv_prim = primitive->value_as_Conv2DFusion();
|
||||
if (conv_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto stride_h = static_cast<int>(*(conv_prim->stride()->begin()));
|
||||
auto stride_w = static_cast<int>(*(conv_prim->stride()->begin() + 1));
|
||||
auto in_shape = in_tensors[0]->shape(); // default format: nhwc, RunPass not called
|
||||
if (stride_h > in_shape[1] || stride_w > in_shape[2]) {
|
||||
MS_LOG(WARNING) << "Npu convolution does not support stride greater than input size.";
|
||||
return RET_NOT_SUPPORT;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionNPUOp::SetConvParam(const schema::Conv2DFusion *conv_prim) {
|
||||
auto group = static_cast<int>(conv_prim->group());
|
||||
auto stride_h = static_cast<int>(*(conv_prim->stride()->begin()));
|
||||
auto stride_w = static_cast<int>(*(conv_prim->stride()->begin() + 1));
|
||||
auto dilation_h = static_cast<int>(*(conv_prim->dilation()->begin()));
|
||||
auto dilation_w = static_cast<int>(*(conv_prim->dilation()->begin() + 1));
|
||||
conv_->set_attr_strides(ge::AttrValue::LIST_INT({stride_h, stride_w}));
|
||||
conv_->set_attr_dilations(ge::AttrValue::LIST_INT({dilation_h, dilation_w}));
|
||||
conv_->set_attr_groups(group);
|
||||
|
||||
if (conv_prim->pad_mode() == schema::PadMode_SAME) {
|
||||
conv_->set_attr_pad_mode(ge::AttrValue::STR{"SAME"});
|
||||
conv_->set_attr_pads(ge::AttrValue::LIST_INT({0, 0, 0, 0}));
|
||||
} else if (conv_prim->pad_mode() == schema::PadMode_VALID) {
|
||||
conv_->set_attr_pad_mode(ge::AttrValue::STR{"VALID"});
|
||||
conv_->set_attr_pads(ge::AttrValue::LIST_INT({0, 0, 0, 0}));
|
||||
} else {
|
||||
conv_->set_attr_pad_mode(ge::AttrValue::STR{"SPECIFIC"});
|
||||
auto pad_u = static_cast<int>(*(conv_prim->pad_list()->begin()));
|
||||
auto pad_d = static_cast<int>(*(conv_prim->pad_list()->begin() + 1));
|
||||
auto pad_l = static_cast<int>(*(conv_prim->pad_list()->begin() + 2));
|
||||
auto pad_r = static_cast<int>(*(conv_prim->pad_list()->begin() + 3));
|
||||
conv_->set_attr_pads(ge::AttrValue::LIST_INT({pad_u, pad_d, pad_l, pad_r}));
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
// set conv attr param
|
||||
conv_ = new (std::nothrow) hiai::op::Convolution(name_ + "_conv");
|
||||
if (conv_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New convolution operator for convolution op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto conv_prim = primitive->value_as_Conv2DFusion();
|
||||
if (conv_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto ret = SetConvParam(conv_prim);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Set npu op parameter for convolution op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
act_type_ = conv_prim->activation_type();
|
||||
if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
|
||||
ret = SetActivation(conv_, act_type_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int ConvolutionNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
auto ret = InitWeightConst(in_tensors);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Set weight and bias for convolution op " << name_ << " failed when running npu";
|
||||
return RET_ERROR;
|
||||
}
|
||||
conv_->set_input_filter(*weight_);
|
||||
if (in_tensors.size() == 3) {
|
||||
ret = InitBiasConst(in_tensors);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Set bias for convolution op " << name_ << " failed when running npu";
|
||||
return RET_ERROR;
|
||||
}
|
||||
conv_->set_input_bias(*bias_);
|
||||
}
|
||||
conv_->set_input_x(*npu_inputs[0]);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *ConvolutionNPUOp::GetNPUOp() {
|
||||
if (act_type_ == schema::ActivationType_NO_ACTIVATION) {
|
||||
return conv_;
|
||||
} else {
|
||||
return act_;
|
||||
}
|
||||
}
|
||||
|
||||
ConvolutionNPUOp::~ConvolutionNPUOp() {
|
||||
if (conv_ != nullptr) {
|
||||
delete conv_;
|
||||
conv_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
NPUOp *GetNPUConvOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name) {
|
||||
auto shape = out_tensors.front()->shape();
|
||||
if (std::find(shape.begin(), shape.end(), -1) != shape.end()) {
|
||||
MS_LOG(ERROR) << "NPU does not support runtime inference shape.";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (in_tensors[0]->shape().size() > 4) {
|
||||
MS_LOG(ERROR) << "Npu does not support input tensor dims greater than 4";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
NPUOp *op = nullptr;
|
||||
auto conv_prim = primitive->value_as_Conv2DFusion();
|
||||
auto group = static_cast<int>(conv_prim->group());
|
||||
auto input_channel = in_tensors.front()->shape()[3];
|
||||
auto output_channel = out_tensors.front()->shape()[3];
|
||||
if (group == input_channel && group == output_channel) {
|
||||
op = new (std::nothrow) ConvolutionDepthwiseNPUOp(primitive, in_tensors, out_tensors, name);
|
||||
} else {
|
||||
op = new (std::nothrow) ConvolutionNPUOp(primitive, in_tensors, out_tensors, name);
|
||||
}
|
||||
|
||||
auto ret = op->IsSupport(primitive, in_tensors, out_tensors);
|
||||
if (ret != RET_OK) {
|
||||
delete op;
|
||||
return nullptr;
|
||||
}
|
||||
ret = op->Init(primitive, in_tensors, out_tensors);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "NPU op init failed.";
|
||||
delete op;
|
||||
return nullptr;
|
||||
}
|
||||
return op;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_NPU_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/convolution_base_npu.h"
|
||||
namespace mindspore {
|
||||
class ConvolutionNPUOp : public ConvolutionBaseNPUOp {
|
||||
public:
|
||||
ConvolutionNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: ConvolutionBaseNPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~ConvolutionNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
int SetConvParam(const schema::Conv2DFusion *conv_prim);
|
||||
schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
|
||||
hiai::op::Convolution *conv_ = nullptr;
|
||||
};
|
||||
|
||||
NPUOp *GetNPUConvOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name);
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CONVOLUTION_NPU_H_
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/crop_and_resize_npu.h"
|
||||
namespace mindspore {
|
||||
int CropAndResizeNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
if (in_tensors.size() < 4) {
|
||||
MS_LOG(WARNING) << "NPU CropAndResize got nput inputs size < 4";
|
||||
return RET_NOT_SUPPORT;
|
||||
}
|
||||
auto crop_and_resize_prim = primitive->value_as_CropAndResize();
|
||||
if (crop_and_resize_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
// support only 0 linear and 1 nearest
|
||||
if (crop_and_resize_prim->method() != schema::ResizeMethod_LINEAR &&
|
||||
crop_and_resize_prim->method() != schema::ResizeMethod_NEAREST) {
|
||||
MS_LOG(WARNING) << "NPU CropAndResize only support method bilinear 0 and nearest 1, got "
|
||||
<< crop_and_resize_prim->method();
|
||||
return RET_NOT_SUPPORT;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int CropAndResizeNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
crop_and_resize_ = new (std::nothrow) hiai::op::CropAndResize(name_);
|
||||
if (crop_and_resize_ == nullptr) {
|
||||
MS_LOG(ERROR) << name_ << " op is nullptr";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
auto crop_and_resize_prim = primitive->value_as_CropAndResize();
|
||||
if (crop_and_resize_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
crop_and_resize_->set_attr_extrapolation_value(crop_and_resize_prim->extrapolation_value());
|
||||
if (crop_and_resize_prim->method() == schema::ResizeMethod_LINEAR) {
|
||||
crop_and_resize_->set_attr_method("bilinear");
|
||||
} else if (crop_and_resize_prim->method() == schema::ResizeMethod_NEAREST) {
|
||||
crop_and_resize_->set_attr_method("nearest");
|
||||
} else {
|
||||
MS_LOG(ERROR) << "NPU CropAndResize only support method bilinear and nearest";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int CropAndResizeNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
crop_and_resize_->set_input_x(*npu_inputs[0]);
|
||||
crop_and_resize_->set_input_boxes(*npu_inputs[1]);
|
||||
crop_and_resize_->set_input_box_index(*npu_inputs[2]);
|
||||
crop_and_resize_->set_input_crop_size(*npu_inputs[3]);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *CropAndResizeNPUOp::GetNPUOp() { return this->crop_and_resize_; }
|
||||
|
||||
CropAndResizeNPUOp::~CropAndResizeNPUOp() {
|
||||
if (crop_and_resize_ != nullptr) {
|
||||
delete crop_and_resize_;
|
||||
crop_and_resize_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CROP_AND_RESIZE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CROP_AND_RESIZE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
class CropAndResizeNPUOp : public NPUOp {
|
||||
public:
|
||||
CropAndResizeNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~CropAndResizeNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
hiai::op::CropAndResize *crop_and_resize_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_CROP_AND_RESIZE_NPU_H_
|
|
@ -0,0 +1,135 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/deconvolution_npu.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
int DeconvolutionNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
auto deconv_prim = primitive->value_as_Conv2dTransposeFusion();
|
||||
if (deconv_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (static_cast<int>(deconv_prim->group()) != 1) {
|
||||
MS_LOG(WARNING) << "Only support group equals 1 for npu deconvolution op";
|
||||
return RET_NOT_SUPPORT;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int DeconvolutionNPUOp::SetDeconvParam(const schema::Conv2dTransposeFusion *conv_prim) {
|
||||
auto group = static_cast<int>(conv_prim->group());
|
||||
auto stride_h = static_cast<int>(*(conv_prim->stride()->begin()));
|
||||
auto stride_w = static_cast<int>(*(conv_prim->stride()->begin() + 1));
|
||||
auto dilation_h = static_cast<int>(*(conv_prim->dilation()->begin()));
|
||||
auto dilation_w = static_cast<int>(*(conv_prim->dilation()->begin() + 1));
|
||||
deconv_->set_attr_strides(ge::AttrValue::LIST_INT({stride_h, stride_w}));
|
||||
deconv_->set_attr_dilations(ge::AttrValue::LIST_INT({dilation_h, dilation_w}));
|
||||
deconv_->set_attr_groups(group);
|
||||
|
||||
if (conv_prim->pad_mode() == schema::PadMode_SAME) {
|
||||
deconv_->set_attr_pad_mode(ge::AttrValue::STR{"SAME"});
|
||||
deconv_->set_attr_pads(ge::AttrValue::LIST_INT({0, 0, 0, 0}));
|
||||
} else if (conv_prim->pad_mode() == schema::PadMode_VALID) {
|
||||
deconv_->set_attr_pad_mode(ge::AttrValue::STR{"VALID"});
|
||||
deconv_->set_attr_pads(ge::AttrValue::LIST_INT({0, 0, 0, 0}));
|
||||
} else {
|
||||
deconv_->set_attr_pad_mode(ge::AttrValue::STR{"SPECIFIC"});
|
||||
auto pad_u = static_cast<int>(*(conv_prim->pad_list()->begin()));
|
||||
auto pad_d = static_cast<int>(*(conv_prim->pad_list()->begin() + 1));
|
||||
auto pad_l = static_cast<int>(*(conv_prim->pad_list()->begin() + 2));
|
||||
auto pad_r = static_cast<int>(*(conv_prim->pad_list()->begin() + 3));
|
||||
deconv_->set_attr_pads(ge::AttrValue::LIST_INT({pad_u, pad_d, pad_l, pad_r}));
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int DeconvolutionNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
// set deconv attr param
|
||||
deconv_ = new (std::nothrow) hiai::op::ConvTranspose(name_ + "_deconv");
|
||||
if (deconv_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New deconvolution operator for deconvolution op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
auto deconv_prim = primitive->value_as_Conv2dTransposeFusion();
|
||||
if (deconv_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto ret = SetDeconvParam(deconv_prim);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Set npu op parameter for convolution op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
act_type_ = deconv_prim->activation_type();
|
||||
if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
|
||||
ret = SetActivation(deconv_, act_type_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
// Wires the graph inputs of the ConvTranspose node: filter constant, optional
// bias constant (when a third input tensor is present), and the data input.
// Re-applies the fused activation wiring when one was configured in Init.
int DeconvolutionNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                                     const std::vector<tensor::MSTensor *> &out_tensors,
                                     const std::vector<ge::Operator *> &npu_inputs) {
  auto status = InitWeightConst(in_tensors);
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Set weight and bias for deconvolution op " << name_ << " failed when running npu";
    return RET_ERROR;
  }
  deconv_->set_input_filter(*weight_);

  const bool with_bias = (in_tensors.size() == 3);
  if (with_bias) {
    status = InitBiasConst(in_tensors);
    if (status != RET_OK) {
      MS_LOG(ERROR) << "Set bias for deconvolution op " << name_ << " failed when running npu";
      return RET_ERROR;
    }
    deconv_->set_input_bias(*bias_);
  }
  deconv_->set_input_x(*npu_inputs[0]);

  if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
    status = SetActivation(deconv_, act_type_);
    if (status != RET_OK) {
      MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
      return RET_ERROR;
    }
  }
  return RET_OK;
}
|
||||
|
||||
// Returns the operator that produces this op's output: the activation node
// when a fused activation was configured, otherwise the deconv node itself.
ge::Operator *DeconvolutionNPUOp::GetNPUOp() {
  return act_type_ == schema::ActivationType_NO_ACTIVATION ? deconv_ : act_;
}
|
||||
|
||||
// Releases the HiAI operator owned by this op. `delete` on nullptr is a no-op,
// so no guard is needed.
DeconvolutionNPUOp::~DeconvolutionNPUOp() {
  delete deconv_;
  deconv_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_DECONVOLUTION_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_DECONVOLUTION_NPU_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/convolution_base_npu.h"
|
||||
|
||||
namespace mindspore {
|
||||
class DeconvolutionNPUOp : public ConvolutionBaseNPUOp {
|
||||
public:
|
||||
DeconvolutionNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: ConvolutionBaseNPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
~DeconvolutionNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
int SetDeconvParam(const schema::Conv2dTransposeFusion *conv_prim);
|
||||
schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
|
||||
hiai::op::ConvTranspose *deconv_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_DECONVOLUTION_NPU_H_
|
|
@ -0,0 +1,59 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/eltwise_npu.h"
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
// Creates the HiAI Eltwise operator, sets its mode from the primitive and
// declares the (variable) number of inputs. Returns RET_OK on success.
int EltwiseNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                       const std::vector<tensor::MSTensor *> &out_tensors) {
  eltwise_ = new (std::nothrow) hiai::op::Eltwise(name_);
  if (eltwise_ == nullptr) {
    MS_LOG(ERROR) << name_ << " op is nullptr";
    return RET_ERROR;
  }
  auto eltwise_prim = primitive->value_as_Eltwise();
  if (eltwise_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op " << name_;
    return RET_ERROR;
  }
  eltwise_->set_attr_mode(ConverterToNPUEltwiseMode(eltwise_prim->mode()));
  // Eltwise takes a variable number of inputs; declare the count up front.
  // Explicit cast avoids the implicit size_t -> int narrowing of the original.
  auto input_num = static_cast<int>(in_tensors.size());
  eltwise_->create_dynamic_input_x(input_num);
  eltwise_->set_attr_N(input_num);
  return RET_OK;
}
|
||||
|
||||
// Binds every upstream operator to one of the Eltwise node's dynamic inputs.
// HiAI dynamic input indices are 1-based, hence the +1.
int EltwiseNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                               const std::vector<tensor::MSTensor *> &out_tensors,
                               const std::vector<ge::Operator *> &npu_inputs) {
  // size_t index avoids the signed/unsigned comparison of the original loop.
  for (size_t i = 0; i < npu_inputs.size(); ++i) {
    eltwise_->set_dynamic_input_x(static_cast<int>(i) + 1, *npu_inputs[i]);
  }
  return RET_OK;
}
|
||||
|
||||
// The eltwise node itself is the output of this op's subgraph.
ge::Operator *EltwiseNPUOp::GetNPUOp() {
  return eltwise_;
}
|
||||
|
||||
// Releases the owned HiAI operator; `delete` on nullptr is a no-op.
EltwiseNPUOp::~EltwiseNPUOp() {
  delete eltwise_;
  eltwise_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ELTWISE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ELTWISE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
class EltwiseNPUOp : public NPUOp {
|
||||
public:
|
||||
EltwiseNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~EltwiseNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
hiai::op::Eltwise *eltwise_ = nullptr;
|
||||
schema::EltwiseMode mode_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_ELTWISE_NPU_H_
|
|
@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/expand_dims_npu.h"
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
// Creates the HiAI ExpandDims operator; it has no attributes to configure.
int ExpandDimsNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                          const std::vector<tensor::MSTensor *> &out_tensors) {
  expand_dims_ = new (std::nothrow) hiai::op::ExpandDims(name_);
  if (expand_dims_ == nullptr) {
    MS_LOG(ERROR) << name_ << " op is nullptr";
    return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
// Wires the two graph inputs: input 0 is the data tensor, input 1 the axis.
int ExpandDimsNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                                  const std::vector<tensor::MSTensor *> &out_tensors,
                                  const std::vector<ge::Operator *> &npu_inputs) {
  expand_dims_->set_input_x(*npu_inputs[0]);
  expand_dims_->set_input_axis(*npu_inputs[1]);
  return RET_OK;
}
|
||||
|
||||
// The expand_dims node itself is the output of this op's subgraph.
ge::Operator *ExpandDimsNPUOp::GetNPUOp() {
  return expand_dims_;
}
|
||||
|
||||
// Releases the owned HiAI operator; `delete` on nullptr is a no-op.
ExpandDimsNPUOp::~ExpandDimsNPUOp() {
  delete expand_dims_;
  expand_dims_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_EXPAND_DIMS_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_EXPAND_DIMS_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
class ExpandDimsNPUOp : public NPUOp {
|
||||
public:
|
||||
ExpandDimsNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
~ExpandDimsNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
hiai::op::ExpandDims *expand_dims_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_EXPAND_DIMS_NPU_H_
|
|
@ -14,30 +14,26 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/kernel/npu/fullconnection_npu.h"
|
||||
#include "src/delegate/npu/op/fullconnection_npu.h"
|
||||
#include <memory>
|
||||
#include "src/kernel_registry.h"
|
||||
#include "src/runtime/agent/npu/npu_converter_utils.h"
|
||||
using mindspore::kernel::KERNEL_ARCH::kNPU;
|
||||
using mindspore::lite::KernelRegistrar;
|
||||
using mindspore::schema::PrimitiveType_FullConnection;
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
int FullconnectionNPUKernel::IsSupport(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter) {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int FullconnectionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
auto input_shape = inputs[0]->shape();
|
||||
namespace mindspore {
|
||||
int FullconnectionNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
auto fc_prim = primitive->value_as_FullConnection();
|
||||
if (fc_prim == nullptr) {
|
||||
MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
|
||||
return RET_ERROR;
|
||||
}
|
||||
act_type_ = fc_prim->activation_type();
|
||||
auto input_shape = in_tensors[0]->shape();
|
||||
reshape_ = new (std::nothrow) hiai::op::Reshape(name_ + "_reshape");
|
||||
if (reshape_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New reshape operator for fullconnection op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
reshape_->set_input_x(*npu_inputs[0]);
|
||||
|
||||
int col = 1;
|
||||
for (int i = 1; i < input_shape.size(); i++) {
|
||||
col *= input_shape[i];
|
||||
|
@ -55,6 +51,13 @@ int FullconnectionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inp
|
|||
MS_LOG(ERROR) << "New matmul operator for fullconnection op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int FullconnectionNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
reshape_->set_input_x(*npu_inputs[0]);
|
||||
fc_->set_input_x1(*reshape_);
|
||||
|
||||
weight_ = new (std::nothrow) hiai::op::Const(name_ + "_w");
|
||||
|
@ -62,20 +65,21 @@ int FullconnectionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inp
|
|||
MS_LOG(ERROR) << "New weight const failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
inputs[1]->set_format(mindspore::NCHW);
|
||||
auto weight_tensor = mindspore::lite::ConverterToNPUTensor(inputs[1]);
|
||||
auto weight_tensor = mindspore::ConverterToNPUTensor(in_tensors[1]);
|
||||
weight_->set_attr_value(weight_tensor);
|
||||
inputs[1]->set_format(mindspore::NHWC);
|
||||
fc_->set_input_x2(*weight_).set_attr_transpose_x2(true);
|
||||
|
||||
if (fc_param_->has_bias_) {
|
||||
if (in_tensors.size() >= 3) {
|
||||
has_bias_ = true;
|
||||
}
|
||||
if (has_bias_) {
|
||||
biasadd_ = new (std::nothrow) hiai::op::BiasAdd(name_ + "_biasadd");
|
||||
if (biasadd_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New biasadd operator for fullconnection op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
auto ret = InitBiasConst(inputs);
|
||||
auto ret = InitBiasConst(in_tensors);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Set bias for convolution op " << name_ << " failed when running npu";
|
||||
return RET_ERROR;
|
||||
|
@ -83,9 +87,8 @@ int FullconnectionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inp
|
|||
biasadd_->set_input_x(*fc_).set_input_bias(*bias_);
|
||||
}
|
||||
|
||||
if (fc_param_->act_type_ != ActType_No) {
|
||||
auto ret =
|
||||
biasadd_ == nullptr ? SetActivation(fc_, fc_param_->act_type_) : SetActivation(biasadd_, fc_param_->act_type_);
|
||||
if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
|
||||
auto ret = biasadd_ == nullptr ? SetActivation(fc_, act_type_) : SetActivation(biasadd_, act_type_);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
|
||||
return RET_ERROR;
|
||||
|
@ -94,17 +97,17 @@ int FullconnectionNPUKernel::SetNPUInputs(const std::vector<lite::Tensor *> &inp
|
|||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *mindspore::kernel::FullconnectionNPUKernel::GetNPUOp() {
|
||||
if (fc_param_->act_type_ != ActType_No) {
|
||||
ge::Operator *FullconnectionNPUOp::GetNPUOp() {
|
||||
if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
|
||||
return act_;
|
||||
}
|
||||
if (fc_param_->has_bias_) {
|
||||
if (has_bias_) {
|
||||
return biasadd_;
|
||||
}
|
||||
return fc_;
|
||||
}
|
||||
|
||||
FullconnectionNPUKernel::~FullconnectionNPUKernel() {
|
||||
FullconnectionNPUOp::~FullconnectionNPUOp() {
|
||||
if (reshape_ != nullptr) {
|
||||
delete reshape_;
|
||||
reshape_ = nullptr;
|
||||
|
@ -122,5 +125,4 @@ FullconnectionNPUKernel::~FullconnectionNPUKernel() {
|
|||
reshape_op_ = nullptr;
|
||||
}
|
||||
}
|
||||
REG_KERNEL(kNPU, kNumberTypeFloat32, PrimitiveType_FullConnection, NPUKernelCreator<FullconnectionNPUKernel>)
|
||||
} // namespace mindspore::kernel
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,56 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_FULLCONNECTION_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_FULLCONNECTION_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/convolution_base_npu.h"
|
||||
|
||||
namespace mindspore {
|
||||
class FullconnectionNPUOp : public ConvolutionBaseNPUOp {
|
||||
public:
|
||||
FullconnectionNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: ConvolutionBaseNPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~FullconnectionNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
|
||||
bool has_bias_ = false;
|
||||
hiai::op::Reshape *reshape_ = nullptr;
|
||||
hiai::op::MatMul *fc_ = nullptr;
|
||||
hiai::op::BiasAdd *biasadd_ = nullptr;
|
||||
hiai::op::Const *reshape_op_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_FULLCONNECTION_NPU_H_
|
|
@ -0,0 +1,62 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/gather_npu.h"
|
||||
|
||||
namespace mindspore {
|
||||
// Checks NPU support for Gather: indices must be Int32 and the axis must be a
// known scalar constant (NPU GatherV2D takes axis as an attribute, not as an
// input). Caches the axis value in axis_ for Init.
//
// Returns RET_OK when supported, RET_NOT_SUPPORT otherwise.
int GatherNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                           const std::vector<tensor::MSTensor *> &out_tensors) {
  if (in_tensors[1]->data_type() != kNumberTypeInt32) {
    MS_LOG(WARNING) << "Gather indices only support Int32";
    return RET_NOT_SUPPORT;
  }
  // Guard data() as well: the axis tensor may not be a materialized constant,
  // in which case the original code dereferenced a null pointer.
  if (in_tensors.size() >= 3 && in_tensors[2]->ElementsNum() == 1 && in_tensors[2]->data() != nullptr) {
    axis_ = static_cast<int *>(in_tensors[2]->data())[0];
  } else {
    MS_LOG(WARNING) << "NPU axis is attribute.";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
// Creates the HiAI GatherV2D operator and sets the axis attribute cached by
// IsSupport.
int GatherNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                      const std::vector<tensor::MSTensor *> &out_tensors) {
  gather_ = new (std::nothrow) hiai::op::GatherV2D(name_);
  if (gather_ == nullptr) {
    MS_LOG(ERROR) << name_ << " op is nullptr";
    return RET_ERROR;
  }
  gather_->set_attr_axis(axis_);
  return RET_OK;
}
|
||||
|
||||
// Wires the data (input 0) and indices (input 1) operators into the gather node.
int GatherNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                              const std::vector<tensor::MSTensor *> &out_tensors,
                              const std::vector<ge::Operator *> &npu_inputs) {
  gather_->set_input_x(*npu_inputs[0]);
  gather_->set_input_indices(*npu_inputs[1]);
  return RET_OK;
}
|
||||
|
||||
// The gather node itself is the output of this op's subgraph.
ge::Operator *GatherNPUOp::GetNPUOp() {
  return gather_;
}
|
||||
|
||||
// Releases the owned HiAI operator; `delete` on nullptr is a no-op.
GatherNPUOp::~GatherNPUOp() {
  delete gather_;
  gather_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_GATHER_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_GATHER_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
class GatherNPUOp : public NPUOp {
|
||||
public:
|
||||
GatherNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~GatherNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
hiai::op::GatherV2D *gather_ = nullptr;
|
||||
int axis_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_GATHER_NPU_H_
|
|
@ -0,0 +1,97 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/instance_norm_npu.h"
|
||||
#include <memory>
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
// Creates the HiAI InstanceNorm operator and copies the epsilon attribute from
// the flatbuffer primitive. Returns RET_OK on success.
int InstanceNormNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                            const std::vector<tensor::MSTensor *> &out_tensors) {
  instance_norm_ = new (std::nothrow) hiai::op::InstanceNorm(name_);
  if (instance_norm_ == nullptr) {
    MS_LOG(ERROR) << "New instance norm npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }
  auto instance_norm_prim = primitive->value_as_InstanceNorm();
  if (instance_norm_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op " << name_;
    return RET_ERROR;
  }
  instance_norm_->set_attr_epsilon(instance_norm_prim->epsilon());
  return RET_OK;
}
|
||||
|
||||
// Wires the data input and materializes gamma (in_tensors[1]) and beta
// (in_tensors[2]) as NCHW {1, C, 1, 1} const operators feeding the
// InstanceNorm node. The gamma and beta paths were previously duplicated
// verbatim; they are factored into one local helper here.
int InstanceNormNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                                    const std::vector<tensor::MSTensor *> &out_tensors,
                                    const std::vector<ge::Operator *> &npu_inputs) {
  instance_norm_->set_input_x(*npu_inputs[0]);

  // Builds a const op named "<name_>_<tag>" holding `tensor` reshaped to
  // {1, C, 1, 1}. Returns nullptr (after logging) on allocation failure.
  auto make_channel_const = [this](tensor::MSTensor *tensor, const char *tag) -> hiai::op::Const * {
    auto shape = tensor->shape();
    std::shared_ptr<ge::Tensor> ge_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
    if (ge_tensor == nullptr) {
      MS_LOG(ERROR) << "new " << tag << "_tensor failed.";
      return nullptr;
    }
    ge::TensorDesc tensor_desc(ConverterToNPUShape({1, shape[0], 1, 1}), ge::FORMAT_NCHW,
                               ConverterToNPUDataType(tensor->data_type()));
    ge_tensor->SetTensorDesc(tensor_desc);
    ge_tensor->SetData(reinterpret_cast<const uint8_t *>(tensor->data()), tensor->Size());
    auto const_op = new (std::nothrow) hiai::op::Const(name_ + "_" + tag);
    if (const_op == nullptr) {
      MS_LOG(ERROR) << "New " << tag << "_ const failed.";
      return nullptr;
    }
    const_op->set_attr_value(ge_tensor);
    return const_op;
  };

  gamma_ = make_channel_const(in_tensors[1], "gamma");
  if (gamma_ == nullptr) {
    return RET_ERROR;
  }
  instance_norm_->set_input_gamma(*gamma_);

  beta_ = make_channel_const(in_tensors[2], "beta");
  if (beta_ == nullptr) {
    return RET_ERROR;
  }
  instance_norm_->set_input_beta(*beta_);
  return RET_OK;
}
|
||||
|
||||
// The instance-norm node itself is the output of this op's subgraph.
ge::Operator *InstanceNormNPUOp::GetNPUOp() {
  return instance_norm_;
}
|
||||
|
||||
// Releases all owned HiAI operators; `delete` on nullptr is a no-op.
InstanceNormNPUOp::~InstanceNormNPUOp() {
  delete instance_norm_;
  instance_norm_ = nullptr;
  delete gamma_;
  gamma_ = nullptr;
  delete beta_;
  beta_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_INSTANCE_NORM_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_INSTANCE_NORM_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
class InstanceNormNPUOp : public NPUOp {
|
||||
public:
|
||||
InstanceNormNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~InstanceNormNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
hiai::op::InstanceNorm *instance_norm_ = nullptr;
|
||||
hiai::op::Const *gamma_ = nullptr;
|
||||
hiai::op::Const *beta_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_INSTANCE_NORM_NPU_H_
|
|
@ -0,0 +1,108 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/matmul_npu.h"
|
||||
#include <memory>
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
namespace mindspore {
|
||||
/// \brief Check whether this MatMul can run on the NPU.
///
/// A third input (the bias) is only accepted when it is one-dimensional;
/// everything else is supported unconditionally.
int MatMulNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                           const std::vector<tensor::MSTensor *> &out_tensors) {
  const bool has_bias_input = in_tensors.size() == 3;
  if (has_bias_input && in_tensors[2]->shape().size() != 1) {
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
/// \brief Build the HiAI MatMul node and copy transpose attributes from the primitive.
///
/// \return RET_OK on success, RET_ERROR if allocation fails or the primitive is null.
int MatMulNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                      const std::vector<tensor::MSTensor *> &out_tensors) {
  matmul_ = new (std::nothrow) hiai::op::MatMul(name_);
  if (matmul_ == nullptr) {
    MS_LOG(ERROR) << "New matmul npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }
  // A third tensor means a bias is present; SetNPUInputs will append an Add node.
  has_bias_ = in_tensors.size() == 3;
  auto prim = primitive->value_as_MatMul();
  if (prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  matmul_->set_attr_transpose_x1(prim->transpose_a());
  matmul_->set_attr_transpose_x2(prim->transpose_b());
  return RET_OK;
}
|
||||
|
||||
/// \brief Wire converted NPU inputs into the MatMul node, appending an Add node for the bias.
///
/// npu_inputs[0]/[1] feed MatMul x1/x2. When a bias exists, a Const node is
/// built from in_tensors[2]'s raw data and added to the MatMul result.
/// \return RET_OK on success, RET_ERROR if any node allocation fails.
int MatMulNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                              const std::vector<tensor::MSTensor *> &out_tensors,
                              const std::vector<ge::Operator *> &npu_inputs) {
  matmul_->set_input_x1(*npu_inputs[0]);
  matmul_->set_input_x2(*npu_inputs[1]);
  if (has_bias_) {
    // Bias is fused as: output = Add(MatMul(x1, x2), bias_const).
    add_op_ = new (std::nothrow) hiai::op::Add(name_ + "_add");
    if (add_op_ == nullptr) {
      MS_LOG(ERROR) << "new add op failed.";
      return RET_ERROR;
    }
    add_op_->set_input_x1(*matmul_);
    auto bias_shape = in_tensors[2]->shape();
    auto bias_tensor = std::make_shared<ge::Tensor>();
    if (bias_tensor == nullptr) {
      MS_LOG(ERROR) << "new bias_tensor failed.";
      return RET_ERROR;
    }
    // Default: broadcast the 1-D bias along channels of a 4D NCHW output.
    ge::TensorDesc bias_tensor_desc(ConverterToNPUShape({1, bias_shape[0], 1, 1}), ge::FORMAT_NCHW,
                                    ConverterToNPUDataType(in_tensors[2]->data_type()));
    if (out_tensors[0]->shape().size() == 2) {
      // 2-D output: use a {1, C} shape so the Add broadcast matches.
      bias_tensor_desc.SetShape(ConverterToNPUShape({1, bias_shape[0]}));
    }
    bias_tensor->SetTensorDesc(bias_tensor_desc);
    bias_tensor->SetData(reinterpret_cast<const uint8_t *>(in_tensors[2]->data()), in_tensors[2]->Size());
    bias_ = new (std::nothrow) hiai::op::Const(name_ + "_bias");
    if (bias_ == nullptr) {
      MS_LOG(ERROR) << "new bias const failed.";
      return RET_ERROR;
    }
    bias_->set_attr_value(bias_tensor);
    add_op_->set_input_x2(*bias_);
  }
  return RET_OK;
}
|
||||
|
||||
/// \brief Return the node producing this op's output: the Add when a bias is
/// fused, otherwise the MatMul itself.
ge::Operator *MatMulNPUOp::GetNPUOp() {
  return has_bias_ ? static_cast<ge::Operator *>(add_op_) : static_cast<ge::Operator *>(matmul_);
}
|
||||
|
||||
/// \brief Release the owned HiAI nodes. `delete` on a null pointer is a
/// no-op, so no guards are needed; pointers are reset defensively.
MatMulNPUOp::~MatMulNPUOp() {
  delete matmul_;
  matmul_ = nullptr;
  delete add_op_;
  add_op_ = nullptr;
  delete bias_;
  bias_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_MATMUL_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_MATMUL_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
/// \brief NPU delegate operator wrapping HiAI MatMul, with optional fused bias Add.
class MatMulNPUOp : public NPUOp {
 public:
  MatMulNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
              const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~MatMulNPUOp() override;

  /// \brief Supported unless a bias input exists and is not one-dimensional.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Create the MatMul node and set transpose attributes.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Connect NPU inputs; builds the bias Const and Add nodes when needed.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// \brief Return the Add node when a bias is fused, otherwise the MatMul node.
  ge::Operator *GetNPUOp() override;

 private:
  bool has_bias_ = false;  // set in Init when a third (bias) input exists
  // Owned HiAI graph nodes; released in the destructor.
  hiai::op::MatMul *matmul_ = nullptr;
  hiai::op::Add *add_op_ = nullptr;
  hiai::op::Const *bias_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_MATMUL_NPU_H_
|
|
@ -0,0 +1,123 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/max_pooling_npu.h"
|
||||
namespace mindspore {
|
||||
/// \brief Check whether this max pooling can run on the NPU.
///
/// Rejects the op when pad exceeds stride (HiAI limitation). The optional
/// flatbuffer fields strides()/pad() were previously dereferenced without a
/// null check and crashed on models that omit them; such models are now
/// reported as unsupported instead.
/// \return RET_OK when supported, RET_NOT_SUPPORT otherwise, RET_ERROR on a
///         null primitive.
int MaxPoolingNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                               const std::vector<tensor::MSTensor *> &out_tensors) {
  auto pooling_prim = primitive->value_as_MaxPoolFusion();
  if (pooling_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  // strides/pad are optional flatbuffer fields; guard before dereferencing.
  if (pooling_prim->strides() == nullptr || pooling_prim->pad() == nullptr) {
    MS_LOG(WARNING) << "Npu pooling needs explicit strides and pad attributes.";
    return RET_NOT_SUPPORT;
  }
  auto stride_h = static_cast<int>(*(pooling_prim->strides()->begin()));
  auto stride_w = static_cast<int>(*(pooling_prim->strides()->begin() + 1));
  auto pad_u = static_cast<int>(*(pooling_prim->pad()->begin()));
  auto pad_l = static_cast<int>(*(pooling_prim->pad()->begin() + 2));
  if (pad_u > stride_h || pad_l > stride_w) {
    MS_LOG(WARNING) << "Npu pooling does not support pad > stride.";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
/// \brief Translate MaxPoolFusion attributes onto the HiAI PoolingD node.
///
/// Copies window, stride, pad mode/values and rounding behavior.
/// NOTE(review): the integer attribute values (mode 0, pad_mode 6/5/0,
/// ceil/data modes) follow HiAI PoolingD conventions — presumably mode 0 is
/// max pooling and pad_mode 6/5 map to SAME/VALID; confirm against the HiAI
/// DDK operator reference.
int MaxPoolingNPUOp::SetPoolingParam(const schema::MaxPoolFusion *pooling_prim) {
  pooling_->set_attr_mode(0);
  if (pooling_prim->global()) {
    // Global pooling: the kernel window is implied by the input size.
    pooling_->set_attr_global_pooling(pooling_prim->global());
  } else {
    auto window_h = static_cast<int>(*(pooling_prim->kernel_size()->begin()));
    auto window_w = static_cast<int>(*(pooling_prim->kernel_size()->begin() + 1));
    pooling_->set_attr_window(ge::AttrValue::LIST_INT({window_h, window_w}));
  }
  auto stride_h = static_cast<int>(*(pooling_prim->strides()->begin()));
  auto stride_w = static_cast<int>(*(pooling_prim->strides()->begin() + 1));
  pooling_->set_attr_stride(ge::AttrValue::LIST_INT({stride_h, stride_w}));
  if (pooling_prim->pad_mode() == schema::PadMode_SAME) {
    pooling_->set_attr_pad_mode(6);
    pooling_->set_attr_pad({0, 0, 0, 0});
  } else if (pooling_prim->pad_mode() == schema::PadMode_VALID) {
    pooling_->set_attr_pad_mode(5);
    pooling_->set_attr_pad({0, 0, 0, 0});
  } else {
    // Explicit padding: forward the four pad values (up, down, left, right).
    pooling_->set_attr_pad_mode(0);
    auto pad_u = static_cast<int>(*(pooling_prim->pad()->begin()));
    auto pad_d = static_cast<int>(*(pooling_prim->pad()->begin() + 1));
    auto pad_l = static_cast<int>(*(pooling_prim->pad()->begin() + 2));
    auto pad_r = static_cast<int>(*(pooling_prim->pad()->begin() + 3));
    pooling_->set_attr_pad(ge::AttrValue::LIST_INT({pad_u, pad_d, pad_l, pad_r}));
  }

  if (pooling_prim->round_mode() == schema::RoundMode_FLOOR) {  // no use in cpu
    pooling_->set_attr_ceil_mode(0);
    pooling_->set_attr_data_mode(1);
  } else {
    pooling_->set_attr_ceil_mode(1);
    pooling_->set_attr_data_mode(0);
  }
  return RET_OK;
}
|
||||
|
||||
/// \brief Build the HiAI PoolingD node, set its attributes from the primitive,
/// and append an activation node when the fusion carries one.
///
/// Fix: the parameter-setting failure log claimed "convolution op" although
/// this is the max pooling operator.
/// \return RET_OK on success, RET_ERROR on allocation/primitive/attribute failure.
int MaxPoolingNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                          const std::vector<tensor::MSTensor *> &out_tensors) {
  pooling_ = new (std::nothrow) hiai::op::PoolingD(name_ + "_pooling");
  if (pooling_ == nullptr) {
    MS_LOG(ERROR) << "New pooling npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }
  auto pooling_prim = primitive->value_as_MaxPoolFusion();
  if (pooling_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  auto ret = SetPoolingParam(pooling_prim);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Set npu op parameter for max pooling op " << name_ << " failed.";
    return RET_ERROR;
  }
  // A fused activation becomes a separate NPU node; GetNPUOp then returns it.
  act_type_ = pooling_prim->activation_type();
  if (act_type_ != schema::ActivationType_NO_ACTIVATION) {
    ret = SetActivation(pooling_, pooling_prim->activation_type());
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
      return RET_ERROR;
    }
  }
  return RET_OK;
}
|
||||
|
||||
/// \brief Pooling consumes exactly one NPU input tensor.
int MaxPoolingNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                                  const std::vector<tensor::MSTensor *> &out_tensors,
                                  const std::vector<ge::Operator *> &npu_inputs) {
  pooling_->set_input_x(*npu_inputs.front());
  return RET_OK;
}
|
||||
|
||||
/// \brief Return the activation node when one was fused, otherwise the pooling node.
ge::Operator *MaxPoolingNPUOp::GetNPUOp() {
  const bool no_act = act_type_ == schema::ActivationType_NO_ACTIVATION;
  return no_act ? static_cast<ge::Operator *>(pooling_) : static_cast<ge::Operator *>(act_);
}
|
||||
|
||||
/// \brief Release the owned pooling node. `delete` on nullptr is a no-op.
MaxPoolingNPUOp::~MaxPoolingNPUOp() {
  delete pooling_;
  pooling_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_MAX_POOLING_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_MAX_POOLING_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/convolution_base_npu.h"
|
||||
namespace mindspore {
|
||||
/// \brief NPU delegate operator wrapping HiAI PoolingD in max-pooling mode.
///
/// Derives from ConvolutionBaseNPUOp to reuse its activation-node support
/// (SetActivation / act_).
class MaxPoolingNPUOp : public ConvolutionBaseNPUOp {
 public:
  MaxPoolingNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                  const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : ConvolutionBaseNPUOp(primitive, in_tensors, out_tensors, name) {}

  ~MaxPoolingNPUOp() override;

  /// \brief Unsupported when pad exceeds stride.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Create the PoolingD node, set attributes, add optional activation.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Connect the single NPU input to the pooling node.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// \brief Return the activation node when fused, otherwise the pooling node.
  ge::Operator *GetNPUOp() override;

 private:
  // Copy MaxPoolFusion attributes (window/stride/pad/rounding) onto pooling_.
  int SetPoolingParam(const schema::MaxPoolFusion *pooling_prim);
  schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
  hiai::op::PoolingD *pooling_ = nullptr;  // owned; released in the destructor
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_MAX_POOLING_NPU_H_
|
|
@ -0,0 +1,155 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_NPU_OP_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_NPU_OP_
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include "include/errorcode.h"
|
||||
#include "include/ms_tensor.h"
|
||||
#include "schema/model_generated.h"
|
||||
#include "src/common/log_adapter.h"
|
||||
#include "include/graph/graph.h"
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_NOT_SUPPORT;
|
||||
using mindspore::lite::RET_OK;
|
||||
namespace mindspore {
|
||||
class NPUOp {
|
||||
public:
|
||||
NPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: inputs_(std::move(in_tensors)), outputs_(std::move(out_tensors)), name_(name) {
|
||||
if (primitive != nullptr) {
|
||||
type_ = primitive->value_type();
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~NPUOp() = default;
|
||||
|
||||
virtual int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
virtual int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
virtual int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
virtual int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs,
|
||||
const std::unordered_map<int, std::pair<ge::Operator *, int>> &index2_multi_out_index) {
|
||||
if (index2_multi_out_index.empty()) {
|
||||
return SetNPUInputs(in_tensors, out_tensors, npu_inputs);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
virtual ge::Operator *GetNPUOp() { return nullptr; }
|
||||
|
||||
void set_inputs(const std::vector<mindspore::tensor::MSTensor *> &in_tensors) { this->inputs_ = in_tensors; }
|
||||
|
||||
void set_input(mindspore::tensor::MSTensor *in_tensor, int index) {
|
||||
MS_ASSERT(index < inputs_.size());
|
||||
this->inputs_[index] = in_tensor;
|
||||
}
|
||||
|
||||
void set_outputs(const std::vector<mindspore::tensor::MSTensor *> &out_tensors) { this->outputs_ = out_tensors; }
|
||||
|
||||
const std::vector<mindspore::tensor::MSTensor *> &inputs() { return this->inputs_; }
|
||||
|
||||
const std::vector<mindspore::tensor::MSTensor *> &outputs() { return this->outputs_; }
|
||||
|
||||
void set_in_ops(const std::vector<NPUOp *> &in_ops) { this->in_ops_ = in_ops; }
|
||||
|
||||
void set_out_ops(const std::vector<NPUOp *> &out_ops) { this->out_ops_ = out_ops; }
|
||||
|
||||
const std::vector<NPUOp *> &in_ops() const { return this->in_ops_; }
|
||||
|
||||
const std::vector<NPUOp *> &out_ops() const { return this->out_ops_; }
|
||||
|
||||
schema::PrimitiveType type() const { return type_; }
|
||||
|
||||
std::string name() const { return this->name_; }
|
||||
|
||||
void set_name(const std::string &name) { this->name_ = name; }
|
||||
|
||||
protected:
|
||||
std::vector<mindspore::tensor::MSTensor *> inputs_;
|
||||
std::vector<mindspore::tensor::MSTensor *> outputs_;
|
||||
std::vector<NPUOp *> in_ops_;
|
||||
std::vector<NPUOp *> out_ops_;
|
||||
schema::PrimitiveType type_ = schema::PrimitiveType_NONE;
|
||||
std::string name_;
|
||||
};
|
||||
|
||||
/// \brief Factory signature used to construct a concrete NPUOp from a primitive.
/// Modernized from `typedef` to a `using` alias (file already targets C++11+).
using NPUGetOp = NPUOp *(*)(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                            const std::vector<tensor::MSTensor *> &out_tensors, std::string name);
|
||||
|
||||
template <class T>
|
||||
NPUOp *GetNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name) {
|
||||
auto shape = out_tensors.front()->shape();
|
||||
if (std::find(shape.begin(), shape.end(), -1) != shape.end()) {
|
||||
MS_LOG(ERROR) << "NPU does not support runtime inference shape.";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (in_tensors[0]->shape().size() > 4) {
|
||||
MS_LOG(ERROR) << "Npu does not support input tensor dims greater than 4";
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::set<schema::PrimitiveType> int32_lists = {schema::PrimitiveType_Cast, schema::PrimitiveType_StridedSlice};
|
||||
auto support_int32 = in_tensors[0]->data_type() == kNumberTypeInt32 &&
|
||||
find(int32_lists.begin(), int32_lists.end(), primitive->value_type()) != int32_lists.end();
|
||||
if (in_tensors[0]->data_type() != kNumberTypeFloat32 && !support_int32) {
|
||||
MS_LOG(ERROR) << "Npu does not support datatype " << in_tensors[0]->data_type() << " for op type "
|
||||
<< primitive->value_type();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto *op = new (std::nothrow) T(primitive, in_tensors, out_tensors, name);
|
||||
if (op == nullptr) {
|
||||
MS_LOG(ERROR) << "op is nullptr.";
|
||||
return nullptr;
|
||||
}
|
||||
auto ret = op->IsSupport(primitive, in_tensors, out_tensors);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(WARNING) << "NPU op is not supported.";
|
||||
delete op;
|
||||
return nullptr;
|
||||
}
|
||||
ret = op->Init(primitive, in_tensors, out_tensors);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(WARNING) << "NPU op init failed.";
|
||||
delete op;
|
||||
return nullptr;
|
||||
}
|
||||
return op;
|
||||
}
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_NPU_OP_
|
|
@ -0,0 +1,138 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/pad_npu.h"
|
||||
#include <memory>
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
/// \brief Check whether this Pad can run on the NPU.
///
/// Requires CONSTANT padding mode and a compile-time-known pad size (either
/// a paddings attribute or a constant second input tensor).
/// Fix: the "non-constant pad size" path logged a WARNING but returned
/// RET_ERROR; it now returns RET_NOT_SUPPORT like every other unsupported
/// path, so the kernel cleanly falls back to the CPU.
int PadNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                        const std::vector<tensor::MSTensor *> &out_tensors) {
  auto pad_prim = primitive->value_as_PadFusion();
  if (pad_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  if (pad_prim->padding_mode() != schema::PaddingMode_CONSTANT) {
    MS_LOG(WARNING) << "NPU only support CONSTANT padding mode";
    return RET_NOT_SUPPORT;
  }
  if (pad_prim->paddings() != nullptr) {
    return RET_OK;
  }
  // Pad sizes may alternatively arrive as a constant second input tensor.
  if (in_tensors.size() >= 2 && in_tensors[1]->data() != nullptr) {
    return RET_OK;
  }
  MS_LOG(WARNING) << "NPU pad only support constant pad size.";
  return RET_NOT_SUPPORT;
}
|
||||
|
||||
/// \brief Build the HiAI PadV2 node, collect the pad sizes into paddings_vec_,
/// and attach the constant fill value.
///
/// Fix: the fill-value Const node was allocated with a bare `new` and never
/// null-checked, unlike every other allocation in this file; it now uses
/// new(std::nothrow) plus an explicit check.
/// \return RET_OK on success, RET_ERROR on allocation/primitive/pad-source failure.
int PadNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors) {
  pad_ = new (std::nothrow) hiai::op::PadV2(name_);
  if (pad_ == nullptr) {
    MS_LOG(ERROR) << name_ << " op is nullptr";
    return RET_ERROR;
  }
  auto pad_prim = primitive->value_as_PadFusion();
  if (pad_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  if (pad_prim->paddings() != nullptr) {
    // Flatten the nested per-axis [before, after] pairs into paddings_vec_.
    auto fb_paddings = pad_prim->paddings()->data();
    if (fb_paddings == nullptr) {
      MS_LOG(ERROR) << "paddings is nullptr";
      return RET_ERROR;
    }
    for (auto fb_padding : *fb_paddings) {
      auto paddings_data = fb_padding->data();
      if (paddings_data == nullptr) {
        MS_LOG(ERROR) << "paddings_data is nullptr";
        return RET_ERROR;
      }
      auto paddings = std::vector<int64_t>(paddings_data->begin(), paddings_data->end());
      paddings_vec_.insert(paddings_vec_.end(), paddings.begin(), paddings.end());
    }
  } else if (in_tensors.size() >= 2 && in_tensors[1]->data() != nullptr) {
    // Pad sizes supplied as a constant input tensor of int32.
    for (int i = 0; i < in_tensors[1]->ElementsNum(); i++) {
      paddings_vec_.push_back(static_cast<int *>(in_tensors[1]->data())[i]);
    }
  } else {
    MS_LOG(ERROR) << "NPU pad only support constant pad size.";
    return RET_ERROR;
  }

  // Wrap the scalar fill value in a 1-element Const node feeding PadV2.
  ge::TensorDesc constant_values_tensor_desc(ge::Shape({1}), ge::FORMAT_NCHW, ge::DT_FLOAT);
  ge::TensorPtr constant_values_tensor = std::make_shared<hiai::Tensor>(constant_values_tensor_desc);
  vector<float> constant_values_data_value = {pad_prim->constant_value()};
  constant_values_tensor->SetData(reinterpret_cast<uint8_t *>(constant_values_data_value.data()), 1 * sizeof(float));
  constant_value_ = new (std::nothrow) hiai::op::Const(name_ + "constant");
  if (constant_value_ == nullptr) {
    MS_LOG(ERROR) << "New constant value const for op " << name_ << " failed.";
    return RET_ERROR;
  }
  constant_value_->set_attr_value(constant_values_tensor);
  pad_->set_input_constant_values(*constant_value_);
  return RET_OK;
}
|
||||
|
||||
/// \brief Build the paddings Const node (shape {N, 2}) from paddings_vec_ and
/// connect it, together with the data input, to the PadV2 node.
///
/// Fix: paddings_ was allocated with a bare `new` and never null-checked; it
/// now uses new(std::nothrow) plus an explicit check, matching the file's
/// convention.
int PadNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                           const std::vector<tensor::MSTensor *> &out_tensors,
                           const std::vector<ge::Operator *> &npu_inputs) {
  // paddings_vec_ is a flat [before, after] list: one pair per axis.
  int size = static_cast<int>(paddings_vec_.size() / 2);
  ge::TensorDesc padding_tensor_desc(ge::Shape({size, 2}), ge::FORMAT_NCHW, ge::DT_INT32);
  ge::TensorPtr padding_tensor = std::make_shared<hiai::Tensor>(padding_tensor_desc);
  padding_tensor->SetData(reinterpret_cast<uint8_t *>(paddings_vec_.data()), 2 * size * sizeof(int));
  paddings_ = new (std::nothrow) hiai::op::Const(name_ + "paddings");
  if (paddings_ == nullptr) {
    MS_LOG(ERROR) << "New paddings const for op " << name_ << " failed.";
    return RET_ERROR;
  }
  paddings_->set_attr_value(padding_tensor);
  pad_->set_input_paddings(*paddings_);

  pad_->set_input_x(*npu_inputs[0]);
  return RET_OK;
}
|
||||
|
||||
/// \brief Return the PadV2 node producing this op's output.
ge::Operator *PadNPUOp::GetNPUOp() { return pad_; }
|
||||
|
||||
int PadNPUOp::HandleAxis() {
|
||||
if (paddings_vec_.size() != 8) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
int c1 = paddings_vec_[6];
|
||||
int c2 = paddings_vec_[7];
|
||||
// 0 1 2 3 4 5 6 7
|
||||
// n n h h w w c c
|
||||
// n n c c h h w w
|
||||
paddings_vec_[6] = paddings_vec_[4];
|
||||
paddings_vec_[7] = paddings_vec_[5];
|
||||
paddings_vec_[4] = paddings_vec_[2];
|
||||
paddings_vec_[5] = paddings_vec_[3];
|
||||
paddings_vec_[2] = c1;
|
||||
paddings_vec_[3] = c2;
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
/// \brief Release the owned HiAI nodes. `delete` on nullptr is a no-op, so
/// no guards are needed; pointers are reset defensively.
PadNPUOp::~PadNPUOp() {
  delete pad_;
  pad_ = nullptr;
  delete paddings_;
  paddings_ = nullptr;
  delete constant_value_;
  constant_value_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,54 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_PAD_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_PAD_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
/// \brief NPU delegate operator wrapping HiAI PadV2 (CONSTANT padding only).
class PadNPUOp : public NPUOp {
 public:
  PadNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~PadNPUOp() override;

  /// \brief Supported only for CONSTANT mode with a compile-time pad size.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Create the PadV2 node, gather pad sizes, attach the fill value.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Build the paddings Const node and connect all PadV2 inputs.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// \brief Return the PadV2 node producing this op's output.
  ge::Operator *GetNPUOp() override;

  /// \brief Reorder paddings_vec_ from NHWC to NCHW axis order (4D only).
  int HandleAxis();

 private:
  // Owned HiAI graph nodes; released in the destructor.
  hiai::op::PadV2 *pad_ = nullptr;
  hiai::op::Const *paddings_ = nullptr;
  hiai::op::Const *constant_value_ = nullptr;
  std::vector<int> paddings_vec_;  // flat [before, after] pairs, one per axis
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_PAD_NPU_H_
|
|
@ -0,0 +1,80 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/reduce_npu.h"
|
||||
#include <memory>
|
||||
|
||||
namespace mindspore {
|
||||
/// \brief Check whether this reduction can run on the NPU.
///
/// Only ReduceMean without the reduce_to_end attribute is supported; the
/// reduce mode is cached for Init to pick the concrete HiAI node.
int ReduceNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                           const std::vector<tensor::MSTensor *> &out_tensors) {
  auto prim = primitive->value_as_ReduceFusion();
  if (prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  reduce_mode_ = prim->mode();
  if (reduce_mode_ != schema::ReduceMode_ReduceMean) {
    MS_LOG(WARNING) << "Npu does not support reduce mode " << prim->mode() << " for op " << name_;
    return RET_NOT_SUPPORT;
  }
  if (prim->reduce_to_end()) {
    MS_LOG(WARNING) << "Npu reduce op does not support attribute reduce_to_end";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
/// \brief Create the concrete HiAI reduce node for the cached reduce mode.
///
/// Fix: after `new (std::nothrow) hiai::op::ReduceMean`, the null check
/// inspected `reduce_` — which is still nullptr at that point — instead of
/// the freshly allocated `reduce_mean`, so an allocation failure was never
/// detected (and a success would have been misreported before `reduce_` was
/// assigned). The check now tests the allocation result.
/// \return RET_OK on success, RET_ERROR on allocation failure, null primitive,
///         or an unsupported reduce mode.
int ReduceNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                      const std::vector<tensor::MSTensor *> &out_tensors) {
  auto reduce_prim = primitive->value_as_ReduceFusion();
  if (reduce_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  if (reduce_mode_ == schema::ReduceMode_ReduceMean) {
    auto reduce_mean = new (std::nothrow) hiai::op::ReduceMean(name_);
    if (reduce_mean == nullptr) {
      MS_LOG(ERROR) << "New reduce operator for op " << name_ << " failed.";
      return RET_ERROR;
    }
    reduce_mean->set_attr_keep_dims(reduce_prim->keep_dims());
    reduce_ = reduce_mean;
  } else {
    // IsSupport already rejects other modes; this is a defensive guard.
    MS_LOG(ERROR) << "Npu does not support reduce mode " << reduce_prim->mode() << " for op " << name_;
    return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
/// \brief Connect the data input and the axes input to the reduce node.
///
/// reduce_ is stored as a generic ge::Operator*; it is cast back to the
/// concrete ReduceMean type (guaranteed by Init for this mode) to reach the
/// typed input setters. npu_inputs[1] carries the reduction axes.
int ReduceNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                              const std::vector<tensor::MSTensor *> &out_tensors,
                              const std::vector<ge::Operator *> &npu_inputs) {
  if (reduce_mode_ == schema::ReduceMode_ReduceMean) {
    auto reduce_mean = reinterpret_cast<hiai::op::ReduceMean *>(reduce_);
    reduce_mean->set_input_x(*npu_inputs[0]).set_input_axes(*npu_inputs[1]);
  }
  return RET_OK;
}
|
||||
|
||||
/// \brief Return the reduce node producing this op's output.
ge::Operator *ReduceNPUOp::GetNPUOp() { return reduce_; }
|
||||
|
||||
/// \brief Release the owned reduce node. `delete` on nullptr is a no-op.
ReduceNPUOp::~ReduceNPUOp() {
  delete reduce_;
  reduce_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_REDUCE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_REDUCE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
/// NPU delegate op that maps a ReduceFusion primitive (currently only
/// ReduceMean) onto the HiAI hiai::op::ReduceMean operator.
class ReduceNPUOp : public NPUOp {
 public:
  ReduceNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
              const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~ReduceNPUOp() override;

  /// Checks whether this ReduceFusion primitive can run on the NPU.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Allocates the hiai reduce operator and copies attributes from the primitive.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Connects the converted npu input operators (data, axes) to the reduce op.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// Returns the underlying hiai operator (owned by this object).
  ge::Operator *GetNPUOp() override;

 private:
  // Reduce mode taken from the primitive; presumably set in IsSupport — confirm.
  schema::ReduceMode reduce_mode_ = schema::ReduceMode_ReduceMean;
  // Owned hiai operator; released in the destructor.
  hiai::Operator *reduce_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_REDUCE_NPU_H_
|
|
@ -0,0 +1,62 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/reshape_npu.h"
|
||||
#include <memory>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
namespace mindspore {
|
||||
// Checks NPU support for reshape: exactly two inputs are required and the
// shape tensor (input 1) must be constant, because HiAI needs the target
// shape at graph-build time.
int ReshapeNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                            const std::vector<tensor::MSTensor *> &out_tensors) {
  if (in_tensors.size() != 2) {
    // Bug fix: the original message read "should have w2 input tensors".
    MS_LOG(WARNING) << "Npu reshape op should have 2 input tensors.";
    return RET_NOT_SUPPORT;
  }
  auto shape_tensor = in_tensors.at(1);
  if (shape_tensor->data() == nullptr) {
    MS_LOG(WARNING) << "Npu reshape op only supports const shape.";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
// Allocates the hiai reshape operator. The target shape arrives via the
// second input tensor in SetNPUInputs, so there are no attributes to set.
int ReshapeNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                       const std::vector<tensor::MSTensor *> &out_tensors) {
  reshape_ = new (std::nothrow) hiai::op::Reshape(name_);
  if (reshape_ != nullptr) {
    return RET_OK;
  }
  MS_LOG(ERROR) << name_ << " op is nullptr";
  return RET_ERROR;
}
|
||||
|
||||
// Wires the data tensor (input 0) and the const shape tensor (input 1) into
// the hiai reshape op.
int ReshapeNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                               const std::vector<tensor::MSTensor *> &out_tensors,
                               const std::vector<ge::Operator *> &npu_inputs) {
  reshape_->set_input_x(*npu_inputs[0]).set_input_shape(*npu_inputs[1]);
  return RET_OK;
}
|
||||
|
||||
ge::Operator *ReshapeNPUOp::GetNPUOp() { return this->reshape_; }
|
||||
|
||||
ReshapeNPUOp::~ReshapeNPUOp() {
|
||||
if (reshape_ != nullptr) {
|
||||
delete reshape_;
|
||||
reshape_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_DELEGATE_NPU_OP_RESHAPE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_DELEGATE_NPU_OP_RESHAPE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
namespace mindspore {
|
||||
/// NPU delegate op that maps a Reshape primitive onto hiai::op::Reshape.
/// Requires a constant shape tensor as second input (checked in IsSupport).
class ReshapeNPUOp : public NPUOp {
 public:
  ReshapeNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
               const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~ReshapeNPUOp() override;

  /// Allocates the hiai reshape operator; no attributes to configure.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Supported only with two inputs and a constant shape tensor.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Connects data (input 0) and shape (input 1) operators to the reshape op.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// Returns the underlying hiai operator (owned by this object).
  ge::Operator *GetNPUOp() override;

 private:
  // Owned hiai operator; released in the destructor.
  hiai::op::Reshape *reshape_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_DELEGATE_NPU_OP_RESHAPE_NPU_H_
|
|
@ -0,0 +1,128 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/resize_npu.h"
|
||||
#include <memory>
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
// Checks NPU support for resize: only LINEAR and NEAREST methods, and only
// upscaling (HiAI resize cannot shrink the spatial dimensions).
// Also records the resize method for later use in Init/SetNPUInputs.
int ResizeNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                           const std::vector<tensor::MSTensor *> &out_tensors) {
  auto resize_prim = primitive->value_as_Resize();
  if (resize_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  resize_method_ = resize_prim->method();
  if (resize_method_ != schema::ResizeMethod_LINEAR && resize_method_ != schema::ResizeMethod_NEAREST) {
    MS_LOG(WARNING) << "Unsupported resize method type: " << resize_method_;
    return RET_NOT_SUPPORT;
  }

  // Robustness fix: guard the rank before indexing the H/W dimensions below;
  // the original would read out of bounds for tensors with fewer than 3 dims.
  if (in_tensors[0]->shape().size() < 3 || out_tensors[0]->shape().size() < 3) {
    MS_LOG(WARNING) << "Npu resize requires input and output with at least 3 dimensions.";
    return RET_NOT_SUPPORT;
  }
  if (in_tensors[0]->shape()[1] > out_tensors[0]->shape()[1] ||
      in_tensors[0]->shape()[2] > out_tensors[0]->shape()[2]) {
    MS_LOG(WARNING) << "Npu resize does not support reduction.";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
// Determines the target size, materializes it as a const NPU operator, and
// allocates the method-specific hiai resize operator.
// Returns RET_OK on success, RET_ERROR on null primitive, missing shape data,
// allocation failure, or an unsupported method.
int ResizeNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                      const std::vector<tensor::MSTensor *> &out_tensors) {
  auto resize_prim = primitive->value_as_Resize();
  if (resize_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  if (in_tensors.size() == 1) {
    // Target size comes from the primitive attributes.
    new_height_ = resize_prim->new_height();
    new_width_ = resize_prim->new_width();
  } else if (in_tensors.size() == 2) {
    // Target size comes from a const second input; the output shape already
    // reflects it, so read H/W from there.
    auto out_size = in_tensors.at(1)->data();
    if (out_size == nullptr) {
      MS_LOG(ERROR) << "Out size is not assigned";
      return RET_ERROR;
    }
    new_height_ = out_tensors.at(0)->shape().at(1);
    new_width_ = out_tensors.at(0)->shape().at(2);
  } else {
    MS_LOG(ERROR) << "Get resize op new_height and new_width error.";
    return RET_ERROR;
  }

  // Build a const operator holding the target (height, width) pair.
  ge::TensorDesc sizeTensorDesc(ge::Shape({2}), ge::FORMAT_NCHW, ge::DT_INT32);
  ge::TensorPtr sizeTensor = std::make_shared<hiai::Tensor>(sizeTensorDesc);
  std::vector<int32_t> dataValue = {static_cast<int32_t>(new_height_), static_cast<int32_t>(new_width_)};
  sizeTensor->SetData(reinterpret_cast<uint8_t *>(dataValue.data()), 2 * sizeof(int32_t));
  out_size_ = new (std::nothrow) hiai::op::Const(name_ + "_size");
  // Bug fix: the original dereferenced out_size_ without checking the
  // nothrow allocation result.
  if (out_size_ == nullptr) {
    MS_LOG(ERROR) << "New size const for op " << name_ << " failed.";
    return RET_ERROR;
  }
  out_size_->set_attr_value(sizeTensor);

  if (resize_method_ == schema::ResizeMethod_LINEAR) {
    auto resize_bilinear = new (std::nothrow) hiai::op::ResizeBilinearV2(name_);
    if (resize_bilinear == nullptr) {
      MS_LOG(ERROR) << " resize_ is nullptr.";
      return RET_ERROR;
    }
    resize_bilinear->set_attr_align_corners(resize_prim->coordinate_transform_mode() ==
                                            schema::CoordinateTransformMode_ALIGN_CORNERS);
    resize_bilinear->set_input_size(*out_size_);
    // NOTE(review): half_pixel_centers is fed from preserve_aspect_ratio —
    // these look like different semantics; confirm against the schema.
    resize_bilinear->set_attr_half_pixel_centers(resize_prim->preserve_aspect_ratio());
    resize_ = resize_bilinear;
  } else if (resize_method_ == schema::ResizeMethod_NEAREST) {
    auto resize_nearest = new (std::nothrow) hiai::op::ResizeNearestNeighborV2(name_);
    if (resize_nearest == nullptr) {
      MS_LOG(ERROR) << " resize_ is nullptr.";
      return RET_ERROR;
    }
    resize_nearest->set_attr_align_corners(resize_prim->coordinate_transform_mode() ==
                                           schema::CoordinateTransformMode_ALIGN_CORNERS);
    resize_nearest->set_input_size(*out_size_);
    // Bug fix: the original never assigned resize_ in this branch, leaking
    // resize_nearest and leaving resize_ null for SetNPUInputs/GetNPUOp.
    resize_ = resize_nearest;
  } else {
    MS_LOG(WARNING) << "Unsupported resize method type:" << resize_method_;
    return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
// Wires the data input into the method-specific hiai resize operator.
int ResizeNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                              const std::vector<tensor::MSTensor *> &out_tensors,
                              const std::vector<ge::Operator *> &npu_inputs) {
  // Reject unsupported methods up front, then dispatch on the two valid ones.
  if (resize_method_ != schema::ResizeMethod_LINEAR && resize_method_ != schema::ResizeMethod_NEAREST) {
    MS_LOG(WARNING) << "Unsupported resize method type:" << resize_method_;
    return RET_ERROR;
  }
  if (resize_method_ == schema::ResizeMethod_LINEAR) {
    reinterpret_cast<hiai::op::ResizeBilinearV2 *>(resize_)->set_input_x(*npu_inputs[0]);
  } else {
    reinterpret_cast<hiai::op::ResizeNearestNeighborV2 *>(resize_)->set_input_x(*npu_inputs[0]);
  }
  return RET_OK;
}
|
||||
|
||||
ge::Operator *ResizeNPUOp::GetNPUOp() { return this->resize_; }
|
||||
|
||||
ResizeNPUOp::~ResizeNPUOp() {
|
||||
if (resize_ != nullptr) {
|
||||
delete resize_;
|
||||
resize_ = nullptr;
|
||||
}
|
||||
if (out_size_ != nullptr) {
|
||||
delete out_size_;
|
||||
out_size_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_RESIZE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_RESIZE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
/// NPU delegate op mapping a Resize primitive onto hiai::op::ResizeBilinearV2
/// or hiai::op::ResizeNearestNeighborV2, depending on the resize method.
class ResizeNPUOp : public NPUOp {
 public:
  ResizeNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
              const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~ResizeNPUOp() override;

  /// Supported only for LINEAR/NEAREST methods and upscaling.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Builds the target-size const operator and the hiai resize operator.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Connects the data operator to the resize op.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// Returns the underlying hiai operator (owned by this object).
  ge::Operator *GetNPUOp() override;

 private:
  // Cached resize method, recorded by IsSupport.
  schema::ResizeMethod resize_method_ = schema::ResizeMethod_UNKNOWN;
  // Target spatial size, determined in Init.
  int new_height_ = 0;
  int new_width_ = 0;
  // Owned resize operator (concrete type depends on resize_method_).
  ge::Operator *resize_ = nullptr;
  // Owned const operator holding (new_height_, new_width_).
  hiai::op::Const *out_size_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_RESIZE_NPU_H_
|
|
@ -0,0 +1,154 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/scale_npu.h"
|
||||
#include <memory>
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
// Checks NPU support for scale: the (normalized) axis must address the
// channel dimension (1 for NCHW, 3 for NHWC). Records axis_ as a side effect.
int ScaleNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                          const std::vector<tensor::MSTensor *> &out_tensors) {
  const auto *prim = primitive->value_as_ScaleFusion();
  if (prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  axis_ = prim->axis();
  if (axis_ < 0) {
    // Normalize a negative axis against the input rank.
    axis_ += in_tensors[0]->shape().size();
  }
  if (axis_ == 1 || axis_ == 3) {
    return RET_OK;
  }
  MS_LOG(WARNING) << "Npu scale axis attr only support 1 or channel, now is " << axis_;
  return RET_NOT_SUPPORT;
}
|
||||
|
||||
// Allocates the hiai scale operator, fixes its axis to 1, and appends an
// activation operator when the primitive fuses one.
int ScaleNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                     const std::vector<tensor::MSTensor *> &out_tensors) {
  op_ = new (std::nothrow) hiai::op::Scale(name_);
  if (op_ == nullptr) {
    MS_LOG(ERROR) << name_ << " op is nullptr";
    return RET_ERROR;
  }
  op_->set_attr_axis(1);  // only support axis 1 now

  const auto *prim = primitive->value_as_ScaleFusion();
  if (prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  act_type_ = prim->activation_type();
  if (act_type_ == schema::ActivationType_NO_ACTIVATION) {
    return RET_OK;
  }
  auto ret = SetActivation(op_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
  }
  return ret;
}
|
||||
|
||||
// Wires the data input and materializes the scale (and optional bias) tensors
// as const NPU operators attached to the hiai scale op.
// Both constants are laid out as (1, C, 1, 1) NCHW tensors, where C is the
// length of the corresponding 1-D weight tensor.
int ScaleNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                             const std::vector<tensor::MSTensor *> &out_tensors,
                             const std::vector<ge::Operator *> &npu_inputs) {
  op_->set_input_x(*npu_inputs.at(0));
  // in_tensors: [data, scale, (optional) bias].
  MS_ASSERT(in_tensors.size() > 1);
  auto scale_shape = in_tensors.at(1)->shape();
  std::shared_ptr<ge::Tensor> scale_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
  if (scale_tensor == nullptr) {
    MS_LOG(ERROR) << "new scale_tensor failed.";
    return RET_ERROR;
  }
  // Broadcastable (1, C, 1, 1) layout for the per-channel scale weights.
  ge::TensorDesc scale_tensor_desc(ConverterToNPUShape({1, scale_shape[0], 1, 1}), ge::FORMAT_NCHW,
                                   ConverterToNPUDataType(in_tensors[1]->data_type()));
  scale_tensor->SetTensorDesc(scale_tensor_desc);
  scale_tensor->SetData(reinterpret_cast<const uint8_t *>(in_tensors[1]->data()), in_tensors[1]->Size());
  scale_ = new (std::nothrow) hiai::op::Const(name_ + "_scale");
  if (scale_ == nullptr) {
    MS_LOG(ERROR) << "New scale_ const failed.";
    return RET_ERROR;
  }
  scale_->set_attr_value(scale_tensor);
  op_->set_input_scale(*scale_);

  // Bias is optional: present only when a third input tensor exists.
  if (in_tensors.size() > 2 && in_tensors[2] != nullptr) {
    auto bias_shape = in_tensors[2]->shape();
    std::shared_ptr<ge::Tensor> bias_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor());
    if (bias_tensor == nullptr) {
      MS_LOG(ERROR) << "new bias_tensor failed.";
      return RET_ERROR;
    }
    // Same broadcastable (1, C, 1, 1) layout for the bias weights.
    ge::TensorDesc bias_tensor_desc(ConverterToNPUShape({1, bias_shape[0], 1, 1}), ge::FORMAT_NCHW,
                                    ConverterToNPUDataType(in_tensors[2]->data_type()));
    bias_tensor->SetTensorDesc(bias_tensor_desc);
    bias_tensor->SetData(reinterpret_cast<const uint8_t *>(in_tensors[2]->data()), in_tensors[2]->Size());
    bias_ = new (std::nothrow) hiai::op::Const(name_ + "_beta");
    if (bias_ == nullptr) {
      MS_LOG(ERROR) << "New beta_ const failed.";
      return RET_ERROR;
    }
    bias_->set_attr_value(bias_tensor);
    op_->set_input_bias(*bias_);
  }
  return RET_OK;
}
|
||||
|
||||
// Returns the graph output operator: the fused activation when one exists,
// otherwise the scale op itself.
ge::Operator *ScaleNPUOp::GetNPUOp() {
  return act_type_ == schema::ActivationType_NO_ACTIVATION ? static_cast<ge::Operator *>(op_)
                                                           : static_cast<ge::Operator *>(act_);
}
|
||||
|
||||
// Creates the fused activation operator and chains it after `input`.
// Only ReLU and ReLU6 are supported.
int ScaleNPUOp::SetActivation(const ge::Operator *input) {
  act_ = new (std::nothrow) hiai::op::Activation(name_ + "_act");
  if (act_ == nullptr) {
    MS_LOG(ERROR) << "New activation npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }
  act_->set_input_x(*input);
  switch (act_type_) {
    case schema::ActivationType_RELU:
      act_->set_attr_mode(1);  // HiAI activation mode 1 — presumably ReLU; confirm against DDK docs
      break;
    case schema::ActivationType_RELU6:
      act_->set_attr_mode(14);  // HiAI activation mode 14 — presumably ReLU6; confirm against DDK docs
      break;
    default:
      MS_LOG(ERROR) << "Unsupported activation type for scale.";
      return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
ScaleNPUOp::~ScaleNPUOp() {
  // delete on a null pointer is a no-op, so no explicit checks are required.
  delete op_;
  op_ = nullptr;
  delete scale_;
  scale_ = nullptr;
  delete bias_;
  bias_ = nullptr;
  delete act_;
  act_ = nullptr;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,59 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SCALE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SCALE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "include/graph/op/nn_defs.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
/// NPU delegate op mapping a ScaleFusion primitive onto hiai::op::Scale,
/// with an optional fused hiai::op::Activation (ReLU / ReLU6).
class ScaleNPUOp : public NPUOp {
 public:
  ScaleNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
             const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~ScaleNPUOp() override;

  /// Supported only when the normalized axis is the channel dimension.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Allocates the scale op (axis fixed to 1) and the fused activation if any.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Wires data input and const scale/bias weight operators.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// Returns the activation op when fused, otherwise the scale op.
  ge::Operator *GetNPUOp() override;

  /// Normalized axis recorded by IsSupport (used by delegate passes).
  int GetAxis() { return axis_; }

 private:
  // Builds the fused activation operator chained after `input`.
  int SetActivation(const ge::Operator *input);

  int axis_ = 0;
  schema::ActivationType act_type_ = schema::ActivationType_NO_ACTIVATION;
  // All four operators below are owned and released in the destructor.
  hiai::op::Scale *op_ = nullptr;
  hiai::op::Const *scale_ = nullptr;
  hiai::op::Const *bias_ = nullptr;
  hiai::op::Activation *act_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SCALE_NPU_H_
|
|
@ -0,0 +1,48 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/slice_npu.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
// Allocates the hiai slice operator. Offsets and sizes arrive as graph inputs
// in SetNPUInputs, so there are no attributes to configure here.
int SliceNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                     const std::vector<tensor::MSTensor *> &out_tensors) {
  slice_ = new (std::nothrow) hiai::op::Slice(name_);
  if (slice_ != nullptr) {
    return RET_OK;
  }
  MS_LOG(ERROR) << name_ << " op is nullptr";
  return RET_ERROR;
}
|
||||
|
||||
// Wires the three slice inputs: data (0), begin offsets (1), sizes (2).
int SliceNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                             const std::vector<tensor::MSTensor *> &out_tensors,
                             const std::vector<ge::Operator *> &npu_inputs) {
  auto &op = *slice_;
  op.set_input_x(*npu_inputs[0]);
  op.set_input_offsets(*npu_inputs[1]);
  op.set_input_size(*npu_inputs[2]);
  return RET_OK;
}
|
||||
|
||||
ge::Operator *SliceNPUOp::GetNPUOp() { return this->slice_; }
|
||||
|
||||
SliceNPUOp::~SliceNPUOp() {
|
||||
if (slice_ != nullptr) {
|
||||
delete slice_;
|
||||
slice_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,51 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SLICE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SLICE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
/// NPU delegate op mapping a Slice primitive onto hiai::op::Slice.
/// Expects three inputs: data, begin offsets, and sizes.
class SliceNPUOp : public NPUOp {
 public:
  SliceNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
             const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~SliceNPUOp() override;

  /// Always supported; no primitive-specific restrictions are checked here.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override {
    return RET_OK;
  }

  /// Allocates the hiai slice operator; no attributes to configure.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Connects data (0), offsets (1), and size (2) operators to the slice op.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// Returns the underlying hiai operator (owned by this object).
  ge::Operator *GetNPUOp() override;

 private:
  // Owned hiai operator; released in the destructor.
  hiai::op::Slice *slice_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SLICE_NPU_H_
|
|
@ -0,0 +1,55 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/softmax_npu.h"
|
||||
namespace mindspore {
|
||||
// Allocates the hiai softmax operator and configures its axis from the
// primitive, normalizing negative axes against the input rank.
// Returns RET_OK on success, RET_ERROR on allocation failure or a malformed
// primitive (null value or missing axis vector).
int SoftmaxNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                       const std::vector<tensor::MSTensor *> &out_tensors) {
  softmax_ = new (std::nothrow) hiai::op::Softmax(name_);
  if (softmax_ == nullptr) {
    MS_LOG(ERROR) << name_ << " op is nullptr";
    return RET_ERROR;
  }
  auto softmax_prim = primitive->value_as_Softmax();
  if (softmax_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  // Robustness fix: the flatbuffer axis vector is optional; the original
  // dereferenced it unconditionally.
  auto axis_vector = softmax_prim->axis();
  if (axis_vector == nullptr || axis_vector->size() == 0) {
    MS_LOG(ERROR) << "Get null axis for op " << name_;
    return RET_ERROR;
  }
  auto axis = static_cast<int>(*axis_vector->begin());
  if (axis < 0) {
    // Generalization: normalize any negative axis (the original handled
    // only -1); behavior for axis == -1 is unchanged.
    softmax_->set_attr_axis(static_cast<int>(in_tensors[0]->shape().size()) + axis);
  } else {
    softmax_->set_attr_axis(axis);
  }
  return RET_OK;
}
|
||||
|
||||
// Wires the single data input into the hiai softmax op.
int SoftmaxNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                               const std::vector<tensor::MSTensor *> &out_tensors,
                               const std::vector<ge::Operator *> &npu_inputs) {
  ge::Operator &input = *npu_inputs[0];
  softmax_->set_input_x(input);
  return RET_OK;
}
|
||||
|
||||
ge::Operator *SoftmaxNPUOp::GetNPUOp() { return this->softmax_; }
|
||||
|
||||
SoftmaxNPUOp::~SoftmaxNPUOp() {
|
||||
if (softmax_ != nullptr) {
|
||||
delete softmax_;
|
||||
softmax_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,51 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SOFTMAX_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SOFTMAX_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/nn_defs.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
/// NPU delegate op mapping a Softmax primitive onto hiai::op::Softmax.
class SoftmaxNPUOp : public NPUOp {
 public:
  SoftmaxNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
               const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~SoftmaxNPUOp() override;

  /// Always supported; no primitive-specific restrictions are checked here.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override {
    return RET_OK;
  }

  /// Allocates the hiai softmax operator and sets the (normalized) axis.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// Connects the data operator to the softmax op.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// Returns the underlying hiai operator (owned by this object).
  ge::Operator *GetNPUOp() override;

 private:
  // Owned hiai operator; released in the destructor.
  hiai::op::Softmax *softmax_ = nullptr;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SOFTMAX_NPU_H_
|
|
@ -0,0 +1,92 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/split_npu.h"
|
||||
#include <memory>
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
int SplitNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                     const std::vector<tensor::MSTensor *> &out_tensors) {
  // Build the underlying hiai SplitV operator and feed it the per-output
  // slice sizes as a const tensor. Returns RET_ERROR on any allocation or
  // primitive-content failure.
  split_ = new (std::nothrow) hiai::op::SplitV(name_);
  if (split_ == nullptr) {
    MS_LOG(ERROR) << "New split npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }
  auto split_prim = primitive->value_as_Split();
  if (split_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }

  auto sizes_split = split_prim->size_splits();
  if (sizes_split == nullptr) {
    MS_LOG(ERROR) << "Get null size_splits for op " << name_;
    return RET_ERROR;
  }
  std::vector<int> sizes_split_vec(sizes_split->begin(), sizes_split->end());
  int size = split_prim->output_num();
  // Guard the SetData copy below: it reads `size` ints from sizes_split_vec,
  // so a mismatch between output_num and size_splits would read out of bounds.
  if (size < 0 || static_cast<size_t>(size) != sizes_split_vec.size()) {
    MS_LOG(ERROR) << "output_num does not match size_splits for op " << name_;
    return RET_ERROR;
  }
  ge::TensorDesc size_splits_tensor_desc(ge::Shape({size}), ge::FORMAT_NCHW, ge::DT_INT32);
  ge::TensorPtr size_splits_tensor = std::make_shared<hiai::Tensor>(size_splits_tensor_desc);
  size_splits_tensor->SetData(reinterpret_cast<uint8_t *>(sizes_split_vec.data()), size * sizeof(int));
  // Use the non-throwing new so the null check is meaningful (matches the
  // allocation style used for the SplitV operator above).
  size_splits_ = new (std::nothrow) hiai::op::Const(name_ + "_size");
  if (size_splits_ == nullptr) {
    MS_LOG(ERROR) << "New size_splits const for op " << name_ << " failed.";
    return RET_ERROR;
  }
  size_splits_->set_attr_value(size_splits_tensor);
  split_->set_input_size_splits(*size_splits_);

  axis_ = static_cast<int>(split_prim->axis());
  split_->set_attr_num_split(size);
  split_->create_dynamic_output_y(size);
  return RET_OK;
}
|
||||
|
||||
int SplitNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                             const std::vector<tensor::MSTensor *> &out_tensors,
                             const std::vector<ge::Operator *> &npu_inputs) {
  // SplitV takes the split dimension as a one-element const int32 tensor.
  ge::TensorDesc split_dim_tensor_desc(ge::Shape({1}), ge::FORMAT_NCHW, ge::DT_INT32);
  ge::TensorPtr split_dim_tensor = std::make_shared<hiai::Tensor>(split_dim_tensor_desc);
  std::vector<int32_t> split_dim_data_value = {axis_};
  split_dim_tensor->SetData(reinterpret_cast<uint8_t *>(split_dim_data_value.data()), 1 * sizeof(int));
  // Non-throwing new plus a null check, consistent with the operator
  // allocation in Init; plain new would throw instead of returning nullptr.
  split_dim_ = new (std::nothrow) hiai::op::Const(name_ + "_dim");
  if (split_dim_ == nullptr) {
    MS_LOG(ERROR) << "New split_dim const for op " << name_ << " failed.";
    return RET_ERROR;
  }
  split_dim_->set_attr_value(split_dim_tensor);
  split_->set_input_split_dim(*split_dim_);

  split_->set_input_x(*npu_inputs[0]);
  return RET_OK;
}
|
||||
|
||||
ge::Operator *SplitNPUOp::GetNPUOp() { return this->split_; }
|
||||
|
||||
int SplitNPUOp::HandleAxis() {
|
||||
axis_ = TransFormAxis(axis_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
SplitNPUOp::~SplitNPUOp() {
|
||||
if (split_ != nullptr) {
|
||||
delete split_;
|
||||
split_ = nullptr;
|
||||
}
|
||||
if (size_splits_ != nullptr) {
|
||||
delete size_splits_;
|
||||
size_splits_ = nullptr;
|
||||
}
|
||||
if (split_dim_ != nullptr) {
|
||||
delete split_dim_;
|
||||
split_dim_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -17,30 +17,40 @@
|
|||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_SPLIT_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_SPLIT_NPU_H_
|
||||
#include <vector>
|
||||
#include "nnacl/split_parameter.h"
|
||||
#include "src/runtime/kernel/npu/npu_kernel.h"
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
namespace mindspore::kernel {
|
||||
class SplitNPUKernel : public NPUKernel {
|
||||
public:
|
||||
SplitNPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
|
||||
: NPUKernel(parameter, inputs, outputs, ctx) {
|
||||
param_ = reinterpret_cast<SplitParameter *>(parameter);
|
||||
}
|
||||
~SplitNPUKernel() override;
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
int IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
|
||||
OpParameter *opParameter) override;
|
||||
int SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
|
||||
namespace mindspore {
|
||||
class SplitNPUOp : public NPUOp {
|
||||
public:
|
||||
SplitNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~SplitNPUOp();
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override {
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
int HandleAxis();
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
hiai::op::SplitV *op_ = nullptr;
|
||||
SplitParameter *param_;
|
||||
hiai::op::SplitV *split_ = nullptr;
|
||||
hiai::op::Const *size_splits_ = nullptr;
|
||||
hiai::op::Const *split_dim_ = nullptr;
|
||||
int axis_ = 0;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_SPLIT_NPU_H_
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/squeeze_npu.h"
|
||||
namespace mindspore {
|
||||
int SqueezeNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                       const std::vector<tensor::MSTensor *> &out_tensors) {
  // Create the underlying hiai Squeeze operator and copy the axis list
  // (empty when the primitive carries no axes) into its attribute.
  squeeze_ = new (std::nothrow) hiai::op::Squeeze(name_);
  if (squeeze_ == nullptr) {
    MS_LOG(ERROR) << "New squeeze npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }
  auto squeeze_prim = primitive->value_as_Squeeze();
  // Null-check the primitive value before dereferencing it, as the other
  // delegate ops (Split, StridedSlice, Unsqueeze) do.
  if (squeeze_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  auto axis = squeeze_prim->axis();
  std::vector<int64_t> axes;
  if (axis != nullptr) {
    axes.assign(axis->begin(), axis->end());
  }
  squeeze_->set_attr_axis(axes);
  return RET_OK;
}
|
||||
|
||||
int SqueezeNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
squeeze_->set_input_x(*npu_inputs[0]);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *SqueezeNPUOp::GetNPUOp() { return this->squeeze_; }
|
||||
|
||||
SqueezeNPUOp::~SqueezeNPUOp() {
|
||||
if (squeeze_ != nullptr) {
|
||||
delete squeeze_;
|
||||
squeeze_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SQUEEZE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SQUEEZE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
namespace mindspore {
|
||||
class SqueezeNPUOp : public NPUOp {
 public:
  /// \brief Wrap a schema Squeeze primitive as an NPU delegate op.
  SqueezeNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
               const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~SqueezeNPUOp() override;

  /// \brief Squeeze is always accepted by this delegate.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override {
    return RET_OK;
  }

  /// \brief Create the underlying hiai::op::Squeeze and set its axis attribute.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Connect the ge graph inputs to the squeeze operator.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// \brief Return the underlying ge operator.
  ge::Operator *GetNPUOp() override;

 private:
  hiai::op::Squeeze *squeeze_ = nullptr;  // owned; released in the destructor
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_SQUEEZE_NPU_H_
|
|
@ -0,0 +1,99 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/strided_slice_npu.h"
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
#include "src/delegate/npu/pass/npu_pass_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
int StridedSliceNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                                 const std::vector<tensor::MSTensor *> &out_tensors) {
  // Only onnx StridedSlice has 5 in_tensors, of which the 4th input is axes and the 5th input is strides.
  if (in_tensors.size() == 5) {
    auto axes_tensor = in_tensors[3];
    // Guard the memcpy below: the axes tensor must have a known shape and
    // constant data, otherwise we cannot verify the axis layout here.
    if (axes_tensor->shape().empty() || axes_tensor->data() == nullptr) {
      MS_LOG(WARNING) << "Invalid axes input of strided slice.";
      return RET_NOT_SUPPORT;
    }
    size_t size = static_cast<size_t>(axes_tensor->shape()[0]);
    std::vector<int> axes(size);
    memcpy(axes.data(), axes_tensor->data(), sizeof(int) * size);
    // Use an unsigned index to avoid the signed/unsigned comparison with
    // axes.size(); the supported case is axes == [0, 1, 2, ...].
    for (size_t i = 0; i < axes.size(); ++i) {
      if (static_cast<int>(i) != axes[i]) {
        MS_LOG(WARNING) << "Does not support setting axis, so the axis must be continuous.";
        return RET_NOT_SUPPORT;
      }
    }
  }
  return RET_OK;
}
|
||||
|
||||
int StridedSliceNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                            const std::vector<tensor::MSTensor *> &out_tensors) {
  // Create the underlying hiai StridedSlice operator and cache the five mask
  // attributes from the primitive; they are applied later in SetNPUInputs
  // (after HandleAxis may have remapped them for the NCHW layout).
  strided_slice_ = new (std::nothrow) hiai::op::StridedSlice(name_);
  if (strided_slice_ == nullptr) {
    MS_LOG(ERROR) << "New stridedSlice npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }
  auto strided_slice_prim = primitive->value_as_StridedSlice();
  if (strided_slice_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  begins_mask_ = strided_slice_prim->begin_mask();
  ends_mask_ = strided_slice_prim->end_mask();
  ellipsis_mask_ = strided_slice_prim->ellipsis_mask();
  new_axis_mask_ = strided_slice_prim->new_axis_mask();
  shrink_axis_mask_ = strided_slice_prim->shrink_axis_mask();
  return RET_OK;
}
|
||||
|
||||
int StridedSliceNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
strided_slice_->set_attr_begin_mask(begins_mask_);
|
||||
strided_slice_->set_attr_ellipsis_mask(ellipsis_mask_);
|
||||
strided_slice_->set_attr_end_mask(ends_mask_);
|
||||
strided_slice_->set_attr_shrink_axis_mask(shrink_axis_mask_);
|
||||
strided_slice_->set_attr_new_axis_mask(new_axis_mask_);
|
||||
// StridedSliceV2 supports setting axes, but it will cause an endless loop.
|
||||
strided_slice_->set_input_x(*npu_inputs[0]);
|
||||
strided_slice_->set_input_begin(*npu_inputs[1]);
|
||||
strided_slice_->set_input_end(*npu_inputs[2]);
|
||||
|
||||
// The strides position of onnx is the 5th, and the others are the 4th.
|
||||
if (npu_inputs.size() == 5) {
|
||||
strided_slice_->set_input_strides(*npu_inputs[4]);
|
||||
} else {
|
||||
strided_slice_->set_input_strides(*npu_inputs[3]);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *StridedSliceNPUOp::GetNPUOp() { return this->strided_slice_; }
|
||||
|
||||
int StridedSliceNPUOp::HandleAxis() {
|
||||
begins_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(begins_mask_);
|
||||
ends_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(ends_mask_);
|
||||
ellipsis_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(ellipsis_mask_);
|
||||
shrink_axis_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(shrink_axis_mask_);
|
||||
new_axis_mask_ = NPUPassUtils::MaskDataNHWC2NCHW(new_axis_mask_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
StridedSliceNPUOp::~StridedSliceNPUOp() {
|
||||
if (strided_slice_ != nullptr) {
|
||||
delete strided_slice_;
|
||||
strided_slice_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,56 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_STRIDEDSLICE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_STRIDEDSLICE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore {
|
||||
class StridedSliceNPUOp : public NPUOp {
 public:
  /// \brief Wrap a schema StridedSlice primitive as an NPU delegate op.
  StridedSliceNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                    const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~StridedSliceNPUOp() override;

  /// \brief Reject the 5-input (onnx) form unless its axes input is the identity [0, 1, 2, ...].
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Create the underlying hiai::op::StridedSlice and cache the mask attributes.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Apply the cached masks and connect x/begin/end/strides graph inputs.
  int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                   const std::vector<tensor::MSTensor *> &out_tensors,
                   const std::vector<ge::Operator *> &npu_inputs) override;

  /// \brief Return the underlying ge operator.
  ge::Operator *GetNPUOp() override;

  /// \brief Remap the cached masks from NHWC to NCHW bit positions.
  int HandleAxis();

 private:
  hiai::op::StridedSlice *strided_slice_ = nullptr;  // owned; released in the destructor
  // Mask attributes cached from the primitive in Init and applied in SetNPUInputs.
  int begins_mask_ = 0;
  int ends_mask_ = 0;
  int ellipsis_mask_ = 0;
  int new_axis_mask_ = 0;
  int shrink_axis_mask_ = 0;
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_STRIDEDSLICE_NPU_H_
|
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/tile_npu.h"
|
||||
#include <memory>
|
||||
#include "src/delegate/npu/npu_converter_utils.h"
|
||||
|
||||
namespace mindspore {
|
||||
int TileNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                         const std::vector<tensor::MSTensor *> &out_tensors) {
  // Tile needs exactly two inputs: the data tensor and the repeat multiples.
  if (in_tensors.size() != 2) {
    // NOTE(review): sibling ops return RET_NOT_SUPPORT for unsupported shapes
    // so the kernel falls back to CPU; confirm RET_ERROR is intended here.
    return RET_ERROR;
  }
  auto multiple_tensor = in_tensors[1];
  // The multiples must be constant (data available at conversion time) and
  // describe at most 4 dimensions; otherwise fall back to CPU.
  if (multiple_tensor->ElementsNum() > 4 || multiple_tensor->data() == nullptr) {
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
// Allocate the underlying hiai Tile operator; the multiples const input is
// attached later in SetNPUInputs, once the graph inputs are known.
int TileNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                    const std::vector<tensor::MSTensor *> &out_tensors) {
  tile_ = new (std::nothrow) hiai::op::Tile(name_);
  if (tile_ != nullptr) {
    return RET_OK;
  }
  MS_LOG(ERROR) << "New tile npu operator for op " << name_ << " failed.";
  return RET_ERROR;
}
|
||||
|
||||
int TileNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
                            const std::vector<tensor::MSTensor *> &out_tensors,
                            const std::vector<ge::Operator *> &npu_inputs) {
  // Wire the data input, then build a const operator holding the repeat
  // multiples read from the second input tensor.
  tile_->set_input_x(*npu_inputs[0]);

  std::vector<int> multiples;
  auto multiple_data = reinterpret_cast<int *>(in_tensors[1]->data());
  if (multiple_data == nullptr) {
    return RET_ERROR;
  }
  for (int i = 0; i < in_tensors[1]->ElementsNum(); ++i) {
    multiples.push_back(multiple_data[i]);
  }
  ge::TensorDesc multiple_tensor_desc(ge::Shape({static_cast<int64_t>(multiples.size())}), ge::FORMAT_NCHW,
                                      ge::DT_INT32);
  ge::TensorPtr multiple_tensor = std::make_shared<hiai::Tensor>(multiple_tensor_desc);
  multiple_tensor->SetData(reinterpret_cast<uint8_t *>(multiples.data()), multiples.size() * sizeof(int));
  // Use the non-throwing new: plain new throws std::bad_alloc on failure,
  // which would make the null check below dead code.
  multiple_ = new (std::nothrow) hiai::op::Const(name_ + "multiples");
  if (multiple_ == nullptr) {
    MS_LOG(ERROR) << "New multiple const for tile npu operator failed.";
    return RET_ERROR;
  }
  multiple_->set_attr_value(multiple_tensor);
  tile_->set_input_multiples(*multiple_);
  return RET_OK;
}
|
||||
|
||||
ge::Operator *TileNPUOp::GetNPUOp() { return this->tile_; }
|
||||
|
||||
TileNPUOp::~TileNPUOp() {
|
||||
if (tile_ != nullptr) {
|
||||
delete tile_;
|
||||
tile_ = nullptr;
|
||||
}
|
||||
if (multiple_ != nullptr) {
|
||||
delete multiple_;
|
||||
multiple_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -17,31 +17,34 @@
|
|||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_TILE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_TILE_NPU_H_
|
||||
#include <vector>
|
||||
#include "src/runtime/kernel/npu/npu_kernel.h"
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "nnacl/base/tile_base.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
|
||||
namespace mindspore::kernel {
|
||||
class TileNPUKernel : public NPUKernel {
|
||||
namespace mindspore {
|
||||
class TileNPUOp : public NPUOp {
|
||||
public:
|
||||
TileNPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
|
||||
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
|
||||
: NPUKernel(parameter, inputs, outputs, ctx) {
|
||||
param_ = reinterpret_cast<TileParameter *>(parameter);
|
||||
}
|
||||
~TileNPUKernel() override;
|
||||
TileNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
int IsSupport(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
|
||||
OpParameter *opParameter) override;
|
||||
int SetNPUInputs(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
|
||||
~TileNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
hiai::op::Tile *op_ = nullptr;
|
||||
hiai::op::Tile *tile_ = nullptr;
|
||||
hiai::op::Const *multiple_ = nullptr;
|
||||
TileParameter *param_ = nullptr;
|
||||
};
|
||||
} // namespace mindspore::kernel
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_NPU_TILE_NPU_H_
|
|
@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/transpose_npu.h"
|
||||
namespace mindspore {
|
||||
int TransposeNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                              const std::vector<tensor::MSTensor *> &out_tensors) {
  // The permutation must be a constant second input; only the two layout
  // conversions NHWC->NCHW and NCHW->NHWC are representable on NPU.
  if (in_tensors.size() < 2) {
    MS_LOG(ERROR) << "Npu transpose must get fixed values of transpose axis.";
    return RET_ERROR;
  }
  auto perm_num = in_tensors.at(1)->ElementsNum();
  auto perm_data = reinterpret_cast<int *>(in_tensors.at(1)->data());
  if (perm_data == nullptr) {
    MS_LOG(ERROR) << "Npu transpose must get fixed values of transpose axis.";
    return RET_ERROR;
  }
  // Rebuild perm_ from scratch: appending to a stale perm_ on a repeated
  // support check would corrupt the permutation and the comparison below.
  perm_.clear();
  for (int i = 0; i < perm_num; i++) {
    perm_.push_back(perm_data[i]);
  }
  std::vector<int> nh2nc_perm = {0, 3, 1, 2};
  std::vector<int> nc2nh_perm = {0, 2, 3, 1};
  if (perm_ != nh2nc_perm && perm_ != nc2nh_perm) {
    MS_LOG(WARNING) << "NPU transpose op only supports nhwc->nchw or nchw->nhwc.";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_TRANSPOSE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_TRANSPOSE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
namespace mindspore {
|
||||
class TransposeNPUOp : public NPUOp {
 public:
  /// \brief Construct a transpose op with an explicit permutation (used by layout
  /// passes that insert transposes without a schema primitive).
  TransposeNPUOp(const std::vector<tensor::MSTensor *> &in_tensors, const std::vector<tensor::MSTensor *> &out_tensors,
                 std::vector<int> perm, std::string name)
      : NPUOp(nullptr, in_tensors, out_tensors, name) {
    perm_ = perm;
    type_ = schema::PrimitiveType_Transpose;
  }

  /// \brief Construct from a schema Transpose primitive; perm_ is filled in IsSupport.
  TransposeNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                 const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
      : NPUOp(primitive, in_tensors, out_tensors, name) {}

  ~TransposeNPUOp() override = default;

  /// \brief Accept only constant permutations equal to {0,3,1,2} or {0,2,3,1}.
  int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                const std::vector<tensor::MSTensor *> &out_tensors) override;

  /// \brief Nothing to build here; transposes are handled by the layout passes.
  int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
           const std::vector<tensor::MSTensor *> &out_tensors) override {
    return RET_OK;
  }

  /// \brief Return the cached permutation.
  std::vector<int> GetPerm() { return perm_; }

 protected:
  std::vector<int> perm_;  // the transpose permutation, e.g. {0, 3, 1, 2}
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_TRANSPOSE_NPU_H_
|
|
@ -0,0 +1,73 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/op/unsqueeze_npu.h"
|
||||
#include <memory>
|
||||
|
||||
namespace mindspore {
|
||||
// ExpandDims on NPU cannot handle results of rank > 4, so the input rank
// is limited to 3 here.
int UnsqueezeNPUOp::IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                              const std::vector<tensor::MSTensor *> &out_tensors) {
  const auto input_rank = in_tensors[0]->shape().size();
  if (input_rank > 3) {
    MS_LOG(WARNING) << "The dimension of output not support bigger than 4.";
    return RET_NOT_SUPPORT;
  }
  return RET_OK;
}
|
||||
|
||||
int UnsqueezeNPUOp::Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
                         const std::vector<tensor::MSTensor *> &out_tensors) {
  // Create the underlying hiai ExpandDims operator and feed it the axis list
  // from the primitive as a const int32 tensor.
  unsqueeze_ = new (std::nothrow) hiai::op::ExpandDims(name_);
  if (unsqueeze_ == nullptr) {
    MS_LOG(ERROR) << "New unsqueeze npu operator for op " << name_ << " failed.";
    return RET_ERROR;
  }

  auto unsqueeze_prim = primitive->value_as_Unsqueeze();
  if (unsqueeze_prim == nullptr) {
    MS_LOG(ERROR) << "Get null primitive value for op ." << name_;
    return RET_ERROR;
  }
  // Guard the begin()/end() dereference: a flatbuffers vector field may be null.
  if (unsqueeze_prim->axis() == nullptr) {
    MS_LOG(ERROR) << "Get null axis for op " << name_;
    return RET_ERROR;
  }
  axis_ = std::vector<int>(unsqueeze_prim->axis()->begin(), unsqueeze_prim->axis()->end());
  int size = static_cast<int>(axis_.size());
  ge::TensorDesc desc(ge::Shape({size}), ge::FORMAT_NCHW, ge::DT_INT32);
  ge::TensorPtr tensor = std::make_shared<hiai::Tensor>(desc);
  tensor->SetData(reinterpret_cast<uint8_t *>(axis_.data()), size * sizeof(int));
  // Non-throwing new plus a null check, matching the allocation style above.
  axis_const_ = new (std::nothrow) hiai::op::Const(name_ + "_axis");
  if (axis_const_ == nullptr) {
    MS_LOG(ERROR) << "New axis const for op " << name_ << " failed.";
    return RET_ERROR;
  }
  axis_const_->set_attr_value(tensor);
  unsqueeze_->set_input_axis(*axis_const_);
  return RET_OK;
}
|
||||
|
||||
int UnsqueezeNPUOp::SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) {
|
||||
unsqueeze_->set_input_x(*npu_inputs[0]);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
ge::Operator *UnsqueezeNPUOp::GetNPUOp() { return this->unsqueeze_; }
|
||||
|
||||
UnsqueezeNPUOp::~UnsqueezeNPUOp() {
|
||||
if (unsqueeze_ != nullptr) {
|
||||
delete unsqueeze_;
|
||||
unsqueeze_ = nullptr;
|
||||
}
|
||||
if (axis_const_ != nullptr) {
|
||||
delete axis_const_;
|
||||
axis_const_ = nullptr;
|
||||
}
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_UNSQUEEZE_NPU_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_UNSQUEEZE_NPU_H_
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "include/graph/op/all_ops.h"
|
||||
#include "src/delegate/npu/op/npu_op.h"
|
||||
namespace mindspore {
|
||||
class UnsqueezeNPUOp : public NPUOp {
|
||||
public:
|
||||
UnsqueezeNPUOp(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::string name)
|
||||
: NPUOp(primitive, in_tensors, out_tensors, name) {}
|
||||
|
||||
~UnsqueezeNPUOp() override;
|
||||
|
||||
int IsSupport(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int Init(const schema::Primitive *primitive, const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors) override;
|
||||
|
||||
int SetNPUInputs(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors,
|
||||
const std::vector<ge::Operator *> &npu_inputs) override;
|
||||
|
||||
ge::Operator *GetNPUOp() override;
|
||||
|
||||
private:
|
||||
hiai::op::ExpandDims *unsqueeze_ = nullptr;
|
||||
hiai::op::Const *axis_const_ = nullptr;
|
||||
vector<int> axis_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_OP_UNSQUEEZE_NPU_H_
|
|
@ -14,13 +14,15 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_BASE_PASS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_BASE_PASS_H_
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_BASE_PASS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_BASE_PASS_H_
|
||||
#include <string>
|
||||
namespace mindspore::lite {
|
||||
#include "src/delegate/npu/npu_graph.h"
|
||||
|
||||
namespace mindspore {
|
||||
class NPUBasePass {
|
||||
public:
|
||||
virtual int Run() = 0;
|
||||
virtual int Run(NPUGraph *subgraph) = 0;
|
||||
|
||||
virtual ~NPUBasePass() = default;
|
||||
|
||||
|
@ -29,6 +31,6 @@ class NPUBasePass {
|
|||
protected:
|
||||
std::string name_;
|
||||
};
|
||||
} // namespace mindspore::lite
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_BASE_PASS_H_
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_BASE_PASS_H_
|
|
@ -0,0 +1,460 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/pass/npu_fusion_pass.h"
|
||||
#include <vector>
|
||||
#include "src/delegate/npu/pass/npu_pass_utils.h"
|
||||
#include "src/delegate/npu/op/concat_npu.h"
|
||||
#include "src/delegate/npu/op/split_npu.h"
|
||||
#include "src/delegate/npu/op/pad_npu.h"
|
||||
#include "src/delegate/npu/op/strided_slice_npu.h"
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace {
|
||||
constexpr int kNumDims = 4;
|
||||
constexpr int kNumInputSize = 4;
|
||||
} // namespace
|
||||
|
||||
namespace mindspore {
|
||||
// A candidate op is fusible when every producer is an NCHW->NHWC transpose
// that feeds only this op, and every consumer is an NHWC->NCHW transpose.
bool CheckFusion(NPUOp *cur_op) {
  const auto &producers = cur_op->in_ops();
  const auto &consumers = cur_op->out_ops();
  if (producers.empty() || consumers.empty()) {
    return false;
  }
  for (auto *producer : producers) {
    if (!NPUPassUtils::IsNchw2Nhwc(producer) || producer->out_ops().size() != 1) {
      return false;
    }
  }
  return std::all_of(consumers.begin(), consumers.end(),
                     [](NPUOp *consumer) { return NPUPassUtils::IsNhwc2Nchw(consumer); });
}
|
||||
|
||||
// True when cur_op is a transpose whose every consumer is the inverse
// transpose (nh2nc followed only by nc2nh, or vice versa), i.e. the pair
// cancels out and can be removed by FormatFusion.
bool CheckFormatFusion(NPUOp *cur_op) {
  const auto &consumers = cur_op->out_ops();
  if (consumers.empty()) {
    return false;
  }
  // Note: original shadowed `cur_op` inside the lambdas; renamed for clarity.
  if (NPUPassUtils::IsNhwc2Nchw(cur_op)) {
    return std::all_of(consumers.begin(), consumers.end(),
                       [](NPUOp *consumer) { return NPUPassUtils::IsNchw2Nhwc(consumer); });
  }
  if (NPUPassUtils::IsNchw2Nhwc(cur_op)) {
    return std::all_of(consumers.begin(), consumers.end(),
                       [](NPUOp *consumer) { return NPUPassUtils::IsNhwc2Nchw(consumer); });
  }
  return false;
}
|
||||
|
||||
// Detach cur_op from the pass's op list (if present) and free it.
// The caller must have already rewired any in/out edges referencing cur_op.
void NPUFusionPass::RemoveAndFreeOp(NPUOp *cur_op) {
  auto pos = std::find(all_ops_->begin(), all_ops_->end(), cur_op);
  if (pos != all_ops_->end()) {
    all_ops_->erase(pos);
  }
  delete cur_op;
}
|
||||
|
||||
// Splice out cur_op's producer transposes: for each in_op (a nchw2nhwc, per
// CheckFusion), rewire its own producer to point straight at cur_op, point
// cur_op back at that producer, then free the transpose.
// NOTE(review): the range-for iterates the result of cur_op->in_ops() while
// set_in_ops() is called inside the loop — assumes in_ops() returns by value
// so the iterated sequence is a snapshot; verify against NPUOp's definition.
int NPUFusionPass::UpdatePreOps(NPUOp *cur_op) {
  for (auto in_op : cur_op->in_ops()) {
    // graph in op: a transpose with no producer feeds from a graph input;
    // it is left in place.
    if (in_op->in_ops().empty()) {
      continue;
    }
    auto pre_op = in_op->in_ops()[0];

    // Replace in_op with cur_op in the producer's consumer list.
    auto pre_out_ops = pre_op->out_ops();
    for (size_t i = 0; i < pre_out_ops.size(); i++) {
      if (pre_out_ops[i] == in_op) {
        pre_out_ops[i] = cur_op;
        break;
      }
    }
    pre_op->set_out_ops(pre_out_ops);

    // Replace in_op with pre_op in cur_op's producer list.
    auto cur_in_ops = cur_op->in_ops();
    for (size_t i = 0; i < cur_in_ops.size(); i++) {
      if (cur_in_ops[i] == in_op) {
        cur_in_ops[i] = pre_op;
        break;
      }
    }
    cur_op->set_in_ops(cur_in_ops);
    RemoveAndFreeOp(in_op);
  }
  return RET_OK;
}
|
||||
|
||||
// Splice out cur_op's consumer transposes: for each out_op (a nhwc2nchw, per
// CheckFusion), rewire its consumer to take input from cur_op directly, then
// free the transpose. Transposes that are graph outputs (no consumers) are
// simply dropped from cur_op's consumer list before being freed.
int NPUFusionPass::UpdatePostOps(NPUOp *cur_op) {
  // Work on a copy of the consumer list; it is written back once at the end.
  auto cur_out_ops = cur_op->out_ops();
  for (auto out_op : cur_op->out_ops()) {
    // graph out op
    if (out_op->out_ops().empty()) {
      cur_out_ops.erase(find(cur_out_ops.begin(), cur_out_ops.end(), out_op));
    } else {
      auto post_op = out_op->out_ops()[0];
      // Replace out_op with cur_op in the consumer's producer list.
      auto post_in_ops = post_op->in_ops();
      for (size_t i = 0; i < post_in_ops.size(); i++) {
        if (post_in_ops[i] == out_op) {
          post_in_ops[i] = cur_op;
          break;
        }
      }
      post_op->set_in_ops(post_in_ops);

      // Replace out_op with its consumer in cur_op's consumer list.
      for (size_t i = 0; i < cur_out_ops.size(); i++) {
        if (cur_out_ops[i] == out_op) {
          cur_out_ops[i] = post_op;
          break;
        }
      }
    }
    RemoveAndFreeOp(out_op);
  }
  cur_op->set_out_ops(cur_out_ops);
  return RET_OK;
}
|
||||
|
||||
// Rewrite cur_op's input tensors so they bypass the producer transposes:
// each transpose's output tensor is replaced by the tensor its own producer
// emits. Constant inputs (per nodes2const_index) are re-attached at their
// original positions afterwards.
// NOTE(review): if pre_op's outputs do not contain in_tensor, cur_tensor
// stays nullptr and a null tensor is written into tensors_vec — presumably
// CheckFusion guarantees the match; confirm.
int UpdatePreTensors(NPUOp *cur_op) {
  auto tensors_vec = NPUPassUtils::GetNonConstInputs(cur_op);
  for (auto in_op : cur_op->in_ops()) {
    if (in_op->inputs().empty() || in_op->outputs().empty() || in_op->in_ops().empty()) {
      MS_LOG(ERROR) << "in_tensors/out_tensors/in_ops is empty.";
      return RET_ERROR;
    }
    tensor::MSTensor *cur_tensor = nullptr;
    auto in_tensor = in_op->inputs()[0];    // transpose's input tensor
    auto out_tensor = in_op->outputs()[0];  // transpose's output tensor
    auto pre_op = in_op->in_ops()[0];
    // Find the producer-side tensor that feeds the transpose.
    for (size_t i = 0; i < pre_op->outputs().size(); i++) {
      if (pre_op->outputs()[i] == in_tensor) {
        cur_tensor = pre_op->outputs()[i];
      }
    }
    // Swap the transpose's output for the producer's tensor in cur_op's inputs.
    for (size_t i = 0; i < tensors_vec.size(); i++) {
      if (tensors_vec[i] == out_tensor) {
        tensors_vec[i] = cur_tensor;
      }
    }
  }
  // add constant inputs back
  if (nodes2const_index.find(cur_op->type()) != nodes2const_index.end()) {
    tensors_vec.resize(cur_op->inputs().size());
    auto const_index = nodes2const_index[cur_op->type()];
    for (auto index : const_index) {
      tensors_vec[index] = cur_op->inputs()[index];
    }
  }
  cur_op->set_inputs(tensors_vec);
  return RET_OK;
}
|
||||
|
||||
// Rewrite cur_op's output tensors after its consumer transposes are fused
// away. Two cases:
//  1. node->nh2nc->nc2nh where the nc2nh is a graph output: keep the nc2nh op
//     and hand it cur_op's original output tensor so the graph output name is
//     preserved.
//  2. Otherwise reshape cur_op's output NHWC->NCHW in place and repoint every
//     downstream consumer's input at it.
// \return RET_OK on success, RET_ERROR if the output is not at least 4-D.
int UpdatePostTensors(NPUOp *cur_op) {
  auto tensor = cur_op->outputs()[0];

  // in case: node->nh2nc->nc2nh(graph output) --->>> node->nc2nh, node out_tensor should be put to nc2nh out tensors
  auto out_ops = cur_op->out_ops();
  // Fixed: dropped the redundant !out_ops.empty() — size() == 1 implies it.
  if (out_ops.size() == 1 && out_ops[0]->out_ops().size() == 1 &&
      out_ops[0]->out_ops()[0]->out_ops().empty() &&
      out_ops[0]->out_ops()[0]->type() == schema::PrimitiveType_Transpose) {
    auto nc_tensor = out_ops[0]->outputs()[0];  // nh2nc's out tensor
    cur_op->set_outputs({nc_tensor});
    auto post_post_op = out_ops[0]->out_ops()[0];
    // nc2nh op set in_tensor out_tensor
    auto post_post_k_in_tensors = post_post_op->inputs();
    post_post_k_in_tensors[0] = nc_tensor;
    post_post_op->set_inputs(post_post_k_in_tensors);
    post_post_op->set_outputs({tensor});
    return RET_OK;
  }

  auto nhwc_shape = tensor->shape();
  if (nhwc_shape.size() < kNumDims) {
    MS_LOG(ERROR) << "nhwc_shape < " << kNumDims;
    return RET_ERROR;
  }
  // NHWC -> NCHW in place.
  tensor->set_shape({nhwc_shape[0], nhwc_shape[3], nhwc_shape[1], nhwc_shape[2]});
  for (auto out_op : cur_op->out_ops()) {
    auto out_tensor = out_op->outputs()[0];
    if (out_op->out_ops().empty()) {
      // Transpose is a graph output: adopt its tensor to keep the output name.
      cur_op->set_outputs({out_op->outputs()[0]});
    }
    for (auto post_op : out_op->out_ops()) {
      auto tensors_vec = post_op->inputs();
      // Fixed: loop index was `int`, a signed/unsigned mismatch against size().
      for (size_t i = 0; i < tensors_vec.size(); i++) {
        if (tensors_vec[i] == out_tensor) {
          tensors_vec[i] = tensor;
        }
      }
      post_op->set_inputs(tensors_vec);
    }
  }
  return RET_OK;
}
|
||||
|
||||
// Run the four rewiring steps for one fused op, in order: tensors first
// (they read the not-yet-modified op links), then the op links themselves.
// Stops and reports at the first failing step.
int NPUFusionPass::UpdateOp(NPUOp *cur_op) {
  if (cur_op == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return RET_ERROR;
  }
  if (UpdatePreTensors(cur_op) != RET_OK) {
    MS_LOG(ERROR) << "UpdatePreTensors failed.";
    return RET_ERROR;
  }
  if (UpdatePostTensors(cur_op) != RET_OK) {
    MS_LOG(ERROR) << "UpdatePostTensors failed.";
    return RET_ERROR;
  }
  if (UpdatePreOps(cur_op) != RET_OK) {
    MS_LOG(ERROR) << "UpdatePreOps failed.";
    return RET_ERROR;
  }
  if (UpdatePostOps(cur_op) != RET_OK) {
    MS_LOG(ERROR) << "UpdatePostOps failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
// Generic fusion for ops with no axis to fix up (Add, Activation, Eltwise):
// just absorb the surrounding transposes.
int NPUFusionPass::CommonFusion(NPUOp *cur_op) {
  if (cur_op == nullptr) {
    return RET_ERROR;
  }
  if (UpdateOp(cur_op) != RET_OK) {
    MS_LOG(ERROR) << "UpdateOp failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
|
||||
|
||||
// Fuse the transposes around a Concat op, then convert its axis attribute
// from NHWC to NCHW numbering.
int NPUFusionPass::ConcatFusion(NPUOp *cur_op) {
  if (cur_op == nullptr) {
    return RET_ERROR;
  }
  // Fixed: validate the type BEFORE UpdateOp. The original checked afterwards,
  // so a type mismatch returned an error while leaving the graph half-rewired.
  if (cur_op->type() != schema::PrimitiveType_Concat) {
    return RET_ERROR;
  }
  auto ret = UpdateOp(cur_op);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "UpdateOp failed.";
    return RET_ERROR;
  }
  auto concat_op = static_cast<ConcatNPUOp *>(cur_op);
  concat_op->HandleAxis();
  return RET_OK;
}
|
||||
|
||||
// Fuse the transposes around a Split op, then convert its axis attribute
// from NHWC to NCHW numbering.
int NPUFusionPass::SplitFusion(NPUOp *cur_op) {
  if (cur_op == nullptr) {
    return RET_ERROR;
  }
  // Fixed: validate the type BEFORE UpdateOp. The original checked afterwards,
  // so a type mismatch returned an error while leaving the graph half-rewired.
  if (cur_op->type() != schema::PrimitiveType_Split) {
    return RET_ERROR;
  }
  auto ret = UpdateOp(cur_op);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "UpdateOp failed.";
    return RET_ERROR;
  }
  auto split_op = static_cast<SplitNPUOp *>(cur_op);
  split_op->HandleAxis();
  return RET_OK;
}
|
||||
|
||||
// Fuse the transposes around a Pad op, then reorder its paddings from NHWC
// to NCHW layout.
int NPUFusionPass::PadFusion(NPUOp *cur_op) {
  if (cur_op == nullptr) {
    return RET_ERROR;
  }
  // Fixed: validate the type BEFORE UpdateOp. The original checked afterwards,
  // so a type mismatch returned an error while leaving the graph half-rewired.
  if (cur_op->type() != schema::PrimitiveType_PadFusion) {
    return RET_ERROR;
  }
  auto ret = UpdateOp(cur_op);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "UpdateOp failed.";
    return RET_ERROR;
  }
  auto pad_op = static_cast<PadNPUOp *>(cur_op);
  pad_op->HandleAxis();
  return RET_OK;
}
|
||||
|
||||
// Fuse the transposes around a StridedSlice op, permute its begin/end/stride
// constant inputs from NHWC to NCHW, and fix its axis attributes.
// basic requirement: input is nhwc 4d
int NPUFusionPass::StridedSliceFusion(NPUOp *cur_op) {
  if (cur_op == nullptr) {
    return RET_ERROR;
  }
  // Fixed: type and arity checks moved BEFORE UpdateOp. The original ran
  // UpdateOp first, so a mismatch returned an error on a half-rewired graph.
  if (cur_op->type() != schema::PrimitiveType_StridedSlice) {
    return RET_ERROR;
  }
  if (cur_op->inputs().size() < kNumInputSize) {
    MS_LOG(ERROR) << "in tensors size < " << kNumInputSize;
    return RET_ERROR;
  }
  auto ret = UpdateOp(cur_op);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "UpdateOp failed.";
    return RET_ERROR;
  }
  // Permute the begin/end/stride vectors in place (NHWC index -> NCHW index).
  // NOTE(review): data() of these const tensors is assumed non-null — they
  // are constant inputs by the StridedSlice schema; confirm upstream checks.
  auto begin_tensor = cur_op->inputs().at(1);
  int *begin = reinterpret_cast<int *>(begin_tensor->data());
  (void)NPUPassUtils::AssistDataNHWC2NCHW(begin, 1);
  auto end_tensor = cur_op->inputs().at(2);
  int *end = reinterpret_cast<int *>(end_tensor->data());
  NPUPassUtils::AssistDataNHWC2NCHW(end, 1);
  // With 5 inputs the 4th is axes and the stride moves to index 4.
  auto stride_tensor = cur_op->inputs().at(3);
  if (cur_op->inputs().size() == 5) {
    stride_tensor = cur_op->inputs().at(4);
  }
  int *stride = reinterpret_cast<int *>(stride_tensor->data());
  NPUPassUtils::AssistDataNHWC2NCHW(stride, 1);

  auto stride_slice_op = static_cast<StridedSliceNPUOp *>(cur_op);
  stride_slice_op->HandleAxis();
  return RET_OK;
}
|
||||
|
||||
// Remove a transpose (cur_op) whose consumers are all its inverse transpose
// (per CheckFormatFusion): reconnect each grand-consumer directly to cur_op's
// producer (or to the graph input when cur_op has no producer), then free
// cur_op and all its consumer transposes.
int NPUFusionPass::FormatFusion(NPUOp *cur_op) {
  if (cur_op == nullptr) {
    return RET_ERROR;
  }
  auto is_input_op = cur_op->in_ops().empty();
  NPUOp *pre_op = nullptr;
  if (!is_input_op) {
    pre_op = cur_op->in_ops()[0];
  }
  auto in_tensor = cur_op->inputs()[0];
  // Consumers that must be inserted into pre_op's out_ops at cur_op's slot.
  std::vector<NPUOp *> pre_insert_ops;
  for (const auto &trans_op : cur_op->out_ops()) {
    if (trans_op->out_ops().empty() && !is_input_op) {
      // trans_op is a graph output: hand its output tensor to cur_op's
      // producer so the graph output tensor identity is preserved.
      // cur_op is a trans cur_op, it's input cur_op num and input tensor num must be 1
      cur_op->in_ops()[0]->set_outputs({trans_op->outputs()[0]});
      // in fp16 mode, tensor data type fp16 need to be changed back.
      auto tensor = cur_op->in_ops()[0]->outputs()[0];
      if (tensor->data_type() == kNumberTypeFloat16) {
        tensor->set_data_type(kNumberTypeFloat32);
      }
    }
    for (const auto &post_op : trans_op->out_ops()) {
      // update tensor: the pair cancels, so post_op reads cur_op's input.
      auto tensors_vec = post_op->inputs();
      for (size_t i = 0; i < tensors_vec.size(); i++) {
        if (tensors_vec[i] == trans_op->outputs()[0]) {
          tensors_vec[i] = in_tensor;
          break;
        }
      }
      post_op->set_inputs(tensors_vec);

      // update op: post_op's producer becomes pre_op (or nothing for a
      // graph-input transpose).
      auto post_in_ops = post_op->in_ops();
      for (size_t i = 0; i < post_in_ops.size(); i++) {
        if (post_in_ops[i] == trans_op) {
          if (is_input_op) {
            post_in_ops.erase(post_in_ops.begin() + i);
          } else {
            post_in_ops[i] = pre_op;
          }
          break;
        }
      }
      post_op->set_in_ops(post_in_ops);
      pre_insert_ops.push_back(post_op);
    }
    RemoveAndFreeOp(trans_op);
  }
  if (!is_input_op) {
    // Replace cur_op in pre_op's consumer list with the collected
    // grand-consumers, keeping the original position; fix sibling consumers
    // that referenced cur_op's input tensor.
    auto pre_out_ops = pre_op->out_ops();
    size_t cur_op_index = 0;
    for (size_t index = 0; index < pre_out_ops.size(); index++) {
      if (pre_out_ops[index] == cur_op) {
        pre_out_ops.erase(pre_out_ops.begin() + index);
        cur_op_index = index;
      } else {
        auto tensors_vec = pre_out_ops[index]->inputs();
        for (size_t i = 0; i < tensors_vec.size(); i++) {
          if (tensors_vec[i] == in_tensor) {
            tensors_vec[i] = pre_op->outputs()[0];
            break;
          }
        }
        pre_out_ops[index]->set_inputs(tensors_vec);
      }
    }
    pre_out_ops.insert(pre_out_ops.begin() + cur_op_index, pre_insert_ops.begin(), pre_insert_ops.end());
    pre_op->set_out_ops(pre_out_ops);
  }
  RemoveAndFreeOp(cur_op);
  return RET_OK;
}
|
||||
|
||||
// Entry point: first fuse transposes into the target ops found by
// CheckFusion, then remove cancelling transpose pairs (CheckFormatFusion).
//
// \param[in] subgraph the NPU subgraph whose op list is rewritten in place.
// \return RET_OK on success, RET_ERROR if any fusion step fails.
int NPUFusionPass::Run(NPUGraph *subgraph) {
  all_ops_ = subgraph->GetOps();
  for (size_t i = 0; i < all_ops_->size(); i++) {
    auto cur_op = (*all_ops_)[i];
    auto ret = RET_OK;
    if (CheckFusion(cur_op)) {
      // Fusion removes cur_op's producers from all_ops_, shifting cur_op
      // back by in_ops().size(); rewind i accordingly before fusing.
      // NOTE(review): assumes the producers precede cur_op in all_ops_ so i
      // cannot underflow — confirm the list is topologically ordered.
      switch (cur_op->type()) {
        case schema::PrimitiveType_Split:
          i -= cur_op->in_ops().size();
          ret = SplitFusion(cur_op);
          break;
        case schema::PrimitiveType_Concat:
          i -= cur_op->in_ops().size();
          ret = ConcatFusion(cur_op);
          break;
        case schema::PrimitiveType_PadFusion:
          i -= cur_op->in_ops().size();
          ret = PadFusion(cur_op);
          break;
        case schema::PrimitiveType_StridedSlice:
          i -= cur_op->in_ops().size();
          ret = StridedSliceFusion(cur_op);
          break;
        case schema::PrimitiveType_AddFusion:
        case schema::PrimitiveType_Activation:
        case schema::PrimitiveType_Eltwise:
          i -= cur_op->in_ops().size();
          ret = CommonFusion(cur_op);
          break;
        default:
          break;
      }
    }
    // Fixed: the original cases ended with `continue`, which jumped past this
    // check, so fusion failures were silently ignored.
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Fusion failed.";
      return RET_ERROR;
    }
  }
  for (size_t i = 0; i < all_ops_->size(); ++i) {
    auto cur_op = (*all_ops_)[i];
    if (CheckFormatFusion(cur_op)) {
      // FormatFusion erases at least cur_op from all_ops_; step back so the
      // element now at this index is not skipped. (At i == 0 the size_t wraps
      // and the loop's ++i brings it back to 0 — intentional.)
      i--;
      auto ret = FormatFusion(cur_op);
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "FormatFusion failed.";
        return RET_ERROR;
      }
    }
  }
  return RET_OK;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,46 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_FUSION_PASS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_FUSION_PASS_H_
|
||||
#include <vector>
|
||||
#include "src/delegate/npu/op//npu_op.h"
|
||||
#include "src/delegate/npu/pass/npu_base_pass.h"
|
||||
|
||||
namespace mindspore {
|
||||
class NPUFusionPass : public NPUBasePass {
|
||||
public:
|
||||
NPUFusionPass() { name_ = "NPUFusionPass"; }
|
||||
|
||||
int Run(NPUGraph *subgraph) override;
|
||||
|
||||
protected:
|
||||
int UpdatePreOps(NPUOp *cur_op);
|
||||
int UpdatePostOps(NPUOp *cur_op);
|
||||
void RemoveAndFreeOp(NPUOp *cur_op);
|
||||
int UpdateOp(NPUOp *cur_op);
|
||||
int CommonFusion(NPUOp *cur_op);
|
||||
int ConcatFusion(NPUOp *cur_op);
|
||||
int SplitFusion(NPUOp *cur_op);
|
||||
int PadFusion(NPUOp *cur_op);
|
||||
int StridedSliceFusion(NPUOp *cur_op);
|
||||
int FormatFusion(NPUOp *cur_op);
|
||||
|
||||
private:
|
||||
std::vector<NPUOp *> *all_ops_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_FUSION_PASS_H_
|
|
@ -0,0 +1,302 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "src/delegate/npu/pass/npu_insert_transform_pass.h"
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include "src/delegate/npu/pass/npu_pass_utils.h"
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace mindspore {
|
||||
enum InsertState { InsertNone, PreInsert, PostInsert, BothInsert };
|
||||
std::set<mindspore::schema::PrimitiveType> insert_nodes = {
|
||||
schema::PrimitiveType_Concat, schema::PrimitiveType_AddFusion, schema::PrimitiveType_Eltwise,
|
||||
schema::PrimitiveType_Activation, schema::PrimitiveType_Split, schema::PrimitiveType_PadFusion,
|
||||
schema::PrimitiveType_StridedSlice};
|
||||
|
||||
// this pass goal is to minimize subgraphs generated
|
||||
// by inserting nchw2nhwc or nhwc2nchw before or after the operator (e.g. concat, add, etc..) together with
|
||||
// fusion pass. If transpose inserted are more than half of input output, we will insert remaining input
|
||||
// output with transpose and hopefully do a fusion pass. Otherwise, we don't insert anything.
|
||||
//
|
||||
// Typically concat accept output from nchw2nhwc, we fill other input with nh2nc and nc2nh so that inputs to concat are
|
||||
// format same and then fusion all nchw2nhwc op.
|
||||
// e.g.
|
||||
// original (conv->nchw2nhwc, add(format nhwc)) -> concat-> (nhwc2nchw->conv)
|
||||
// current pass (conv->nchw2nhwc, add->nhwc2nchw->nchw2nhwc) -> concat -> (nhwc2nchw->conv)
|
||||
// fusion pass (conv, add->nhwc2nchw) -> concat -> conv
|
||||
// original 2 cpusubgraph, after 2 pass, only 1 cpu subgraph
|
||||
//
|
||||
// node:
|
||||
// Such ops require inputs all have same format, could be nchw or nhwc or other format.
|
||||
// Their inputs outputs may not be 4d, or are already format ok,
|
||||
// so we won't insert nc2nh or nh2nc when op's in ops and out ops contains no nc2nh or nh2nc.
|
||||
// This pass should be run after npu_transform_pass, which insert transpose for nchw-input-limited op like conv2d.
|
||||
|
||||
// Decide whether (and where) transpose pairs should be inserted around op so
// the fusion pass can later merge them. Returns an InsertState value (as int):
// insert before inputs, after outputs, both, or nothing.
int NPUInsertTransformPass::GetInsertState(NPUOp *op) {
  // filter out irrelevant op
  if (insert_nodes.find(op->type()) == insert_nodes.end()) {
    return InsertNone;
  }

  // current op is target op
  // use out ops to count how many out lines from current op
  std::vector<tensor::MSTensor *> inputs = NPUPassUtils::GetNonConstInputs(op);
  // Output side counts at least 1 line even for a graph-output op.
  size_t in_out_tensor_num =
    inputs.size() + std::max(std::max(op->out_ops().size(), static_cast<size_t>(1)), op->outputs().size());
  size_t transpose_input_num = 0;
  size_t transpose_output_num = 0;
  bool need_pre_insert = false;
  bool need_post_insert = false;
  // count number of input tensor from nc2nh and output tensor to nh2nc
  for (size_t i = 0; i < inputs.size(); ++i) {
    auto in_op = NPUPassUtils::OpInputFromOp(op, inputs.at(i));
    if (NPUPassUtils::IsNchw2Nhwc(in_op)) {
      transpose_input_num++;
    } else {
      need_pre_insert = true;
    }
  }
  if (op->out_ops().empty()) {
    need_post_insert = true;
  }
  for (const auto out_op : op->out_ops()) {
    if (NPUPassUtils::IsNhwc2Nchw(out_op)) {
      transpose_output_num++;
    } else {
      need_post_insert = true;
    }
  }

  // won't insert any thing if num of transpose tensor is smaller than half of total input output.
  // won't insert if total input output are all transpose tensor, the fusion pass will handle this.
  size_t transpose_tensor_num = transpose_input_num + transpose_output_num;
  if (transpose_tensor_num == 0 || transpose_tensor_num * 2 < in_out_tensor_num ||
      transpose_tensor_num == in_out_tensor_num) {
    return InsertNone;
  }
  InsertState ret;
  if (need_pre_insert && !need_post_insert) {
    ret = PreInsert;
  } else if (need_pre_insert && need_post_insert) {
    ret = BothInsert;
  } else if (!need_pre_insert && need_post_insert) {
    ret = PostInsert;
  } else {
    ret = InsertNone;
  }

  return ret;
}
|
||||
|
||||
// Insert a nhwc2nchw -> nchw2nhwc transpose pair on the edge between op and
// post_op. Creates two intermediate tensors (registered in all_tensors_) and
// two trans ops (appended to *trans_ops; the caller inserts them into the
// graph's op list). Exactly one of op / post_op may be nullptr:
//  - op == nullptr: the edge starts at a graph input.
//  - post_op == nullptr: the edge ends at a graph output; the output tensor's
//    name is transferred to the new nc2nh output so the graph API is stable.
// NOTE(review): if both op and post_op were nullptr (contract violation),
// in_tensor would stay nullptr and ->shape() would crash — assumed never to
// happen; confirm callers.
int NPUInsertTransformPass::InsertNode(NPUOp *op, NPUOp *post_op, size_t post_input_index,
                                       std::vector<NPUOp *> *trans_ops) {
  // Op and post_op can't be nullptr at the same time.
  std::string op_name;
  tensor::MSTensor *in_tensor = nullptr;

  std::vector<NPUOp *> out_ops;
  // If post_op equals nullptr, op is the output of whole graph.
  if (post_op != nullptr) {
    out_ops.push_back(post_op);
    op_name = post_op->name() + "_pre";
    in_tensor = post_op->inputs().at(post_input_index);
  }
  std::vector<NPUOp *> in_ops;
  // If op equals nullptr, post_op is the input of whole graph.
  if (op != nullptr && !op->outputs().empty()) {
    in_ops.push_back(op);
    op_name = op->name() + "_post";
    in_tensor = op->outputs()[0];
  }
  std::vector<int> nhwc_shape = in_tensor->shape();
  if (nhwc_shape.size() < 4) {
    MS_LOG(ERROR) << "nhwc_shape size < " << 4;
    return RET_ERROR;
  }
  std::vector<int> nchw_shape = {nhwc_shape[0], nhwc_shape[3], nhwc_shape[1], nhwc_shape[2]};

  // Intermediate tensor between the two transposes (NCHW layout).
  auto nh2nc_name = op_name + "_nh2nc_" + std::to_string(total++);
  auto nh2nc_tensor =
    tensor::MSTensor::CreateTensor(nh2nc_name + "/output0", in_tensor->data_type(), nchw_shape, nullptr, 0);
  if (nh2nc_tensor == nullptr) {
    MS_LOG(ERROR) << "New nchw tensor failed when inserting nchw2nhwc op.";
    return RET_ERROR;
  }
  nh2nc_tensor->set_tensor_name(nh2nc_name + "/output0");
  std::vector<tensor::MSTensor *> nh2nc_tensors = {nh2nc_tensor};
  all_tensors_->push_back(nh2nc_tensors[0]);

  // Output tensor of the pair (back in NHWC layout).
  auto nc2nh_name = op_name + "_nc2nh_" + std::to_string(total++);
  auto nc2nh_tensor =
    tensor::MSTensor::CreateTensor(nc2nh_name + "/output0", in_tensor->data_type(), nhwc_shape, nullptr, 0);
  if (nc2nh_tensor == nullptr) {
    MS_LOG(ERROR) << "New nhwc tensor failed when inserting nhwc2nchw op.";
    return RET_ERROR;
  }
  std::vector<tensor::MSTensor *> nc2nh_tensors = {nc2nh_tensor};
  all_tensors_->push_back(nc2nh_tensors[0]);

  auto *nh2nc_op = NPUPassUtils::CreateNhwc2NchwOp({in_tensor}, nh2nc_tensors, nh2nc_name);
  trans_ops->push_back(nh2nc_op);

  auto *nc2nh_op = NPUPassUtils::CreateNchw2NhwcOp(nh2nc_tensors, nc2nh_tensors, nc2nh_name);
  trans_ops->push_back(nc2nh_op);

  // Wire the new pair into the op graph on both sides.
  NPUPassUtils::UpdateOp(nh2nc_op, in_ops, {nc2nh_op}, {in_tensor}, nh2nc_tensors);
  NPUPassUtils::UpdateOp(nc2nh_op, {nh2nc_op}, out_ops, {nh2nc_tensors[0]}, nc2nh_tensors);
  if (op != nullptr) {
    NPUPassUtils::UpdateNH2NCTransNodePreOp(op, nh2nc_op, post_op);
  }
  if (post_op != nullptr) {
    NPUPassUtils::UpdateNC2NHTransNodePostOp(op, nc2nh_op, post_op);
  } else {
    // post_op nullptr mean output, we remain graph output tensor name unchanged
    auto graph_output_name = in_tensor->tensor_name();
    in_tensor->set_tensor_name(graph_output_name + "_before_" + name_);
    nc2nh_tensor->set_tensor_name(graph_output_name);
  }
  return RET_OK;
}
|
||||
|
||||
// Insert a transpose pair on op's in_tensor_index-th input edge, between
// pre_op (may be nullptr for a graph input) and op. Thin direction-naming
// wrapper around InsertNode.
int NPUInsertTransformPass::InsertForInputTensor(NPUOp *op, size_t in_tensor_index, NPUOp *pre_op,
                                                 std::vector<NPUOp *> *trans_ops) {
  // insert transpose nodes before target ops
  return InsertNode(pre_op, op, in_tensor_index, trans_ops);
}
|
||||
|
||||
// Insert a transpose pair on op's output edge feeding post_op's
// post_in_tensor_index-th input (post_op may be nullptr for a graph output).
// Thin direction-naming wrapper around InsertNode.
int NPUInsertTransformPass::InsertForOutputTensor(NPUOp *op, NPUOp *post_op, size_t post_in_tensor_index,
                                                  std::vector<NPUOp *> *trans_ops) {
  // insert transpose nodes after target ops
  return InsertNode(op, post_op, post_in_tensor_index, trans_ops);
}
|
||||
|
||||
// For every non-const input of op not already produced by an NCHW->NHWC
// transpose, insert a transpose pair on that input edge. New ops are
// collected in *trans_ops for the caller to splice into the op list.
int NPUInsertTransformPass::InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops) {
  int ret = RET_OK;
  auto inputs = NPUPassUtils::GetNonConstInputs(op);
  for (auto tensor : inputs) {
    auto pre_op = NPUPassUtils::OpInputFromOp(op, tensor);
    // Already fed by a nc2nh transpose: nothing to insert on this edge.
    if (NPUPassUtils::IsNchw2Nhwc(pre_op)) {
      continue;
    }
    // if this tensor is input of graph, pre_op is nullptr.
    auto it = find(op->inputs().begin(), op->inputs().end(), tensor);
    if (it == op->inputs().end()) {
      MS_LOG(ERROR) << "Find in tensor index error";
      return RET_ERROR;
    }
    size_t index = it - op->inputs().begin();
    ret = InsertForInputTensor(op, index, pre_op, trans_ops);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op before op " << op->name() << " failed.";
      return ret;
    }
  }
  return ret;
}
|
||||
|
||||
// For every consumer of op that is not already an NHWC->NCHW transpose,
// insert a transpose pair on op's output edge; additionally insert one on a
// graph-output edge (detected when op has more output tensors than consumer
// ops). New ops are collected in *trans_ops.
int NPUInsertTransformPass::InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops) {
  int ret = RET_OK;

  for (const auto post_op : op->out_ops()) {
    if (NPUPassUtils::IsNhwc2Nchw(post_op)) {
      continue;
    }
    auto post_op_in_tensors = post_op->inputs();
    // op's out tensor is one of post_op's input tensor
    auto it = std::find(post_op_in_tensors.begin(), post_op_in_tensors.end(), op->outputs().at(0));
    if (it == post_op_in_tensors.end()) {
      return RET_ERROR;
    }
    size_t input_index = it - post_op_in_tensors.begin();
    ret = InsertForOutputTensor(op, post_op, input_index, trans_ops);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op after op " << op->name() << " failed.";
      return ret;
    }
  }
  // NOTE(review): size comparison is a heuristic for "op feeds a graph
  // output"; assumes one graph-output tensor at most — confirm.
  if (op->outputs().size() > op->out_ops().size()) {
    // op out is graph output
    ret = InsertForOutputTensor(op, nullptr, 0, trans_ops);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op after op " << op->name() << " failed.";
      return ret;
    }
  }
  return ret;
}
|
||||
|
||||
// Entry point: scan the subgraph's ops and, per GetInsertState, insert
// transpose pairs before and/or after format-agnostic ops (concat, add, ...)
// so the later fusion pass can merge subgraphs. Runs two sweeps (j loop)
// because inserting around one op can make a neighbour eligible.
int NPUInsertTransformPass::Run(NPUGraph *subgraph) {
  all_ops_ = subgraph->GetOps();
  all_tensors_ = subgraph->GetInsertTensors();
  std::vector<NPUOp *> insert_ops;
  for (int j = 0; j < 2; ++j) {
    for (size_t i = 0; i < all_ops_->size(); i++) {
      auto op = (*all_ops_)[i];
      auto insert_state = GetInsertState(op);
      insert_ops.clear();
      // If the every output op is nhwc2nchw, insert
      // modify loop index add post_ops.size() to the next op in the origin vector
      switch (insert_state) {
        case PreInsert: {
          auto ret = InsertPreNodes(op, &insert_ops);
          if (ret != RET_OK) {
            MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op before op " << op->name() << " failed.";
            return RET_ERROR;
          }
          // New producers go before op; skip over them so op's successors
          // are visited next.
          all_ops_->insert(all_ops_->begin() + i, insert_ops.begin(), insert_ops.end());
          i += insert_ops.size();
          break;
        }
        case PostInsert: {
          auto ret = InsertPostNodes(op, &insert_ops);
          if (ret != RET_OK) {
            MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op after op " << op->name() << " failed.";
            return RET_ERROR;
          }
          // New consumers go right after op; skip over them.
          all_ops_->insert(all_ops_->begin() + i + 1, insert_ops.begin(), insert_ops.end());
          i += insert_ops.size();
          break;
        }
        case BothInsert: {
          auto ret = InsertPreNodes(op, &insert_ops);
          if (ret != RET_OK) {
            MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op before op " << op->name() << " failed.";
            return RET_ERROR;
          }
          all_ops_->insert(all_ops_->begin() + i, insert_ops.begin(), insert_ops.end());
          i += insert_ops.size();

          insert_ops.clear();
          ret = InsertPostNodes(op, &insert_ops);
          if (ret != RET_OK) {
            MS_LOG(ERROR) << "Insert nhwc2nchw op and nchw2nhwc op after op " << op->name() << " failed.";
            return RET_ERROR;
          }
          all_ops_->insert(all_ops_->begin() + i + 1, insert_ops.begin(), insert_ops.end());
          i += insert_ops.size();
          break;
        }
        default:
          MS_LOG(DEBUG) << "Insert Nothing on op " << op->name();
      }
    }
  }
  return RET_OK;
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,43 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_INSERT_TRANSFORM_PASS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_INSERT_TRANSFORM_PASS_H_
|
||||
#include <vector>
|
||||
#include "src/delegate/npu/op//npu_op.h"
|
||||
#include "src/delegate/npu/pass/npu_base_pass.h"
|
||||
namespace mindspore {
|
||||
class NPUInsertTransformPass : public NPUBasePass {
|
||||
public:
|
||||
NPUInsertTransformPass() { name_ = "NPUInsertTransformPass"; }
|
||||
|
||||
int Run(NPUGraph *subgraph) override;
|
||||
|
||||
private:
|
||||
int GetInsertState(NPUOp *op);
|
||||
int InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops);
|
||||
int InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops);
|
||||
int InsertNode(NPUOp *op, NPUOp *post_op, size_t post_input_index, std::vector<NPUOp *> *trans_ops);
|
||||
int InsertForInputTensor(NPUOp *op, size_t in_tensor_index, NPUOp *pre_op, std::vector<NPUOp *> *trans_ops);
|
||||
int InsertForOutputTensor(NPUOp *op, NPUOp *post_op, size_t post_in_tensor_index, std::vector<NPUOp *> *trans_ops);
|
||||
|
||||
private:
|
||||
int total = 0;
|
||||
std::vector<NPUOp *> *all_ops_;
|
||||
std::vector<tensor::MSTensor *> *all_tensors_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_INSERT_TRANSFORM_PASS_H_
|
|
@ -14,25 +14,31 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/runtime/agent/npu/optimizer/npu_pass_manager.h"
|
||||
#include "src/delegate/npu/pass/npu_pass_manager.h"
|
||||
#include "include/errorcode.h"
|
||||
#include "src/common/log_adapter.h"
|
||||
namespace mindspore::lite {
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace mindspore {
|
||||
void NPUPassManager::AddPass(NPUBasePass *pass) { all_pass_.push_back(pass); }
|
||||
int NPUPassManager::Run() {
|
||||
|
||||
int NPUPassManager::RunPass(NPUGraph *subgraph) {
|
||||
for (auto pass : all_pass_) {
|
||||
auto ret = pass->Run();
|
||||
auto ret = pass->Run(subgraph);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "NPU Pass Run failed. Pass name is:" << pass->name();
|
||||
MS_LOG(ERROR) << "NPU Pass Run failed. Pass name is:" << pass->name() << " for subgraph " << subgraph->name();
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
void NPUPassManager::Clear() {
|
||||
for (auto pass : all_pass_) {
|
||||
delete pass;
|
||||
}
|
||||
all_pass_.clear();
|
||||
}
|
||||
} // namespace mindspore::lite
|
||||
} // namespace mindspore
|
|
@ -1,5 +1,5 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -14,12 +14,11 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_MANAGER_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_MANAGER_H_
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_PASS_MANAGER_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_PASS_MANAGER_H_
|
||||
#include <vector>
|
||||
#include "src/runtime/agent/npu/optimizer/npu_base_pass.h"
|
||||
#include "src/inner_context.h"
|
||||
namespace mindspore::lite {
|
||||
#include "src/delegate/npu/pass/npu_base_pass.h"
|
||||
namespace mindspore {
|
||||
class NPUPassManager {
|
||||
public:
|
||||
static NPUPassManager *GetInstance() {
|
||||
|
@ -31,12 +30,12 @@ class NPUPassManager {
|
|||
|
||||
void AddPass(NPUBasePass *pass);
|
||||
|
||||
int Run();
|
||||
int RunPass(NPUGraph *subgraph);
|
||||
|
||||
void Clear();
|
||||
|
||||
private:
|
||||
std::vector<NPUBasePass *> all_pass_;
|
||||
};
|
||||
} // namespace mindspore::lite
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_AGENT_NPU_OPTIMIZER_NPU_PASS_MANAGER_H_
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_PASS_MANAGER_H_
|
|
@ -0,0 +1,250 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/pass/npu_pass_utils.h"
|
||||
#include <algorithm>
|
||||
#include "nnacl/scale.h"
|
||||
#include "src/delegate/npu/op/scale_npu.h"
|
||||
#include "src/delegate/npu/op/transpose_npu.h"
|
||||
|
||||
namespace mindspore {
|
||||
std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index{
|
||||
{schema::PrimitiveType_Split, {1}},
|
||||
{schema::PrimitiveType_PadFusion, {1}},
|
||||
{schema::PrimitiveType_StridedSlice, {1, 2, 3}}};
|
||||
|
||||
NPUOp *NPUPassUtils::CreateNchw2NhwcOp(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, const std::string &name) {
|
||||
std::vector<int> perm = {0, 2, 3, 1};
|
||||
auto npu_op = new (std::nothrow) TransposeNPUOp(in_tensors, out_tensors, perm, name);
|
||||
if (npu_op == nullptr) {
|
||||
MS_LOG(ERROR) << "New Nchw2Nhwc NPUOp failed.";
|
||||
return nullptr;
|
||||
}
|
||||
return npu_op;
|
||||
}
|
||||
|
||||
NPUOp *NPUPassUtils::CreateNhwc2NchwOp(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, const std::string &name) {
|
||||
std::vector<int> perm = {0, 3, 1, 2};
|
||||
auto npu_op = new (std::nothrow) TransposeNPUOp(in_tensors, out_tensors, perm, name);
|
||||
if (npu_op == nullptr) {
|
||||
MS_LOG(ERROR) << "New Nhwc2Nchw NPUOp failed.";
|
||||
return nullptr;
|
||||
}
|
||||
return npu_op;
|
||||
}
|
||||
|
||||
void NPUPassUtils::UpdateOp(NPUOp *op, const std::vector<NPUOp *> &in_ops, const std::vector<NPUOp *> &out_ops,
|
||||
const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &outputs) {
|
||||
op->set_inputs(in_tensors);
|
||||
op->set_outputs(outputs);
|
||||
op->set_in_ops(in_ops);
|
||||
op->set_out_ops(out_ops);
|
||||
}
|
||||
|
||||
void NPUPassUtils::UpdateNH2NCTransNodePreOp(NPUOp *pre_op, NPUOp *trans_op, NPUOp *op) {
|
||||
// For op before trans, update the out_ops; the output tensor of op is the input tensor of trans.
|
||||
std::vector<NPUOp *> out_ops = pre_op->out_ops();
|
||||
for (size_t i = 0; i < out_ops.size(); i++) {
|
||||
if (out_ops[i] == op) {
|
||||
out_ops[i] = trans_op;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (out_ops.empty()) {
|
||||
out_ops.push_back(trans_op);
|
||||
}
|
||||
pre_op->set_out_ops(out_ops);
|
||||
}
|
||||
|
||||
void NPUPassUtils::UpdateNC2NHTransNodePreOp(NPUOp *pre_op, const std::vector<NPUOp *> &trans_ops,
|
||||
const std::vector<NPUOp *> &ops) {
|
||||
// For op before trans, there may be multiple outputs.
|
||||
auto cur_out_ops = pre_op->out_ops();
|
||||
for (size_t i = 0; i < ops.size(); i++) {
|
||||
auto itr = find(cur_out_ops.begin(), cur_out_ops.end(), ops[i]);
|
||||
if (itr != cur_out_ops.end()) {
|
||||
cur_out_ops.erase(itr);
|
||||
}
|
||||
}
|
||||
std::copy(trans_ops.begin(), trans_ops.end(), std::back_inserter(cur_out_ops));
|
||||
pre_op->set_out_ops(cur_out_ops);
|
||||
// For op before trans, the output tensor is used for output tensor of trans, so replace the output tensor
|
||||
// with the input tensor of trans.
|
||||
pre_op->set_outputs({trans_ops.at(0)->inputs().at(0)});
|
||||
}
|
||||
|
||||
void NPUPassUtils::UpdateNH2NCTransNodePostOp(NPUOp *trans_op, NPUOp *post_op) {
|
||||
auto cur_in_tensors = post_op->inputs();
|
||||
cur_in_tensors[0] = trans_op->outputs()[0];
|
||||
post_op->set_inputs(cur_in_tensors);
|
||||
post_op->set_in_ops({trans_op});
|
||||
}
|
||||
|
||||
void NPUPassUtils::UpdateNC2NHPostOpInTensors(NPUOp *op, NPUOp *trans_op, NPUOp *post_op) {
|
||||
// For post_op that doesn't require insert trans op, because the output tensor of op(input tensor of
|
||||
// trans_op) is updated, replace the input tensor of post_op.
|
||||
auto post_in_tensors = post_op->inputs();
|
||||
for (size_t i = 0; i < post_in_tensors.size(); i++) {
|
||||
if (post_in_tensors[i] == op->outputs()[0]) {
|
||||
post_in_tensors[i] = trans_op->inputs()[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
post_op->set_inputs(post_in_tensors);
|
||||
}
|
||||
|
||||
void NPUPassUtils::UpdateNC2NHTransNodePostOp(NPUOp *op, NPUOp *trans_op, NPUOp *post_op) {
|
||||
// The input tensor should be replaced with the output tensor of trans_op.
|
||||
auto post_in_tensors = post_op->inputs();
|
||||
tensor::MSTensor *old_in_tensor = nullptr;
|
||||
// find out which input tensor of post_op should be updated
|
||||
for (size_t i = 0; i < post_in_tensors.size(); ++i) {
|
||||
if (OpInputFromOp(post_op, post_in_tensors.at(i)) == op) {
|
||||
old_in_tensor = post_in_tensors.at(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (old_in_tensor == nullptr) {
|
||||
MS_LOG(WARNING) << "Could not find in tensor index";
|
||||
return;
|
||||
}
|
||||
std::replace(post_in_tensors.begin(), post_in_tensors.end(), old_in_tensor, trans_op->outputs().at(0));
|
||||
post_op->set_inputs(post_in_tensors);
|
||||
|
||||
// For post_op after trans, op in in_ops should be replaced with trans_op.
|
||||
auto post_in_ops = post_op->in_ops();
|
||||
if (op == nullptr) {
|
||||
post_in_ops.push_back(trans_op);
|
||||
} else {
|
||||
std::replace(post_in_ops.begin(), post_in_ops.end(), op, trans_op);
|
||||
}
|
||||
post_op->set_in_ops(post_in_ops);
|
||||
}
|
||||
|
||||
bool NPUPassUtils::IsNhwc2Nchw(NPUOp *op) {
|
||||
if (op == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (op->type() != schema::PrimitiveType_Transpose) {
|
||||
return false;
|
||||
}
|
||||
auto transpose_op = static_cast<TransposeNPUOp *>(op);
|
||||
std::vector<int> perm = transpose_op->GetPerm();
|
||||
std::vector<int> nh2nc_perm = {0, 3, 1, 2};
|
||||
if (perm != nh2nc_perm) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool NPUPassUtils::IsNchw2Nhwc(NPUOp *op) {
|
||||
if (op == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (op->type() != schema::PrimitiveType_Transpose) {
|
||||
return false;
|
||||
}
|
||||
auto transpose_op = static_cast<TransposeNPUOp *>(op);
|
||||
std::vector<int> perm = transpose_op->GetPerm();
|
||||
std::vector<int> nc2nh_perm = {0, 2, 3, 1};
|
||||
if (perm != nc2nh_perm) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
NPUOp *NPUPassUtils::OpInputFromOp(NPUOp *op, tensor::MSTensor *in_tensor) {
|
||||
// given op and input tensor index, get which op output this tensor.
|
||||
// If input tensor is graph input, return nullptr.
|
||||
if (op == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
auto in_ops = op->in_ops();
|
||||
auto output_contain = [in_tensor](NPUOp *op) {
|
||||
auto outputs = op->outputs();
|
||||
return std::find(outputs.begin(), outputs.end(), in_tensor) != outputs.end();
|
||||
};
|
||||
auto it = std::find_if(in_ops.begin(), in_ops.end(), output_contain);
|
||||
if (it == in_ops.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
return *it;
|
||||
}
|
||||
|
||||
std::vector<tensor::MSTensor *> NPUPassUtils::GetNonConstInputs(NPUOp *op) {
|
||||
if (op == nullptr) {
|
||||
return std::vector<tensor::MSTensor *>{};
|
||||
}
|
||||
auto type = op->type();
|
||||
auto it = nodes2const_index.find(type);
|
||||
if (it != nodes2const_index.end()) {
|
||||
auto const_input_indices = it->second;
|
||||
std::vector<tensor::MSTensor *> non_const_in_tensors;
|
||||
auto in_tensors = op->inputs();
|
||||
for (auto i = 0; i < in_tensors.size(); ++i) {
|
||||
if (const_input_indices.find(i) == const_input_indices.end()) {
|
||||
non_const_in_tensors.push_back(in_tensors[i]);
|
||||
}
|
||||
}
|
||||
return non_const_in_tensors;
|
||||
}
|
||||
return op->inputs();
|
||||
}
|
||||
|
||||
bool NPUPassUtils::Scale4dCase(NPUOp *op) {
|
||||
if (op == nullptr) {
|
||||
return false;
|
||||
}
|
||||
if (op->type() != schema::PrimitiveType_ScaleFusion) {
|
||||
return false;
|
||||
}
|
||||
auto scale_op = static_cast<ScaleNPUOp *>(op);
|
||||
auto axis = scale_op->GetAxis();
|
||||
auto in_tensor = op->inputs().at(0);
|
||||
auto scale_tensor = op->inputs().at(1);
|
||||
return in_tensor->shape().size() == 4 && scale_tensor->shape().size() == 1 && (axis == 3 || axis == -1);
|
||||
}
|
||||
|
||||
void NPUPassUtils::AssistDataNHWC2NCHW(int *data, size_t unit_size) {
|
||||
MS_ASSERT(data != nullptr);
|
||||
for (size_t i = 0; i < unit_size; ++i) {
|
||||
int c = data[3 * unit_size + i];
|
||||
// n h w c
|
||||
// n c h w
|
||||
data[3 * unit_size + i] = data[2 * unit_size + i];
|
||||
data[2 * unit_size + i] = data[unit_size + i];
|
||||
data[unit_size + i] = c;
|
||||
}
|
||||
}
|
||||
|
||||
int NPUPassUtils::MaskDataNHWC2NCHW(int mask) {
|
||||
int mask_vec[4];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
mask_vec[i] = (uint32_t)(mask) & (1 << i);
|
||||
}
|
||||
AssistDataNHWC2NCHW(mask_vec, 1);
|
||||
int ret = 0;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
if (mask_vec[i]) {
|
||||
ret += 1 << i;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,73 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_PASS_UTILS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_PASS_UTILS_H_
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include "mindspore/lite/src/delegate/npu/op//transpose_npu.h"
|
||||
#include "src/delegate/npu/op//npu_op.h"
|
||||
namespace mindspore {
|
||||
extern std::unordered_map<schema::PrimitiveType, std::set<int>> nodes2const_index;
|
||||
class NPUPassUtils {
|
||||
public:
|
||||
static NPUOp *CreateNchw2NhwcOp(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, const std::string &name);
|
||||
|
||||
static NPUOp *CreateNhwc2NchwOp(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, const std::string &name);
|
||||
|
||||
static void UpdateOp(NPUOp *op, const std::vector<NPUOp *> &in_ops, const std::vector<NPUOp *> &out_ops,
|
||||
const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors);
|
||||
|
||||
static void UpdateNH2NCTransNodePreOp(NPUOp *pre_op, NPUOp *trans_op, NPUOp *op);
|
||||
|
||||
static void UpdateNC2NHTransNodePreOp(NPUOp *pre_op, const std::vector<NPUOp *> &trans_ops,
|
||||
const std::vector<NPUOp *> &ops);
|
||||
|
||||
static void UpdateNH2NCTransNodePostOp(NPUOp *trans_op, NPUOp *post_op);
|
||||
|
||||
static void UpdateNC2NHTransNodePostOp(NPUOp *op, NPUOp *trans_op, NPUOp *post_op);
|
||||
|
||||
static void UpdateNC2NHPostOpInTensors(NPUOp *op, NPUOp *trans_op, NPUOp *post_op);
|
||||
|
||||
static bool IsNhwc2Nchw(NPUOp *op);
|
||||
|
||||
static bool IsNchw2Nhwc(NPUOp *op);
|
||||
static NPUOp *OpInputFromOp(NPUOp *op, tensor::MSTensor *in_tensor);
|
||||
static std::vector<tensor::MSTensor *> GetNonConstInputs(NPUOp *op);
|
||||
static bool Scale4dCase(NPUOp *op);
|
||||
static void AssistDataNHWC2NCHW(int *data, size_t unit_size);
|
||||
static int MaskDataNHWC2NCHW(int mask);
|
||||
};
|
||||
|
||||
// todo y00520784 : refactor the code of transform op
|
||||
class RuntimePass {
|
||||
public:
|
||||
RuntimePass(std::vector<NPUOp *> *ops, std::vector<tensor::MSTensor *> *tensors)
|
||||
: all_ops_(ops), all_tensors_(tensors) {}
|
||||
int InsertPreOp(NPUOp *op, tensor::MSTensor *in_edges, schema::Primitive *primitive);
|
||||
int InsertPostOp(NPUOp *op, NPUOp *out_edges, schema::Primitive *primitive);
|
||||
|
||||
private:
|
||||
std::vector<NPUOp *> *all_ops_;
|
||||
std::vector<tensor::MSTensor *> *all_tensors_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_PASS_UTILS_H_
|
|
@ -0,0 +1,200 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "src/delegate/npu/pass/npu_transform_pass.h"
|
||||
#include <vector>
|
||||
#include "src/delegate/npu/pass/npu_pass_utils.h"
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace mindspore {
|
||||
std::set<mindspore::schema::PrimitiveType> nchw_nodes = {
|
||||
schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_Conv2dTransposeFusion, schema::PrimitiveType_Resize,
|
||||
schema::PrimitiveType_MaxPoolFusion, schema::PrimitiveType_AvgPoolFusion, schema::PrimitiveType_ScaleFusion,
|
||||
schema::PrimitiveType_CropAndResize};
|
||||
|
||||
int NPUTransformPass::InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops) {
|
||||
bool is_input_op = op->in_ops().empty();
|
||||
// not always single input (like CropAndResize), but we care about the input with 4d.
|
||||
auto it = std::find_if(op->in_ops().begin(), op->in_ops().end(),
|
||||
[](NPUOp *k) { return k->outputs().size() > 0 && k->outputs()[0]->shape().size() == 4; });
|
||||
if (!is_input_op && it == op->in_ops().end()) {
|
||||
MS_LOG(ERROR) << "NPU Transform pass does not find in op with 4d output";
|
||||
return RET_ERROR;
|
||||
}
|
||||
if (is_input_op || nchw_nodes.find((*it)->type()) == nchw_nodes.end()) {
|
||||
NPUOp *pre_op = nullptr;
|
||||
if (!is_input_op) {
|
||||
pre_op = *it;
|
||||
}
|
||||
|
||||
// Create pre transform op's out tensor.
|
||||
auto name = op->name() + "_pre_trans" + "_Nhwc2Nchw_" + std::to_string(total++);
|
||||
auto nhwc_shape = op->inputs()[0]->shape();
|
||||
std::vector<int> nchw_shape = {nhwc_shape[0], nhwc_shape[3], nhwc_shape[1], nhwc_shape[2]};
|
||||
auto tensor =
|
||||
tensor::MSTensor::CreateTensor(name + "/output0", op->inputs()[0]->data_type(), nchw_shape, nullptr, 0);
|
||||
if (tensor == nullptr) {
|
||||
MS_LOG(ERROR) << "New nchw tensor failed when inserting pre nhwc2nchw op.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
std::vector<tensor::MSTensor *> pre_trans_outputs = {tensor};
|
||||
all_tensors_->push_back(pre_trans_outputs[0]);
|
||||
|
||||
// Create pre transform op: Nhwc2Nchw
|
||||
auto *trans_op = NPUPassUtils::CreateNhwc2NchwOp({op->inputs()[0]}, pre_trans_outputs, name);
|
||||
|
||||
trans_ops->push_back(trans_op);
|
||||
|
||||
// Set in_ops, out_ops, inputs, outputs for transform op
|
||||
std::vector<NPUOp *> pre_trans_in_ops;
|
||||
if (!is_input_op) {
|
||||
pre_trans_in_ops = {pre_op};
|
||||
}
|
||||
NPUPassUtils::UpdateOp(trans_op, pre_trans_in_ops, {op}, trans_op->inputs(), pre_trans_outputs);
|
||||
|
||||
if (pre_op != nullptr) {
|
||||
NPUPassUtils::UpdateNH2NCTransNodePreOp(pre_op, trans_op, op);
|
||||
}
|
||||
NPUPassUtils::UpdateNH2NCTransNodePostOp(trans_op, op);
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int NPUTransformPass::InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops,
|
||||
std::vector<tensor::MSTensor *> graph_outputs) {
|
||||
bool is_output_op = false;
|
||||
if (op->out_ops().empty() ||
|
||||
find(graph_outputs.begin(), graph_outputs.end(), op->outputs()[0]) != graph_outputs.end()) {
|
||||
is_output_op = true;
|
||||
}
|
||||
// Get the post op that need insert trans op.
|
||||
// If no need for inserting trans op, the post op must be npu and in trans_nodes.
|
||||
std::vector<NPUOp *> post_insert_ops;
|
||||
std::vector<NPUOp *> post_non_insert_ops;
|
||||
for (int i = 0; i < op->out_ops().size(); i++) {
|
||||
auto post_op = op->out_ops()[i];
|
||||
if (nchw_nodes.find(post_op->type()) == nchw_nodes.end()) {
|
||||
post_insert_ops.push_back(post_op);
|
||||
} else {
|
||||
post_non_insert_ops.push_back(post_op);
|
||||
}
|
||||
}
|
||||
if (!is_output_op && post_insert_ops.empty()) {
|
||||
return RET_OK;
|
||||
}
|
||||
// Create post transform op's in tensor.
|
||||
auto name = op->name() + "_post_trans" + "_Nchw2Nhwc" + std::to_string(total++);
|
||||
|
||||
auto nhwc_shape = op->outputs()[0]->shape();
|
||||
std::vector<int> nchw_shape = {nhwc_shape[0], nhwc_shape[3], nhwc_shape[1], nhwc_shape[2]};
|
||||
auto nc2nh_tensor =
|
||||
tensor::MSTensor::CreateTensor(name + "/input0", op->outputs()[0]->data_type(), nchw_shape, nullptr, 0);
|
||||
if (nc2nh_tensor == nullptr) {
|
||||
MS_LOG(ERROR) << "New nchw tensor failed when inserting post nchw2nhwc op.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_tensors_->push_back(nc2nh_tensor);
|
||||
|
||||
if (is_output_op) {
|
||||
std::vector<tensor::MSTensor *> nc2nh_outputs{op->outputs().at(0)};
|
||||
// Create post transform op: Nchw2Nhwc
|
||||
auto *post_trans_op = NPUPassUtils::CreateNchw2NhwcOp({nc2nh_tensor}, nc2nh_outputs, name);
|
||||
// Set in_ops, out_ops, inputs, outputs for transform op
|
||||
NPUPassUtils::UpdateOp(post_trans_op, {op}, {}, post_trans_op->inputs(), post_trans_op->outputs());
|
||||
trans_ops->push_back(post_trans_op);
|
||||
}
|
||||
// for each to-be-insert out op, create one transpose op, one perm tensor, one out tensor
|
||||
// but using same one in_tensor.
|
||||
for (auto i = 0; i < post_insert_ops.size(); ++i) {
|
||||
auto post_insert_op = post_insert_ops.at(i);
|
||||
// nc2nh op out tensor: 1st op uses original out_tensor, remaining ops use newly created out tensor.
|
||||
std::vector<tensor::MSTensor *> nc2nh_outputs{nullptr};
|
||||
|
||||
auto origin_out_tensor = op->outputs().at(0);
|
||||
auto out_tensor_name = op->name() + "_post_trans" + "_Nchw2Nhwc_" + std::to_string(i) + "_out_tensor";
|
||||
auto out_tensor = tensor::MSTensor::CreateTensor(out_tensor_name, origin_out_tensor->data_type(),
|
||||
origin_out_tensor->shape(), nullptr, 0);
|
||||
if (out_tensor == nullptr) {
|
||||
MS_LOG(ERROR) << "New nhwc tensor failed when inserting post nchw2nhwc op.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_tensors_->push_back(out_tensor);
|
||||
nc2nh_outputs[0] = out_tensor;
|
||||
|
||||
// Create post transform op: Nchw2Nhwc
|
||||
auto *post_trans_op =
|
||||
NPUPassUtils::CreateNchw2NhwcOp({nc2nh_tensor}, nc2nh_outputs, name + "_" + std::to_string(i));
|
||||
// Set in_ops, out_ops, inputs, outputs for transform op
|
||||
NPUPassUtils::UpdateOp(post_trans_op, {op}, {post_insert_op}, post_trans_op->inputs(), post_trans_op->outputs());
|
||||
trans_ops->push_back(post_trans_op);
|
||||
// update post op inputs in_ops
|
||||
NPUPassUtils::UpdateNC2NHTransNodePostOp(op, post_trans_op, post_insert_op);
|
||||
}
|
||||
// for those non-insert post ops, update their in_tensor
|
||||
for (auto non_insert_op : post_non_insert_ops) {
|
||||
auto inputs = non_insert_op->inputs();
|
||||
std::replace(inputs.begin(), inputs.end(), op->outputs().at(0), nc2nh_tensor);
|
||||
non_insert_op->set_inputs(inputs);
|
||||
}
|
||||
// update origin op's out tensor and out op
|
||||
NPUPassUtils::UpdateNC2NHTransNodePreOp(op, *trans_ops, post_insert_ops);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int NPUTransformPass::Run(NPUGraph *subgraph) {
|
||||
all_ops_ = subgraph->GetOps();
|
||||
all_tensors_ = subgraph->GetInsertTensors();
|
||||
auto graph_outputs = subgraph->outputs();
|
||||
for (size_t i = 0; i < all_ops_->size();) {
|
||||
auto op = (*all_ops_)[i];
|
||||
if (nchw_nodes.find(op->type()) == nchw_nodes.end()) {
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
if (op->type() == schema::PrimitiveType_ScaleFusion && !NPUPassUtils::Scale4dCase(op)) {
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
if (op->type() == schema::PrimitiveType_Resize && op->inputs()[0]->shape()[1] > op->outputs()[0]->shape()[1]) {
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
// insert pre_ops before op in vector
|
||||
// modify loop index add (pre_ops.size() + 1) to the post_ops insert location
|
||||
std::vector<NPUOp *> pre_ops;
|
||||
auto ret = InsertPreNodes(op, &pre_ops);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nhwc2nchw op before op " << op->name() << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_ops_->insert(all_ops_->begin() + i, pre_ops.begin(), pre_ops.end());
|
||||
i += (pre_ops.size() + 1);
|
||||
|
||||
// insert post_ops after op in vector
|
||||
// modify loop index add post_ops.size() to the next op in the origin vector
|
||||
std::vector<NPUOp *> post_ops;
|
||||
ret = InsertPostNodes(op, &post_ops, graph_outputs);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Insert nchw2nhwc op after op " << op->name() << " failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
all_ops_->insert(all_ops_->begin() + i, post_ops.begin(), post_ops.end());
|
||||
i += post_ops.size();
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,43 @@
|
|||
/**
|
||||
* Copyright 2020-2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_TRANSFORM_PASS_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_TRANSFORM_PASS_H_
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include "src/delegate/npu/op//npu_op.h"
|
||||
#include "src/delegate/npu/pass/npu_base_pass.h"
|
||||
|
||||
namespace mindspore {
|
||||
class NPUTransformPass : public NPUBasePass {
|
||||
public:
|
||||
NPUTransformPass() { name_ = "NPUTransformPass"; }
|
||||
|
||||
int Run(NPUGraph *subgraph) override;
|
||||
|
||||
private:
|
||||
int InsertPreNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops);
|
||||
|
||||
int InsertPostNodes(NPUOp *op, std::vector<NPUOp *> *trans_ops, std::vector<tensor::MSTensor *> graph_outputs);
|
||||
|
||||
private:
|
||||
int total = 0;
|
||||
std::vector<NPUOp *> *all_ops_;
|
||||
std::vector<tensor::MSTensor *> *all_tensors_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_PASS_NPU_TRANSFORM_PASS_H_
|
|
@ -0,0 +1,161 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "src/delegate/npu/transpose_kernel.h"
|
||||
namespace mindspore {
|
||||
#define C8NUM 8
|
||||
#ifdef ENABLE_ARM64
|
||||
inline void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) {
|
||||
size_t srcStride = src_stride * sizeof(float);
|
||||
size_t dstStride = dst_stride * sizeof(float);
|
||||
asm volatile(
|
||||
"mov x10, %[src_ptr]\n"
|
||||
"mov x11, %[dst_ptr]\n"
|
||||
|
||||
"ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n"
|
||||
"ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n"
|
||||
|
||||
"zip1 v8.4s, v0.4s, v2.4s\n"
|
||||
"zip2 v9.4s, v0.4s, v2.4s\n"
|
||||
"zip1 v12.4s, v1.4s, v3.4s\n"
|
||||
"zip2 v13.4s, v1.4s, v3.4s\n"
|
||||
|
||||
"ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n"
|
||||
"ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n"
|
||||
|
||||
"zip1 v10.4s, v4.4s, v6.4s\n"
|
||||
"zip2 v11.4s, v4.4s, v6.4s\n"
|
||||
"zip1 v14.4s, v5.4s, v7.4s\n"
|
||||
"zip2 v15.4s, v5.4s, v7.4s\n"
|
||||
|
||||
"ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n"
|
||||
"ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n"
|
||||
|
||||
"trn1 v16.2d, v8.2d, v10.2d\n"
|
||||
"trn2 v18.2d, v8.2d, v10.2d\n"
|
||||
"trn1 v20.2d, v9.2d, v11.2d\n"
|
||||
"trn2 v22.2d, v9.2d, v11.2d\n"
|
||||
|
||||
"ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n"
|
||||
"ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n"
|
||||
|
||||
"trn1 v24.2d, v12.2d, v14.2d\n"
|
||||
"trn2 v26.2d, v12.2d, v14.2d\n"
|
||||
"trn1 v28.2d, v13.2d, v15.2d\n"
|
||||
"trn2 v30.2d, v13.2d, v15.2d\n"
|
||||
|
||||
"zip1 v8.4s, v0.4s, v2.4s\n"
|
||||
"zip2 v9.4s, v0.4s, v2.4s\n"
|
||||
"zip1 v12.4s, v1.4s, v3.4s\n"
|
||||
"zip2 v13.4s, v1.4s, v3.4s\n"
|
||||
|
||||
"zip1 v10.4s, v4.4s, v6.4s\n"
|
||||
"zip2 v11.4s, v4.4s, v6.4s\n"
|
||||
"zip1 v14.4s, v5.4s, v7.4s\n"
|
||||
"zip2 v15.4s, v5.4s, v7.4s\n"
|
||||
|
||||
"trn1 v17.2d, v8.2d, v10.2d\n"
|
||||
"trn2 v19.2d, v8.2d, v10.2d\n"
|
||||
"trn1 v21.2d, v9.2d, v11.2d\n"
|
||||
"trn2 v23.2d, v9.2d, v11.2d\n"
|
||||
|
||||
"trn1 v25.2d, v12.2d, v14.2d\n"
|
||||
"trn2 v27.2d, v12.2d, v14.2d\n"
|
||||
"trn1 v29.2d, v13.2d, v15.2d\n"
|
||||
"trn2 v31.2d, v13.2d, v15.2d\n"
|
||||
|
||||
"st1 {v16.4s, v17.4s}, [x11], %[dstStride]\n"
|
||||
"st1 {v18.4s, v19.4s}, [x11], %[dstStride]\n"
|
||||
"st1 {v20.4s, v21.4s}, [x11], %[dstStride]\n"
|
||||
"st1 {v22.4s, v23.4s}, [x11], %[dstStride]\n"
|
||||
"st1 {v24.4s, v25.4s}, [x11], %[dstStride]\n"
|
||||
"st1 {v26.4s, v27.4s}, [x11], %[dstStride]\n"
|
||||
"st1 {v28.4s, v29.4s}, [x11], %[dstStride]\n"
|
||||
"st1 {v30.4s, v31.4s}, [x11], %[dstStride]\n"
|
||||
|
||||
:
|
||||
: [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
|
||||
: "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
|
||||
"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||
"v31");
|
||||
}
|
||||
#endif
|
||||
|
||||
void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel) {
|
||||
int hw8 = plane / C8NUM * C8NUM;
|
||||
int task_start = 0;
|
||||
int task_end = plane;
|
||||
int batch = plane * channel;
|
||||
for (int n = 0; n < batches; n++) {
|
||||
const float *src_batch = (const float *)src + n * batch;
|
||||
float *dst_batch = reinterpret_cast<float *>(dst) + n * batch;
|
||||
int hw = task_start;
|
||||
for (; hw < hw8; hw += C8NUM) {
|
||||
int c = 0;
|
||||
#ifdef ENABLE_ARM64
|
||||
for (; c <= channel - C8NUM; c += C8NUM) {
|
||||
const float *src_ptr = src_batch + hw * channel + c;
|
||||
float *dst_ptr = dst_batch + c * plane + hw;
|
||||
Transpose8X8Fp32Arm64(src_ptr, dst_ptr, channel, plane);
|
||||
}
|
||||
#endif
|
||||
for (; c < channel; c++) {
|
||||
const float *src_ptr = src_batch + hw * channel + c;
|
||||
float *dst_ptr = dst_batch + c * plane + hw;
|
||||
for (size_t i = 0; i < C8NUM; i++) {
|
||||
dst_ptr[i] = src_ptr[i * channel];
|
||||
}
|
||||
}
|
||||
}
|
||||
for (; hw < task_end; hw++) {
|
||||
const float *src_ptr = src_batch + hw * channel;
|
||||
float *dst_ptr = dst_batch + hw;
|
||||
for (size_t i = 0; i < channel; i++) {
|
||||
dst_ptr[i * plane] = src_ptr[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel) {
|
||||
return PackNHWCToNCHWFp32(src, dst, batch, channel, plane);
|
||||
}
|
||||
|
||||
int TransposeNPUKernel::Execute() {
|
||||
std::vector<int> nh2nc_perm = {0, 3, 1, 2};
|
||||
std::vector<int> nc2nh_perm = {0, 2, 3, 1};
|
||||
if (perm_ != nh2nc_perm && perm_ != nc2nh_perm) {
|
||||
MS_LOG(ERROR) << "NPU transpose op only supports nhwc->nchw or nchw->nhwc.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto shape = inputs()[0]->shape();
|
||||
if (shape.size() != 4) {
|
||||
MS_LOG(ERROR) << "NPU transpose op only supports input of 4 dims.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
auto input = inputs()[0]->data();
|
||||
auto output = outputs()[0]->data();
|
||||
if (perm_ == nh2nc_perm) {
|
||||
PackNHWCToNCHWFp32(input, output, shape[0], shape[1] * shape[2], shape[3]);
|
||||
} else if (perm_ == nc2nh_perm) {
|
||||
PackNCHWToNHWCFp32(input, output, shape[0], shape[2] * shape[3], shape[1]);
|
||||
} else {
|
||||
MS_LOG(ERROR) << "NPU transpose op only supports nhwc->nchw or nchw->nhwc.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore
|
|
@ -0,0 +1,59 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_TRANSPOSE_KERNEL_H_
|
||||
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_TRANSPOSE_KERNEL_H_
|
||||
#include <utility>
#include <vector>
#include <string>
#include "include/graph/op/all_ops.h"
#include "include/kernel.h"
#include "include/errorcode.h"
#include "src/common/log_adapter.h"
|
||||
|
||||
using mindspore::lite::RET_ERROR;
|
||||
using mindspore::lite::RET_OK;
|
||||
|
||||
namespace mindspore {
|
||||
// CPU layout-conversion helpers used by the NPU transpose kernel.
// Transpose NHWC -> NCHW: plane = H * W, channel = C, float32 data.
void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel);

// Transpose NCHW -> NHWC; implemented by calling the NHWC->NCHW routine
// with plane and channel swapped.
void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel);
|
||||
|
||||
class TransposeNPUKernel : public kernel::Kernel {
|
||||
public:
|
||||
TransposeNPUKernel(const std::vector<tensor::MSTensor *> &in_tensors,
|
||||
const std::vector<tensor::MSTensor *> &out_tensors, std::vector<int> perm, std::string name)
|
||||
: kernel::Kernel(in_tensors, out_tensors, nullptr, nullptr) {
|
||||
type_ = schema::PrimitiveType_Transpose;
|
||||
name_ = name;
|
||||
perm_ = perm;
|
||||
}
|
||||
|
||||
~TransposeNPUKernel() override = default;
|
||||
|
||||
int Prepare() override { return RET_OK; }
|
||||
|
||||
int Execute() override;
|
||||
|
||||
int ReSize() override {
|
||||
MS_LOG(ERROR) << "NPU does not support the resize function temporarily.";
|
||||
return lite::RET_ERROR;
|
||||
}
|
||||
|
||||
protected:
|
||||
std::vector<int> perm_;
|
||||
};
|
||||
} // namespace mindspore
|
||||
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NPU_TRANSPOSE_KERNEL_H_
|
|
@ -19,7 +19,6 @@
|
|||
#include "src/common/log_adapter.h"
|
||||
#include "src/common/utils.h"
|
||||
#ifdef SUPPORT_NPU
|
||||
#include "src/runtime/agent/npu/npu_manager.h"
|
||||
#include "include/HiAiModelManagerType.h"
|
||||
#endif
|
||||
#ifdef SUPPORT_GPU
|
||||
|
@ -40,22 +39,6 @@ InnerContext::InnerContext(const Context *context) {
|
|||
#endif
|
||||
}
|
||||
|
||||
#if SUPPORT_NPU
|
||||
// NPU-aware constructor: copies the threading/allocator configuration from
// the user-supplied Context and stores a non-owning pointer to the shared
// NPUManager so later NPU queries can use it.
InnerContext::InnerContext(const Context *context, NPUManager *npu_manager) {
  this->allocator = context->allocator;
  this->thread_num_ = context->thread_num_;
  this->enable_parallel_ = context->enable_parallel_;
  SetContextDevice(context);
  // NOTE(review): npu_manager_ appears to be declared under
  // #ifndef MS_COMPILE_IOS in the header; this unguarded assignment may not
  // compile for an iOS build with SUPPORT_NPU — confirm the guard pairing.
  this->npu_manager_ = npu_manager;
#ifdef ENABLE_ARM
#ifndef MS_COMPILE_IOS
  // Probe fp16 support once at construction (ARM only, not on iOS builds).
  cpu_info_ = new CpuInfo;
  fp16_flag_ = cpu_info_->ArmIsSupportFp16();
#endif
#endif
}
|
||||
#endif
|
||||
|
||||
void InnerContext::SetContextDevice(const Context *context) {
|
||||
bool isUserSetNPU = context->device_list_.end() !=
|
||||
std::find_if(context->device_list_.begin(), context->device_list_.end(),
|
||||
|
@ -218,8 +201,8 @@ bool InnerContext::IsGpuEnabled() const {
|
|||
|
||||
bool InnerContext::IsNpuEnabled() const {
|
||||
#ifdef SUPPORT_NPU
|
||||
MS_ASSERT(npu_manager_ != nullptr);
|
||||
return IsUserSetNpu() && npu_manager_->IsSupportNPU();
|
||||
// return IsUserSetNpu() && npu_manager_->IsSupportNPU();
|
||||
return IsUserSetNpu();
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
|
|
|
@ -24,9 +24,6 @@
|
|||
#ifdef ENABLE_ARM
|
||||
#include "src/cpu_info.h"
|
||||
#endif
|
||||
#ifdef SUPPORT_NPU
|
||||
#include "src/runtime/agent/npu/npu_manager.h"
|
||||
#endif
|
||||
|
||||
namespace mindspore::lite {
|
||||
struct InnerContext : public Context {
|
||||
|
@ -34,9 +31,7 @@ struct InnerContext : public Context {
|
|||
InnerContext() = default;
|
||||
|
||||
explicit InnerContext(const Context *context);
|
||||
#if SUPPORT_NPU
|
||||
InnerContext(const Context *context, NPUManager *npu_manager);
|
||||
#endif
|
||||
|
||||
int Init();
|
||||
|
||||
bool IsCpuFloat16Enabled() const;
|
||||
|
@ -89,12 +84,6 @@ struct InnerContext : public Context {
|
|||
CpuInfo *cpu_info_ = nullptr;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if SUPPORT_NPU
|
||||
#ifndef MS_COMPILE_IOS
|
||||
NPUManager *npu_manager_ = nullptr;
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num);
|
||||
|
|
|
@ -33,8 +33,7 @@
|
|||
#include "src/mindrt_executor.h"
|
||||
#endif
|
||||
#if SUPPORT_NPU
|
||||
#include "src/runtime/agent/npu/npu_manager.h"
|
||||
#include "src/runtime/agent/npu/optimizer/npu_pass_manager.h"
|
||||
#include "src/delegate/npu/npu_delegate.h"
|
||||
#endif
|
||||
#if GPU_OPENCL
|
||||
#include "src/runtime/kernel/opencl/opencl_subgraph.h"
|
||||
|
@ -499,11 +498,7 @@ int LiteSession::CompileGraph(Model *model) {
|
|||
return ret;
|
||||
}
|
||||
// scheduler kernels
|
||||
#if SUPPORT_NPU
|
||||
Scheduler scheduler(context_, model, &tensors_, is_train_session_, npu_manager_, npu_pass_manager_, delegate_);
|
||||
#else
|
||||
Scheduler scheduler(context_, model, &tensors_, is_train_session_, delegate_);
|
||||
#endif
|
||||
scheduler.SetupSchedulerCb(std::move(sched_cb_));
|
||||
ret = scheduler.Schedule(&kernels_);
|
||||
if (ret != RET_OK) {
|
||||
|
@ -511,15 +506,6 @@ int LiteSession::CompileGraph(Model *model) {
|
|||
is_running_.store(false);
|
||||
return ret;
|
||||
}
|
||||
#if SUPPORT_NPU
|
||||
if (this->context_->IsNpuEnabled()) {
|
||||
MS_ASSERT(npu_manager_ != nullptr);
|
||||
if (npu_manager_->LoadOMModel() != RET_OK) {
|
||||
MS_LOG(ERROR) << "NPU client load model failed.";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
InitGraphInOutTensors(model);
|
||||
|
||||
bool use_mindrt_run = IfUseMindrtExecutor();
|
||||
|
@ -636,30 +622,12 @@ int LiteSession::Init(const Context *context) {
|
|||
MS_LOG(ERROR) << "Not support multi-threading";
|
||||
return RET_ERROR;
|
||||
}
|
||||
#if SUPPORT_NPU
|
||||
npu_manager_ = new (std::nothrow) NPUManager();
|
||||
if (npu_manager_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New npu_manager_ failed";
|
||||
is_running_.store(false);
|
||||
return RET_ERROR;
|
||||
}
|
||||
npu_pass_manager_ = new (std::nothrow) NPUPassManager();
|
||||
if (npu_pass_manager_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New npu_pass_manager_ failed";
|
||||
is_running_.store(false);
|
||||
return RET_ERROR;
|
||||
}
|
||||
#endif
|
||||
if (context == nullptr) {
|
||||
MS_LOG(ERROR) << "context is nullptr";
|
||||
is_running_.store(false);
|
||||
return RET_NULL_PTR;
|
||||
}
|
||||
#if SUPPORT_NPU
|
||||
this->context_ = new (std::nothrow) InnerContext(context, npu_manager_);
|
||||
#else
|
||||
this->context_ = new (std::nothrow) InnerContext(context);
|
||||
#endif
|
||||
if (this->context_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New Context failed";
|
||||
is_running_.store(false);
|
||||
|
@ -674,9 +642,22 @@ int LiteSession::Init(const Context *context) {
|
|||
if (context->delegate != nullptr) {
|
||||
delegate_ = context->delegate;
|
||||
}
|
||||
#if SUPPORT_NPU
|
||||
if (delegate_ == nullptr && context_->IsNpuEnabled()) {
|
||||
delegate_ = std::shared_ptr<NPUDelegate>(new (std::nothrow) NPUDelegate(context_->GetNpuInfo()));
|
||||
if (delegate_ == nullptr) {
|
||||
MS_LOG(ERROR) << "New delegate_ failed";
|
||||
return RET_ERROR;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (delegate_ != nullptr) {
|
||||
auto delegate_ret = delegate_->Init();
|
||||
if (delegate_ret != RET_OK) {
|
||||
if (delegate_ret == RET_NOT_SUPPORT) {
|
||||
MS_LOG(DEBUG) << "Delegate is unsupported";
|
||||
delegate_ = nullptr;
|
||||
}
|
||||
if (delegate_ret == RET_ERROR) {
|
||||
MS_LOG(ERROR) << "Delegate init failed";
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
@ -741,14 +722,6 @@ LiteSession::~LiteSession() {
|
|||
|
||||
delete this->executor_;
|
||||
this->executor_ = nullptr;
|
||||
#if SUPPORT_NPU
|
||||
MS_ASSERT(npu_manager_ != nullptr);
|
||||
MS_ASSERT(npu_pass_manager_ != nullptr);
|
||||
npu_pass_manager_->Clear();
|
||||
delete npu_pass_manager_;
|
||||
npu_manager_->Reset();
|
||||
delete npu_manager_;
|
||||
#endif
|
||||
#if GPU_OPENCL
|
||||
delete opencl_runtime_wrapper_;
|
||||
#endif
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue