From ed8e877f02ab3e810963bfc18b58b4940057045f Mon Sep 17 00:00:00 2001 From: jpc_chenjianping Date: Mon, 4 Jul 2022 20:24:26 +0800 Subject: [PATCH] copy tensor code to runtime --- mindspore/lite/src/CMakeLists.txt | 4 +- .../delegate/tensorrt/tensorrt_subgraph.cc | 4 + .../parameter_cache/cache_algorithm.h | 43 ++ .../delegate/parameter_cache/cache_mem_base.h | 41 + .../parameter_cache/embedding_cache.cc | 237 ++++++ .../parameter_cache/embedding_cache.h | 89 +++ .../embedding_cache_manager.cc | 194 +++++ .../parameter_cache/embedding_cache_manager.h | 60 ++ .../parameter_cache/factory_mgr_base.h | 81 ++ .../parameter_cache/gpu/gpu_cache_mem.cc | 158 ++++ .../parameter_cache/gpu/gpu_cache_mem.h | 48 ++ .../delegate/parameter_cache/lfu_cache.cc | 243 ++++++ .../delegate/parameter_cache/lfu_cache.h | 55 ++ .../parameter_cache/load_host_cache_model.cc | 148 ++++ .../parameter_cache/load_host_cache_model.h | 48 ++ .../runtime/delegate/tensorrt/CMakeLists.txt | 95 +++ .../delegate/tensorrt/cuda_impl/activation.cu | 56 ++ .../tensorrt/cuda_impl/activation.cuh | 26 + .../delegate/tensorrt/cuda_impl/cast.cu | 49 ++ .../delegate/tensorrt/cuda_impl/cast.cuh | 23 + .../tensorrt/cuda_impl/cublas_utils.cc | 70 ++ .../tensorrt/cuda_impl/cublas_utils.h | 62 ++ .../tensorrt/cuda_impl/cuda_helper.cc | 48 ++ .../delegate/tensorrt/cuda_impl/cuda_helper.h | 63 ++ .../tensorrt/cuda_impl/cudnn_utils.cc | 41 + .../delegate/tensorrt/cuda_impl/cudnn_utils.h | 48 ++ .../delegate/tensorrt/cuda_impl/equal.cu | 35 + .../delegate/tensorrt/cuda_impl/equal.cuh | 23 + .../delegate/tensorrt/cuda_impl/hash.cu | 64 ++ .../delegate/tensorrt/cuda_impl/hash.cuh | 27 + .../delegate/tensorrt/cuda_impl/logical.cu | 63 ++ .../delegate/tensorrt/cuda_impl/logical.cuh | 29 + .../delegate/tensorrt/cuda_impl/normalize.cu | 98 +++ .../delegate/tensorrt/cuda_impl/normalize.cuh | 24 + .../delegate/tensorrt/cuda_impl/utils.cuh | 41 + .../distribution/distribution_base.cc | 23 + .../tensorrt/distribution/distribution_base.h | 31 + .../distribution/distribution_base_impl.cc | 28 + .../distribution/distribution_collective.cc | 38 + .../distribution/distribution_collective.h | 45 ++ .../distribution_collective_impl.cc | 72 ++ .../distribution/distribution_utils.cc | 58 ++ .../distribution/distribution_utils.h | 32 + .../tensorrt/op/activation_opt_plugin.cc | 116 +++ .../tensorrt/op/activation_opt_plugin.h | 72 ++ .../tensorrt/op/activation_tensorrt.cc | 153 ++++ .../tensorrt/op/activation_tensorrt.h | 43 ++ .../tensorrt/op/allgather_tensorrt.cc | 113 +++ .../delegate/tensorrt/op/allgather_tensorrt.h | 75 ++ .../delegate/tensorrt/op/cast_plugin.cc | 83 ++ .../delegate/tensorrt/op/cast_plugin.h | 67 ++ .../delegate/tensorrt/op/cast_tensorrt.cc | 79 ++ .../delegate/tensorrt/op/cast_tensorrt.h | 43 ++ .../delegate/tensorrt/op/concate_tensorrt.cc | 158 ++++ .../delegate/tensorrt/op/concate_tensorrt.h | 50 ++ .../tensorrt/op/convolution_tensorrt.cc | 187 +++++ .../tensorrt/op/convolution_tensorrt.h | 43 ++ .../tensorrt/op/deconvolution_tensorrt.cc | 199 +++++ .../tensorrt/op/deconvolution_tensorrt.h | 43 ++ .../tensorrt/op/elementwise_tensorrt.cc | 312 ++++++++ .../tensorrt/op/elementwise_tensorrt.h | 50 ++ .../delegate/tensorrt/op/equal_tensorrt.cc | 96 +++ .../delegate/tensorrt/op/equal_tensorrt.h | 63 ++ .../tensorrt/op/fullyconnected_tensorrt.cc | 106 +++ .../tensorrt/op/fullyconnected_tensorrt.h | 45 ++ .../delegate/tensorrt/op/gather_d_tensorrt.cc | 139 ++++ .../delegate/tensorrt/op/gather_d_tensorrt.h | 80 ++ 
.../delegate/tensorrt/op/gather_tensorrt.cc | 108 +++ .../delegate/tensorrt/op/gather_tensorrt.h | 42 + .../tensorrt/op/logical_not_tensorrt.cc | 119 +++ .../tensorrt/op/logical_not_tensorrt.h | 78 ++ .../delegate/tensorrt/op/logical_tensorrt.cc | 129 ++++ .../delegate/tensorrt/op/logical_tensorrt.h | 78 ++ .../delegate/tensorrt/op/lstm_tensorrt.cc | 493 ++++++++++++ .../delegate/tensorrt/op/lstm_tensorrt.h | 115 +++ .../delegate/tensorrt/op/matmul_opt_plugin.cc | 202 +++++ .../delegate/tensorrt/op/matmul_opt_plugin.h | 80 ++ .../delegate/tensorrt/op/matmul_tensorrt.cc | 310 ++++++++ .../delegate/tensorrt/op/matmul_tensorrt.h | 62 ++ .../tensorrt/op/normalize_opt_plugin.cc | 59 ++ .../tensorrt/op/normalize_opt_plugin.h | 61 ++ .../tensorrt/op/normalize_tensorrt.cc | 178 +++++ .../delegate/tensorrt/op/normalize_tensorrt.h | 56 ++ .../delegate/tensorrt/op/pad_tensorrt.cc | 140 ++++ .../delegate/tensorrt/op/pad_tensorrt.h | 42 + .../delegate/tensorrt/op/pool_tensorrt.cc | 220 ++++++ .../delegate/tensorrt/op/pool_tensorrt.h | 55 ++ .../delegate/tensorrt/op/prelu_tensorrt.cc | 79 ++ .../delegate/tensorrt/op/prelu_tensorrt.h | 39 + .../delegate/tensorrt/op/reduce_tensorrt.cc | 139 ++++ .../delegate/tensorrt/op/reduce_tensorrt.h | 44 ++ .../tensorrt/op/reducescatter_tensorrt.cc | 126 +++ .../tensorrt/op/reducescatter_tensorrt.h | 83 ++ .../delegate/tensorrt/op/resize_tensorrt.cc | 230 ++++++ .../delegate/tensorrt/op/resize_tensorrt.h | 52 ++ .../delegate/tensorrt/op/scale_tensorrt.cc | 227 ++++++ .../delegate/tensorrt/op/scale_tensorrt.h | 57 ++ .../tensorrt/op/scatternd_tensorrt.cc | 99 +++ .../delegate/tensorrt/op/scatternd_tensorrt.h | 39 + .../delegate/tensorrt/op/shape_tensorrt.cc | 69 ++ .../delegate/tensorrt/op/shape_tensorrt.h | 38 + .../delegate/tensorrt/op/shuffle_tensorrt.cc | 437 +++++++++++ .../delegate/tensorrt/op/shuffle_tensorrt.h | 58 ++ .../delegate/tensorrt/op/slice_tensorrt.cc | 281 +++++++ .../delegate/tensorrt/op/slice_tensorrt.h | 66 ++ .../delegate/tensorrt/op/softmax_tensorrt.cc | 95 +++ .../delegate/tensorrt/op/softmax_tensorrt.h | 43 ++ .../delegate/tensorrt/op/split_tensorrt.cc | 160 ++++ .../delegate/tensorrt/op/split_tensorrt.h | 45 ++ .../delegate/tensorrt/op/tensorrt_op.cc | 132 ++++ .../delegate/tensorrt/op/tensorrt_op.h | 175 +++++ .../delegate/tensorrt/op/tensorrt_plugin.cc | 81 ++ .../delegate/tensorrt/op/tensorrt_plugin.h | 106 +++ .../delegate/tensorrt/op/tile_tensorrt.cc | 183 +++++ .../delegate/tensorrt/op/tile_tensorrt.h | 94 +++ .../delegate/tensorrt/op/topk_tensorrt.cc | 160 ++++ .../delegate/tensorrt/op/topk_tensorrt.h | 49 ++ .../delegate/tensorrt/op/unary_tensorrt.cc | 84 ++ .../delegate/tensorrt/op/unary_tensorrt.h | 56 ++ .../delegate/tensorrt/tensorrt_allocator.cc | 150 ++++ .../delegate/tensorrt/tensorrt_allocator.h | 64 ++ .../delegate/tensorrt/tensorrt_context.cc | 56 ++ .../delegate/tensorrt/tensorrt_context.h | 40 + .../delegate/tensorrt/tensorrt_delegate.cc | 243 ++++++ .../delegate/tensorrt/tensorrt_delegate.h | 70 ++ .../delegate/tensorrt/tensorrt_runtime.cc | 52 ++ .../delegate/tensorrt/tensorrt_runtime.h | 82 ++ .../delegate/tensorrt/tensorrt_serializer.cc | 63 ++ .../delegate/tensorrt/tensorrt_serializer.h | 45 ++ .../delegate/tensorrt/tensorrt_subgraph.cc | 681 +++++++++++++++++ .../delegate/tensorrt/tensorrt_subgraph.h | 159 ++++ .../delegate/tensorrt/tensorrt_utils.cc | 721 ++++++++++++++++++ .../delegate/tensorrt/tensorrt_utils.h | 184 +++++ 133 files changed, 14041 insertions(+), 2 deletions(-) create mode 100644 
mindspore/lite/src/runtime/delegate/parameter_cache/cache_algorithm.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/cache_mem_base.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/factory_mgr_base.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/CMakeLists.txt create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cuh create mode 100755 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cu create mode 100755 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/utils.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base_impl.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective_impl.cc create mode 100644 
mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h create mode 100644 
mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.cc create mode 100644 
mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.h diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index 679383b93bf..bb6f1942f2c 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -498,14 +498,14 @@ if(SUPPORT_TENSORRT) set(CUDA_LIB_PATH ${CUDA_PATH}/lib64) include_directories(${TENSORRT_PATH}/include) include_directories(${CUDA_PATH}/include) - add_subdirectory(extendrt/delegate/tensorrt) + add_subdirectory(runtime/delegate/tensorrt) endif() target_link_libraries(mindspore-lite tensorrt_kernel_mid cuda_kernel_mid gpu_distribution_collective) target_link_libraries(mindspore-lite_static tensorrt_kernel_mid cuda_kernel_mid gpu_distribution_collective) else() if(NOT MSLITE_ENABLE_CLOUD_FUSION_INFERENCE) set(TENSORRT_STUB - ${CMAKE_CURRENT_SOURCE_DIR}/extendrt/delegate/tensorrt/distribution/distribution_base.cc + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/delegate/tensorrt/distribution/distribution_base.cc ) add_library(tensorrt_stub OBJECT ${TENSORRT_STUB}) endif() diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc index dc78b17383b..a085955c6dc 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc @@ -381,6 +381,10 @@ int TensorRTSubGraph::Prepare() { return RET_ERROR; } int binding_num = this->engine_->getNbBindings(); + if (binding_num < 0) { + MS_LOG(ERROR) << "invalid binding_num " << binding_num; + return RET_ERROR; + } tensor_bindings_ = new (std::nothrow) void *[binding_num]; if (tensor_bindings_ == nullptr) { MS_LOG(ERROR) << "malloc tensor binding array failed."; diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/cache_algorithm.h b/mindspore/lite/src/runtime/delegate/parameter_cache/cache_algorithm.h new file mode 100644 index 00000000000..c496b76b947 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/cache_algorithm.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_
+
+#include <vector>
+#include "include/api/status.h"
+
+namespace mindspore {
+namespace cache {
+struct CacheNoe {
+  CacheNoe(int _index, int _frequency, int _value) : key(_index), frequency(_frequency), value(_value) {}
+  int key;  // host input index
+  int frequency;
+  int value;  // cache index
+};
+
+class CacheAlgorithm {
+ public:
+  virtual ~CacheAlgorithm() {}
+  virtual int Get(int key) = 0;
+  virtual void Put(int key, int value) = 0;
+  virtual Status Init(size_t cache_size, int min_host_index, int max_host_index) = 0;
+  virtual Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index,
+                               std::vector<int> *need_swap_indies, std::vector<int> *need_swap_indies_cache_index) = 0;
+};
+}  // namespace cache
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_
diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/cache_mem_base.h b/mindspore/lite/src/runtime/delegate/parameter_cache/cache_mem_base.h
new file mode 100644
index 00000000000..8844e787404
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/parameter_cache/cache_mem_base.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_
+#include <cstddef>
+#include <cstdint>
+
+namespace mindspore {
+namespace cache {
+class CacheMemBase {
+ public:
+  CacheMemBase() = default;
+  virtual ~CacheMemBase() = default;
+  virtual bool InitDevice(uint32_t device_id, const void *context) = 0;
+  virtual void *MallocMemory(size_t size) = 0;
+  virtual void FreeMemory(void *buf) = 0;
+  virtual bool SynchronizeStream() = 0;
+  virtual bool CopyHostMemToDevice(void *dst, const void *src, size_t size) = 0;
+  virtual bool CopyDeviceMemToHost(void *dst, const void *src, size_t size) = 0;
+  virtual bool HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr,
+                           size_t cache_vocab_size, size_t embedding_size, size_t swap_out_size) = 0;
+  virtual bool HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr,
+                          size_t cache_vocab_size, size_t embedding_size, size_t swap_in_size) = 0;
+};
+}  // namespace cache
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_
diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.cc
new file mode 100644
index 00000000000..10222514736
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.cc
@@ -0,0 +1,237 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "src/runtime/delegate/parameter_cache/embedding_cache.h" +#include +#include +#include +#include +#include +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" +#include "src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h" +#include "src/runtime/delegate/parameter_cache/lfu_cache.h" +#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h" + +namespace { +constexpr size_t kEmbeddingTensorShapeSize = 2; +} +namespace mindspore { +namespace cache { +void LookUpTableTask(size_t indices_lens, size_t first_dim_size, const char *input_addr, const int *indices_addr, + char *output_addr, size_t embedding_len, int min_host_index) { + for (size_t i = 0; i < indices_lens; ++i) { + int index = indices_addr[i] - min_host_index; + if (index >= 0 && index < static_cast(first_dim_size)) { + size_t pos = index * embedding_len; + std::memcpy(output_addr, input_addr + pos, embedding_len); + } else { + memset(output_addr, 0, embedding_len); + } + output_addr += embedding_len; + } +} + +EmbeddingCache::~EmbeddingCache() { + if (hash_swap_value_device_addr_ != nullptr) { + device_cache_->FreeMemory(hash_swap_value_device_addr_); + hash_swap_value_device_addr_ = nullptr; + } + if (hash_swap_value_addr_ != nullptr) { + free(hash_swap_value_addr_); + hash_swap_value_addr_ = nullptr; + } + if (hash_swap_index_addr_ != nullptr) { + device_cache_->FreeMemory(hash_swap_index_addr_); + hash_swap_index_addr_ = nullptr; + } +} + +Status EmbeddingCache::Init(mindspore::MSTensor host_cache_tensor, mindspore::MSTensor device_tensor) { + MS_ASSERT(device_tensor.Shape().size() == kEmbeddingTensorShapeSize); + MS_ASSERT(host_cache_tensor.Shape().size() == kEmbeddingTensorShapeSize); + MS_ASSERT(device_tensor.DataType() == host_cache_tensor.DataType()); + MS_ASSERT(host_cache_tensor.Data() != nullptr); + + if (device_tensor.Shape()[1] != host_cache_tensor.Shape()[1]) { + MS_LOG(ERROR) << device_tensor.Name() << " embedding_size is invalid, device size is " << device_tensor.Shape()[1] + << ", host size is " << host_cache_tensor.Shape()[1]; + return kLiteError; + } + if (host_cache_size_ != host_cache_tensor.Shape()[0]) { + MS_LOG(ERROR) << device_tensor.Name() << " host_cache_size is invalid, host_cache_size" + << host_cache_tensor.Shape()[0] << ", index begin:" << min_host_index_ + << ", index end:" << max_host_index_ << "rank_group_size_ num:" << rank_group_size_ + << ", rank id:" << rank_id_ << ", vocab_size_:" << vocab_size_; + return kLiteError; + } + + data_type_ = device_tensor.DataType(); + switch (data_type_) { + case DataType::kNumberTypeFloat32: + sizeof_data_type_ = sizeof(float); + break; + default: + MS_LOG(ERROR) << device_tensor.Name() << " unsupported data type " << static_cast(data_type_); + return kLiteError; + } + host_addr_ = host_cache_tensor.MutableData(); + embedding_size_ = device_tensor.Shape()[1]; + device_start_index_ = device_cache_size_ * rank_id_; + // host cache tensor is device tensor + if (device_tensor.Shape()[0] == host_cache_tensor.Shape()[0]) { + device_start_index_ = min_host_index_; + } + return kSuccess; +} + +Status EmbeddingCache::MallocCacheMemory() { + auto hash_swap_value_size = embedding_size_ * batch_elements_ * sizeof_data_type_; + hash_swap_value_device_addr_ = device_cache_->MallocMemory(hash_swap_value_size); + if (hash_swap_value_device_addr_ == nullptr) { + MS_LOG(ERROR) << "malloc hash_swap_value_device failed, malloc size " << hash_swap_value_size; + return kLiteMemoryFailed; + } + + hash_swap_value_addr_ = 
malloc(hash_swap_value_size); + if (hash_swap_value_addr_ == nullptr) { + MS_LOG(ERROR) << "malloc hash_swap_value failed, malloc size " << hash_swap_value_size; + return kLiteMemoryFailed; + } + + // data type of index + hash_swap_index_addr_ = static_cast(device_cache_->MallocMemory(batch_elements_ * sizeof(int))); + if (hash_swap_index_addr_ == nullptr) { + MS_LOG(ERROR) << "malloc hash_swap_index failed, malloc size " << batch_elements_ * sizeof(int); + return kLiteMemoryFailed; + } + return kSuccess; +} + +Status EmbeddingCache::Init(uint32_t device_id, const void *context, mindspore::MSTensor host_cache_tensor, + mindspore::MSTensor device_tensor) { + auto ret = Init(host_cache_tensor, device_tensor); + if (ret != kSuccess) { + return ret; + } + cache_ = lite::FactoryManagerBase::Instance().GetProduct("lfu"); + if (cache_ == nullptr) { + MS_LOG(ERROR) << "malloc LFUCacheAlgorithm failed"; + return kLiteMemoryFailed; + } + ret = cache_->Init(device_cache_size_, min_host_index_, max_host_index_); + if (ret != kSuccess) { + MS_LOG(ERROR) << "init cache failed," << ret.CodeAsString; + return kLiteError; + } + + device_cache_ = lite::FactoryManagerBase::Instance().GetProduct("gpu"); + if (device_cache_ == nullptr) { + MS_LOG(ERROR) << "get cache failed"; + return kLiteMemoryFailed; + } + if (!device_cache_->InitDevice(device_id, context)) { + MS_LOG(ERROR) << "init device failed"; + return kLiteError; + } + ret = MallocCacheMemory(); + if (ret != kSuccess) { + return ret; + } + + MS_LOG(INFO) << "init succ, rank_group_size_ num:" << rank_group_size_ << ", rank id:" << rank_id_ + << ", vocab_size_:" << vocab_size_ << ", host_cache_size_:" << host_cache_size_ + << ", device_cache_size_:" << device_cache_size_ << ", embedding_size_:" << embedding_size_ + << ", batch_elements_:" << batch_elements_ << ", index begin:" << min_host_index_ + << ", index end:" << max_host_index_; + return kSuccess; +} + +Status EmbeddingCache::SetHostCacheAddr(void *addr, size_t size) { + if (sizeof_data_type_ * host_cache_size_ * embedding_size_ != size) { + return kLiteParamInvalid; + } + host_addr_ = addr; + + // copy part of host mem to device + auto ret = + device_cache_->CopyHostMemToDevice(device_addr_, addr, sizeof_data_type_ * device_cache_size_ * embedding_size_); + if (!ret) { + MS_LOG(ERROR) << "CopyHostMemToDevice failed, copy size " + << sizeof_data_type_ * device_cache_size_ * embedding_size_; + return kLiteMemoryFailed; + } + + // init cache + auto index_num = device_cache_size_; + for (size_t i = 0; i < index_num; i++) { + cache_->Put(min_host_index_ + i, i); + } + + return kSuccess; +} + +Status EmbeddingCache::SetDeviceCacheAddr(void *device_mem_addr, size_t size) { + if (sizeof_data_type_ * device_cache_size_ * embedding_size_ != size) { + return kLiteParamInvalid; + } + + device_addr_ = device_mem_addr; + SetHostCacheAddr(host_addr_, sizeof_data_type_ * host_cache_size_ * embedding_size_); + + return kSuccess; +} + +Status EmbeddingCache::CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index) { + std::vector need_swap_indies; + std::vector need_swap_indies_cache_index; + auto ret = + cache_->CheckCacheHit(batch_ids, batch_ids_len, cache_index, &need_swap_indies, &need_swap_indies_cache_index); + if (ret != kSuccess) { + MS_LOG(ERROR) << "CheckCacheHit failed"; + return ret; + } + auto swap_indices_size = need_swap_indies.size(); + if (swap_indices_size > 0) { + LookUpTableTask(swap_indices_size, host_cache_size_, static_cast(host_addr_), need_swap_indies.data(), + 
static_cast(hash_swap_value_addr_), embedding_size_ * sizeof_data_type_, min_host_index_); + + auto device_cache_ret = device_cache_->CopyHostMemToDevice(hash_swap_value_device_addr_, hash_swap_value_addr_, + swap_indices_size * embedding_size_ * sizeof_data_type_); + if (!device_cache_ret) { + MS_LOG(ERROR) << "copy swap value to device failed"; + return kLiteMemoryFailed; + } + + device_cache_ret = device_cache_->CopyHostMemToDevice(hash_swap_index_addr_, need_swap_indies_cache_index.data(), + swap_indices_size * sizeof(int)); + if (!device_cache_ret) { + MS_LOG(ERROR) << "copy swap indies to device failed"; + return kLiteMemoryFailed; + } + + device_cache_ret = device_cache_->HashSwapIn(device_addr_, hash_swap_value_device_addr_, hash_swap_index_addr_, + device_cache_size_, embedding_size_, swap_indices_size); + if (!device_cache_ret) { + MS_LOG(ERROR) << "HashSwapIn failed"; + return kLiteMemoryFailed; + } + } + + return kSuccess; +} +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.h b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.h new file mode 100644 index 00000000000..4dab859cd52 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.h @@ -0,0 +1,89 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
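Note for reviewers: the CheckCacheHit path above gathers the rows for cache misses on the host (LookUpTableTask), copies them to the staging buffer on the device, and only then calls HashSwapIn. The following is a minimal standalone sketch of that host-side gather step; the function and parameter names are illustrative (not from this patch) and sizes are in elements rather than the byte-based embedding_len used by LookUpTableTask.

#include <cstring>
#include <vector>

// Gather embedding rows for ids that missed the device cache into a staging
// buffer; ids outside [min_index, max_index) are zero-filled, mirroring the
// range check in LookUpTableTask.
void GatherRows(const std::vector<float> &host_table, size_t embedding_dim,
                const std::vector<int> &miss_ids, int min_index, int max_index,
                std::vector<float> *staging) {
  staging->resize(miss_ids.size() * embedding_dim);
  for (size_t i = 0; i < miss_ids.size(); ++i) {
    float *dst = staging->data() + i * embedding_dim;
    if (miss_ids[i] >= min_index && miss_ids[i] < max_index) {
      size_t local = static_cast<size_t>(miss_ids[i] - min_index);
      std::memcpy(dst, host_table.data() + local * embedding_dim, embedding_dim * sizeof(float));
    } else {
      std::memset(dst, 0, embedding_dim * sizeof(float));
    }
  }
}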
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_ +#include +#include +#include +#include "include/api/status.h" +#include "include/api/data_type.h" +#include "src/common/log_adapter.h" +#include "src/runtime/delegate/parameter_cache/cache_algorithm.h" +#include "src/runtime/delegate/parameter_cache/cache_mem_base.h" + +namespace mindspore { +namespace cache { +class EmbeddingCache { + public: + EmbeddingCache(size_t vocab_size, size_t device_cache_size, size_t batch_elements, int rank_id, int rank_group_size) + : vocab_size_(vocab_size), + device_cache_size_(device_cache_size), + batch_elements_(batch_elements), + rank_id_(rank_id), + rank_group_size_(rank_group_size) { + MS_ASSERT(rank_group_size_ != 0); + auto local_shard_size = static_cast(std::ceil(static_cast(vocab_size_) / rank_group_size_)); + min_host_index_ = local_shard_size * rank_id_; + max_host_index_ = std::min(min_host_index_ + local_shard_size, static_cast(vocab_size_)); + host_cache_size_ = max_host_index_ - min_host_index_; + + MS_LOG(INFO) << "rank_group_size_ num:" << rank_group_size_ << ", rank id:" << rank_id_ + << ", vocab_size_:" << vocab_size_ << ", host_cache_size_:" << host_cache_size_ + << ", index begin:" << min_host_index_ << ", index end:" << max_host_index_; + } + + ~EmbeddingCache(); + Status Init(uint32_t device_id, const void *context, mindspore::MSTensor host_cache_tensor, + mindspore::MSTensor device_tensor); + Status SetHostCacheAddr(void *addr, size_t size); + Status SetDeviceCacheAddr(void *host_mem_addr, size_t size); + Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *hash_index); + size_t GetDeviceStartIndex() { return device_start_index_; } + + private: + Status Init(mindspore::MSTensor host_cache_tensor, mindspore::MSTensor device_tensor); + Status MallocCacheMemory(); + + private: + std::shared_ptr device_cache_{nullptr}; + std::shared_ptr cache_{nullptr}; + + size_t vocab_size_{0}; // total size + size_t host_cache_size_{0}; // local host size + size_t device_cache_size_{0}; // local device cache size + size_t device_start_index_{0}; + size_t embedding_size_{0}; + size_t batch_elements_{0}; + + DataType data_type_{DataType::kNumberTypeFloat32}; + size_t sizeof_data_type_{0}; + + void *device_addr_{nullptr}; // hash_info.device_address.addr + void *host_addr_{nullptr}; + + int *hash_swap_index_addr_; // embedding_device_cache_->hash_swap_index_addr_ + void *hash_swap_value_addr_; + void *hash_swap_value_device_addr_; + + int rank_id_; + int rank_group_size_; + int min_host_index_{0}; + int max_host_index_{0}; +}; +} // namespace cache +} // namespace mindspore +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_ diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.cc new file mode 100644 index 00000000000..4d48521c917 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.cc @@ -0,0 +1,194 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "src/runtime/delegate/parameter_cache/embedding_cache_manager.h" +#include +#include +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" + +namespace { +constexpr size_t kGatherInputsSize = 3; +} +namespace mindspore { +namespace cache { +Status EmbeddingCacheManager::Init(const std::string &cache_model_path, size_t vocab_size, size_t device_cache_size) { + if (cache_model_path.empty() || vocab_size == 0 || device_cache_size >= vocab_size) { + MS_LOG(INFO) << "no cache model , vocab_size " << vocab_size << ", device_cache_size " << device_cache_size; + return kSuccess; + } + + host_cache_model_ = std::make_shared(); + if (host_cache_model_ == nullptr) { + MS_LOG(ERROR) << "HostCacheModel malloc failed"; + return kLiteMemoryFailed; + } + auto ret = host_cache_model_->LoadCache(cache_model_path); + if (ret != kSuccess) { + MS_LOG(ERROR) << "load cache failed"; + return ret; + } + vocab_size_ = vocab_size; + device_cache_size_ = device_cache_size; + + MS_LOG(INFO) << "cache manager init succ, cache model" << cache_model_path << " , vocab_size " << vocab_size + << ", device_cache_size " << device_cache_size; + return ret; +} + +Status EmbeddingCacheManager::Init(DelegateModel *model, size_t vocab_size, + size_t device_cache_size) { + if (model == nullptr || vocab_size == 0 || device_cache_size >= vocab_size) { + MS_LOG(INFO) << "no cache model , vocab_size " << vocab_size << ", device_cache_size " << device_cache_size; + return kSuccess; + } + + host_cache_model_ = std::make_shared(); + if (host_cache_model_ == nullptr) { + MS_LOG(ERROR) << "HostCacheModel malloc failed"; + return kLiteMemoryFailed; + } + auto ret = host_cache_model_->LoadCache(model); + if (ret != kSuccess) { + MS_LOG(ERROR) << "load cache failed"; + return ret; + } + vocab_size_ = vocab_size; + device_cache_size_ = device_cache_size; + + MS_LOG(INFO) << "cache manager init succ, vocab_size " << vocab_size << ", device_cache_size " << device_cache_size; + return ret; +} + +bool EmbeddingCacheManager::CheckIsCacheKernel(kernel::Kernel *kernel) { + if (host_cache_model_ == nullptr) { + return false; + } + return host_cache_model_->CheckIsCacheKernel(kernel); +} + +Status EmbeddingCacheManager::InitCacheKernel(kernel::Kernel *kernel, uint32_t device_id, const void *context) { + if (host_cache_model_ == nullptr) { + MS_LOG(ERROR) << "cache model is nullptr, kernel " << kernel->name() << " init cache failed"; + return kLiteError; + } + auto host_cache_tensor = host_cache_model_->GetHostCacheTensor(kernel); + if (host_cache_tensor == nullptr) { + MS_LOG(ERROR) << kernel->name() << ": invalid cache kernel"; + return kLiteError; + } + + // only support embedding cache + if (kernel->type() != schema::PrimitiveType_Gather) { + MS_LOG(ERROR) << kernel->name() << " is not embedding kernel"; + return kLiteError; + } + MS_ASSERT(kernel->inputs().size() == kGatherInputsSize); + auto device_tensor = kernel->inputs()[0]; + size_t batch_elements = kernel->inputs()[1].ElementNum(); + auto cache = + std::make_shared(vocab_size_, device_cache_size_, batch_elements, rank_id_, 
rank_group_size_); + if (cache == nullptr) { + MS_LOG(ERROR) << kernel->name() << ": malloc EmbeddingCache failed"; + return kLiteError; + } + + auto ret = cache->Init(device_id, context, host_cache_tensor, device_tensor); + if (ret != kSuccess) { + MS_LOG(ERROR) << kernel->name() << ": EmbeddingCache init failed"; + return kLiteError; + } + + caches_[device_tensor.Name()] = cache; + MS_LOG(INFO) << kernel->name() << " is cache kernel, input tensor " << kernel->inputs()[1].Name() << ", cache tensor " + << device_tensor.Name(); + + return kSuccess; +} + +bool EmbeddingCacheManager::IsCacheTensor(mindspore::MSTensor tensor) { + if (host_cache_model_ == nullptr) { + return false; + } + auto cache = caches_.find(tensor.Name()); + if (cache != caches_.end()) { + return true; + } + return false; +} + +std::vector EmbeddingCacheManager::GetCacheShape(mindspore::MSTensor tensor) { + std::vector shape = tensor.Shape(); + if (shape.size() > 0 && IsCacheTensor(tensor)) { + shape[0] = device_cache_size_; + } + return shape; +} + +size_t EmbeddingCacheManager::GetCacheDataSize(mindspore::MSTensor tensor) { + auto data_size = tensor.DataSize(); + auto &shape = tensor.Shape(); + if (shape.size() > 0 && IsCacheTensor(tensor) && shape[0] > 0) { + data_size = data_size * device_cache_size_ / shape[0]; + } + return data_size; +} + +Status EmbeddingCacheManager::SetDeviceCacheAddr(const std::string &tensor_name, void *device_mem_addr, size_t size) { + auto cache_iter = caches_.find(tensor_name); + if (cache_iter == caches_.end() || cache_iter->second == nullptr) { + MS_LOG(ERROR) << "not find cache, " << tensor_name; + return kLiteError; + } + auto cache = cache_iter->second; + return cache->SetDeviceCacheAddr(device_mem_addr, size); +} + +// device_addr is model input device addr +int EmbeddingCacheManager::CacheHandle(const std::string &tensor_name, mindspore::MSTensor model_input_tensor, + void *model_input_device_addr) { + auto cache_iter = caches_.find(tensor_name); + if (cache_iter == caches_.end()) { + MS_LOG(ERROR) << "not find cache, " << tensor_name; + return lite::RET_ERROR; + } + auto cache = cache_iter->second; + hash_indices_.resize(model_input_tensor.ElementNum()); + auto ret = cache->CheckCacheHit(static_cast(model_input_tensor.MutableData()), hash_indices_.size(), + hash_indices_.data()); + if (ret != kSuccess) { + MS_LOG(ERROR) << "CheckCacheHit failed, " << model_input_tensor.Name(); + return lite::RET_ERROR; + } + + for (size_t i = 0; i < hash_indices_.size(); i++) { + if (hash_indices_[i] != -1) { + hash_indices_[i] += cache->GetDeviceStartIndex(); + } + } + + auto cuda_ret = cudaMemcpy(model_input_device_addr, hash_indices_.data(), hash_indices_.size() * sizeof(int), + cudaMemcpyHostToDevice); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "copy mem failed, " << model_input_tensor.Name(); + return lite::RET_ERROR; + } + MS_LOG(INFO) << "cache handle succ, " << model_input_tensor.Name() << "," << tensor_name; + + return lite::RET_OK; +} +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.h b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.h new file mode 100644 index 00000000000..2c8e2b47a64 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.h @@ -0,0 +1,60 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_ +#include +#include +#include +#include +#include "include/api/kernel.h" +#include "include/api/status.h" +#include "include/api/data_type.h" +#include "src/runtime/delegate/parameter_cache/embedding_cache.h" +#include "src/runtime/delegate/parameter_cache/load_host_cache_model.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" + +namespace mindspore { +namespace cache { +class EmbeddingCacheManager { + public: + EmbeddingCacheManager() { + rank_id_ = lite::GetRankID(); + rank_group_size_ = lite::GetGPUGroupSize(); + } + Status Init(const std::string &cache_model_path, size_t vocab_size, size_t device_cache_size); + Status Init(DelegateModel *model, size_t vocab_size, size_t device_cache_size); + bool CheckIsCacheKernel(kernel::Kernel *kernel); + Status InitCacheKernel(kernel::Kernel *kernel, uint32_t device_id, const void *context); + bool IsCacheTensor(mindspore::MSTensor tensor); + int CacheHandle(const std::string &tensor_name, mindspore::MSTensor model_input_tensor, void *device_addr); + Status SetDeviceCacheAddr(const std::string &tensor_name, void *device_mem_addr, size_t size); + std::vector GetCacheShape(mindspore::MSTensor tensor); + size_t GetCacheDataSize(mindspore::MSTensor tensor); + + private: + std::map> caches_; + std::vector hash_indices_; + int rank_id_{0}; + int rank_group_size_{1}; + + std::shared_ptr host_cache_model_; + size_t vocab_size_; + size_t device_cache_size_; +}; +} // namespace cache +} // namespace mindspore +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_ diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/factory_mgr_base.h b/mindspore/lite/src/runtime/delegate/parameter_cache/factory_mgr_base.h new file mode 100644 index 00000000000..cb0049f5f1f --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/factory_mgr_base.h @@ -0,0 +1,81 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_ +#include +#include +#include "include/api/status.h" + +namespace mindspore { +namespace lite { +template +class ProcductRegistrar { + public: + virtual std::shared_ptr Create() = 0; + + protected: + ProcductRegistrar() {} + virtual ~ProcductRegistrar() {} + + private: + ProcductRegistrar(const ProcductRegistrar &); + const ProcductRegistrar &operator=(const ProcductRegistrar &); +}; + +template +class FactoryManagerBase { + public: + static FactoryManagerBase &Instance() { + static FactoryManagerBase instance; + return instance; + } + void RegProduct(const KEY &key, ProcductRegistrar *registrar) { registrars[key] = registrar; } + + std::shared_ptr GetProduct(const KEY &key) { + auto registrar_iter = registrars.find(key); + if (registrar_iter != registrars.end()) { + if (registrar_iter->second != nullptr) { + return registrar_iter->second->Create(); + } + } + return nullptr; + } + + private: + FactoryManagerBase() = default; + ~FactoryManagerBase() = default; + FactoryManagerBase(const FactoryManagerBase &); + const FactoryManagerBase &operator=(const FactoryManagerBase &); + + private: + std::map *> registrars; +}; + +template +class CommonProcductRegistrar : public ProcductRegistrar { + public: + explicit CommonProcductRegistrar(const KEY &key) { + FactoryManagerBase::Instance().RegProduct(key, this); + } + std::shared_ptr Create() { return std::make_shared(); } +}; + +#define RET_COMMON_PRODUCT_REGISTRAR(KEY, PRODUCT, PRODUCT_IMPL, key, name) \ + static mindspore::lite::CommonProcductRegistrar g_commonProcductRegistrar##name(key); +} // namespace lite +} // namespace mindspore +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_ diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.cc new file mode 100644 index 00000000000..c285b844e40 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.cc @@ -0,0 +1,158 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
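Note for reviewers: factory_mgr_base.h above implements a keyed self-registration factory. RET_COMMON_PRODUCT_REGISTRAR defines a static CommonProcductRegistrar that registers a creator under a string key, and FactoryManagerBase::GetProduct builds instances by key (this is how "lfu" and "gpu" are resolved elsewhere in the patch). Below is a compact standalone sketch of the same pattern; all names in it are illustrative and do not appear in the patch.

#include <iostream>
#include <map>
#include <memory>
#include <string>

// Base product and one concrete product used only for this illustration.
struct CacheBase {
  virtual ~CacheBase() = default;
  virtual const char *Name() const = 0;
};
struct LfuLikeCache : CacheBase {
  const char *Name() const override { return "lfu-like"; }
};

// Singleton factory keyed by Key, returning shared_ptr<Base>, analogous to
// FactoryManagerBase in the patch.
template <class Key, class Base>
class SimpleFactory {
 public:
  static SimpleFactory &Instance() {
    static SimpleFactory instance;
    return instance;
  }
  void Register(const Key &key, std::shared_ptr<Base> (*creator)()) { creators_[key] = creator; }
  std::shared_ptr<Base> Create(const Key &key) {
    auto it = creators_.find(key);
    return it == creators_.end() ? nullptr : it->second();
  }

 private:
  std::map<Key, std::shared_ptr<Base> (*)()> creators_;
};

// Static registration at namespace scope, the same idea as the
// RET_COMMON_PRODUCT_REGISTRAR macro expanding to a static registrar object.
static const bool g_registered = [] {
  SimpleFactory<std::string, CacheBase>::Instance().Register(
      "lfu-like", [] { return std::shared_ptr<CacheBase>(std::make_shared<LfuLikeCache>()); });
  return true;
}();

int main() {
  auto cache = SimpleFactory<std::string, CacheBase>::Instance().Create("lfu-like");
  std::cout << (cache ? cache->Name() : "not found") << std::endl;
  return 0;
}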
+ */ + +#include "src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h" +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/hash.cuh" +#include "plugin/device/gpu/hal/device/cuda_driver.h" +#include "src/common/log_adapter.h" +#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h" +namespace mindspore { +namespace cache { +namespace gpu { +RET_COMMON_PRODUCT_REGISTRAR(std::string, cache::CacheMemBase, cache::gpu::GPUCacheMem, "gpu", GPUCacheMem); +bool GPUCacheMem::InitDevice(uint32_t device_id, const void *context) { + auto cuda_ret = cudaSetDevice(static_cast(device_id)); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Failed to set device id " << device_id << ", cuda_ret " << cuda_ret << " " + << cudaGetErrorString(cuda_ret); + return false; + } + if (context != nullptr) { + stream_ = *(reinterpret_cast(context)); + return true; + } + + cuda_ret = cudaStreamCreate(&stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda create stream failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + return false; + } + + return true; +} + +void *GPUCacheMem::MallocMemory(size_t size) { + void *device_ptr = nullptr; + auto cuda_ret = cudaMalloc(&device_ptr, size); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda Malloc failed for size:" << size << ", cuda_ret " << cuda_ret << " " + << cudaGetErrorString(cuda_ret); + return nullptr; + } + MS_LOG(DEBUG) << "cudaMalloc size: " << size; + return device_ptr; +} + +void GPUCacheMem::FreeMemory(void *device_addr) { + auto cuda_ret = cudaFree(device_addr); + if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) { + MS_LOG(WARNING) << "free cuda memory failed, " + << ", cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + } +} + +bool GPUCacheMem::SynchronizeStream() { + auto cuda_ret = cudaStreamSynchronize(stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda sync stream failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + return false; + } + + return true; +} + +bool GPUCacheMem::CopyHostMemToDevice(void *dst, const void *src, size_t size) { + if (dst == nullptr) { + MS_LOG(ERROR) << "dst is nullptr"; + return false; + } + if (src == nullptr) { + MS_LOG(ERROR) << "src is nullptr"; + return false; + } + + auto cuda_ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda memcpy failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + return false; + } + + return true; +} + +bool GPUCacheMem::CopyDeviceMemToHost(void *dst, const void *src, size_t size) { + if (dst == nullptr) { + MS_LOG(ERROR) << "dst is nullptr"; + return false; + } + if (src == nullptr) { + MS_LOG(ERROR) << "src is nullptr"; + return false; + } + + auto cuda_ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda memcpy failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + return false; + } + + return true; +} + +bool GPUCacheMem::HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr, size_t, + size_t embedding_size, size_t swap_out_size) { + if (hash_table_addr == nullptr) { + MS_LOG(ERROR) << "hash_table_addr is nullptr"; + return false; + } + if (swap_out_value_addr == nullptr) { + MS_LOG(ERROR) << "swap_out_value_addr is nullptr"; + return false; + } + if (swap_out_index_addr == nullptr) { + MS_LOG(ERROR) << "swap_out_index_addr is 
nullptr"; + return false; + } + + DoHashSwapOut(reinterpret_cast(hash_table_addr), reinterpret_cast(swap_out_value_addr), + reinterpret_cast(swap_out_index_addr), swap_out_size, embedding_size, stream_); + return true; +} + +bool GPUCacheMem::HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr, size_t, + size_t embedding_size, size_t swap_in_size) { + if (hash_table_addr == nullptr) { + MS_LOG(ERROR) << "hash_table_addr is nullptr"; + return false; + } + if (swap_in_value_addr == nullptr) { + MS_LOG(ERROR) << "swap_in_value_addr is nullptr"; + return false; + } + if (swap_in_index_addr == nullptr) { + MS_LOG(ERROR) << "swap_in_index_addr is nullptr"; + return false; + } + + DoHashSwapIn(reinterpret_cast(hash_table_addr), reinterpret_cast(swap_in_value_addr), + reinterpret_cast(swap_in_index_addr), swap_in_size, embedding_size, stream_); + return true; +} +} // namespace gpu +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h b/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h new file mode 100644 index 00000000000..f6196d95711 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h @@ -0,0 +1,48 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_
+
+#include <cuda_runtime_api.h>
+#include <cstdint>
+#include "src/runtime/delegate/parameter_cache/cache_mem_base.h"
+
+namespace mindspore {
+namespace cache {
+namespace gpu {
+class GPUCacheMem : public cache::CacheMemBase {
+ public:
+  GPUCacheMem() = default;
+  ~GPUCacheMem() override = default;
+  bool InitDevice(uint32_t device_id, const void *context) override;
+  void *MallocMemory(size_t size) override;
+  void FreeMemory(void *buf) override;
+  bool SynchronizeStream() override;
+  bool CopyHostMemToDevice(void *dst, const void *src, size_t size) override;
+  bool CopyDeviceMemToHost(void *dst, const void *src, size_t size) override;
+  bool HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr,
+                   size_t cache_vocab_size, size_t embedding_size, size_t swap_out_size) override;
+  bool HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr,
+                  size_t cache_vocab_size, size_t embedding_size, size_t swap_in_size) override;
+
+ private:
+  cudaStream_t stream_;
+};
+}  // namespace gpu
+}  // namespace cache
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_
diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.cc
new file mode 100644
index 00000000000..bde17d6f54c
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.cc
@@ -0,0 +1,243 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include +#include +#include "src/common/log_adapter.h" +#include "src/runtime/delegate/parameter_cache/lfu_cache.h" +#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h" +namespace mindspore { +namespace cache { +RET_COMMON_PRODUCT_REGISTRAR(std::string, cache::CacheAlgorithm, cache::LFUCacheAlgorithm, "lfu", LFUCacheAlgorithm); + +LFUCacheAlgorithm::~LFUCacheAlgorithm() { + for (auto iter : key_table_) { + delete *(iter.second); + } + key_table_.clear(); + frequency_table_.clear(); +} + +Status LFUCacheAlgorithm::Init(size_t cache_size, int min_host_index, int max_host_index) { + if (cache_size <= 0 || min_host_index < 0 || max_host_index <= 0) { + return kLiteParamInvalid; + } + cache_size_ = cache_size; + min_host_index_ = min_host_index; + max_host_index_ = max_host_index; + return kSuccess; +} + +CacheNoe *LFUCacheAlgorithm::GetNode(int key) { + auto key_table_iter = key_table_.find(key); + if (key_table_iter == key_table_.end()) { + return nullptr; + } + auto node_iter = key_table_iter->second; + auto node = *node_iter; + + auto node_list_iter = frequency_table_.find(node->frequency); + if (node_list_iter == frequency_table_.end()) { + return nullptr; + } + auto &node_list = node_list_iter->second; + node_list.erase(node_iter); + + if (node_list.empty()) { + frequency_table_.erase(node_list_iter); + } + + node->frequency += 1; + frequency_table_[node->frequency].emplace_front(node); + key_table_[key] = frequency_table_[node->frequency].begin(); + return node; +} + +int LFUCacheAlgorithm::Get(int key) { + auto node = GetNode(key); + if (node != nullptr) { + return node->value; + } + return -1; +} + +void LFUCacheAlgorithm::Put(int key, int value) { + auto node = GetNode(key); + if (node != nullptr) { + node->value = value; + return; + } + + if (cache_size_ == 0) { + return; + } + + CacheNoe *add_node = nullptr; + if (key_table_.size() == cache_size_) { + add_node = frequency_table_.begin()->second.back(); + key_table_.erase(add_node->key); + frequency_table_.begin()->second.pop_back(); + if (frequency_table_.begin()->second.size() == 0) { + frequency_table_.erase(frequency_table_.begin()->first); + } + add_node->value = value; + add_node->key = key; + add_node->frequency = 1; + } else { + add_node = new CacheNoe(key, 1, value); + if (add_node == nullptr) { + return; + } + } + + frequency_table_[1].emplace_front(add_node); + key_table_[key] = frequency_table_[1].begin(); +} + +void LFUCacheAlgorithm::GetHitNodesAndSwapIndex(const int *batch_ids, const size_t batch_ids_len, int *cache_index, + std::unordered_map<int, CacheNoe *> *hit_index_nodes, + std::unordered_map<int, std::vector<int>> *need_swap_map) { + // split the batch into hit ids and ids that still need to be swapped in + for (size_t i = 0; i < batch_ids_len; i++) { + auto key = batch_ids[i]; + if (key < min_host_index_ || key >= max_host_index_) { + cache_index[i] = -1; + // out of range + continue; + } + + auto hit_iter = hit_index_nodes->find(key); + if (hit_iter != hit_index_nodes->end()) { + auto node = hit_iter->second; + node->frequency += 1; + cache_index[i] = node->value; + continue; + } + + auto swap_iter = need_swap_map->find(key); + if (swap_iter != need_swap_map->end()) { + swap_iter->second.push_back(i); + continue; + } + + auto node_iter_iter = key_table_.find(key); + if (node_iter_iter == key_table_.end()) { + (*need_swap_map)[key].push_back(i); + continue; + } + auto node_iter = node_iter_iter->second; + auto node = *node_iter; + + auto node_list_iter = frequency_table_.find(node->frequency); + if (node_list_iter == frequency_table_.end()) { + continue; + } + auto &node_list =
node_list_iter->second; + node_list.erase(node_iter); + + if (node_list.empty()) { + frequency_table_.erase(node_list_iter); + } + // hit + node->frequency += 1; + cache_index[i] = node->value; + (*hit_index_nodes)[key] = node; + } + return; +} + +std::list<CacheNoe *> LFUCacheAlgorithm::GetSwapNodes(const std::unordered_map<int, std::vector<int>> &need_swap_map) { + std::list<CacheNoe *> need_swap_nodes; + auto swap_size = need_swap_map.size(); + + while (swap_size > 0 && !frequency_table_.empty()) { + auto node_list_iter = frequency_table_.begin(); + if (node_list_iter->second.size() > swap_size) { + auto iter = node_list_iter->second.begin(); + std::advance(iter, swap_size); + need_swap_nodes.splice(need_swap_nodes.end(), node_list_iter->second, node_list_iter->second.begin(), iter); + swap_size = 0; + } else { + swap_size -= node_list_iter->second.size(); + need_swap_nodes.splice(need_swap_nodes.end(), node_list_iter->second); + frequency_table_.erase(node_list_iter); + } + } + return need_swap_nodes; +} + +Status LFUCacheAlgorithm::CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index, + std::vector<int> *need_swap_indies, + std::vector<int> *need_swap_indies_cache_index) { + if (batch_ids == nullptr) { + MS_LOG(ERROR) << "batch_ids is nullptr"; + return kLiteNullptr; + } + if (cache_index == nullptr) { + MS_LOG(ERROR) << "cache_index is nullptr"; + return kLiteNullptr; + } + std::unordered_map<int, std::vector<int>> need_swap_map; + std::unordered_map<int, CacheNoe *> hit_index_nodes; + GetHitNodesAndSwapIndex(batch_ids, batch_ids_len, cache_index, &hit_index_nodes, &need_swap_map); + + // take the need_swap_map.size() least frequently used nodes as eviction candidates + std::list<CacheNoe *> need_swap_nodes = GetSwapNodes(need_swap_map); + + // rewrite the evicted nodes with the newly swapped-in host keys + { + if (need_swap_map.size() != need_swap_nodes.size()) { + MS_LOG(ERROR) << " need_swap_map.size() " << need_swap_map.size() << " != need_swap_nodes.size() " + << need_swap_nodes.size(); + return kLiteError; + } + need_swap_indies_cache_index->reserve(need_swap_map.size()); + auto need_swap_map_iter = need_swap_map.begin(); + for (auto iter = need_swap_nodes.begin(); + iter != need_swap_nodes.end() && need_swap_map_iter != need_swap_map.end(); iter++, need_swap_map_iter++) { + auto node = *iter; + key_table_.erase(node->key); + node->key = need_swap_map_iter->first; + node->frequency = 1; + for (auto index : need_swap_map_iter->second) { + cache_index[index] = node->value; + } + need_swap_indies->push_back(need_swap_map_iter->first); + need_swap_indies_cache_index->push_back(node->value); + MS_LOG(INFO) << "device index " << node->value << ", for host index " << need_swap_map_iter->first; + key_table_[(*iter)->key] = iter; + } + + auto node_list_iter = frequency_table_.begin(); + if (node_list_iter->second.size() > 0) { + auto iter = node_list_iter->second.begin(); + if ((*iter)->frequency == 1) { + node_list_iter->second.splice(node_list_iter->second.begin(), need_swap_nodes); + } else { + frequency_table_[1] = need_swap_nodes; + } + } else { + frequency_table_[1] = need_swap_nodes; + } + } + for (auto node_iter : hit_index_nodes) { + auto node = node_iter.second; + frequency_table_[node->frequency].emplace_front(node); + key_table_[node->key] = frequency_table_[node->frequency].begin(); + } + return kSuccess; +} +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.h b/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.h new file mode 100644 index 00000000000..3704a98415c --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.h @@
-0,0 +1,55 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_ + +#include +#include +#include +#include +#include "include/api/status.h" +#include "src/runtime/delegate/parameter_cache/cache_algorithm.h" +namespace mindspore { +namespace cache { +class LFUCacheAlgorithm : public CacheAlgorithm { + public: + LFUCacheAlgorithm() {} + ~LFUCacheAlgorithm() override; + + int Get(int key) override; + void Put(int key, int value) override; + Status Init(size_t cache_size, int min_host_index, int max_host_index) override; + Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index, + std::vector *need_swap_indies, std::vector *need_swap_indies_cache_index) override; + + private: + CacheNoe *GetNode(int key); + void GetHitNodesAndSwapIndex(const int *batch_ids, const size_t batch_ids_len, int *cache_index, + std::unordered_map *hit_index_nodes, + std::unordered_map> *need_swap_map); + std::list GetSwapNodes(const std::unordered_map> &need_swap_map); + + std::unordered_map::iterator> key_table_; + std::map> frequency_table_; + size_t cache_size_{0}; + + int min_host_index_{0}; + int max_host_index_{1}; +}; +} // namespace cache +} // namespace mindspore +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_ diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.cc new file mode 100644 index 00000000000..839d8e60e28 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.cc @@ -0,0 +1,148 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
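A hypothetical driver showing the intended call order of the LFU interface declared above: Init() sizes the device-side cache, and CheckCacheHit() maps a batch of host ids to device cache slots while reporting which host rows still have to be swapped in. The function name and the numeric arguments below are assumptions for illustration.

#include <vector>
#include "src/runtime/delegate/parameter_cache/lfu_cache.h"

mindspore::Status LookupSketch(const std::vector<int> &batch_ids) {
  mindspore::cache::LFUCacheAlgorithm lfu;
  // cache_size, min_host_index and max_host_index are illustrative values
  auto ret = lfu.Init(1024, 0, 100000);
  if (ret != mindspore::kSuccess) {
    return ret;
  }
  std::vector<int> cache_index(batch_ids.size(), -1);  // device slot per id, -1 for out-of-range ids
  std::vector<int> swap_in_host_ids;                   // host rows that missed the cache
  std::vector<int> swap_in_cache_slots;                // device slots those rows will occupy
  return lfu.CheckCacheHit(batch_ids.data(), batch_ids.size(), cache_index.data(),
                           &swap_in_host_ids, &swap_in_cache_slots);
}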
+ */ + +#include +#include +#include +#include +#include "src/runtime/delegate/parameter_cache/load_host_cache_model.h" +#include "src/common/log_adapter.h" +#include "src/common/common.h" +#include "include/errorcode.h" +#include "src/common/file_utils.h" + +namespace { +constexpr size_t kGatherInputsSize = 3; +} +namespace mindspore { +namespace cache { +HostCacheModel::~HostCacheModel() { + if (cache_model_ != nullptr) { + delete cache_model_; + cache_model_ = nullptr; + } +} +MSTensor *SchemaTensorToMSTensor(lite::SchemaTensorWrapper *schema_tensor_wrapper, + mindspore::schema::Tensor *schema_tensor) { + std::vector shape; + for (size_t j = 0; j < schema_tensor->dims()->size(); j++) { + shape.push_back(schema_tensor->dims()->data()[j]); + } + std::string tensor_name; + if (schema_tensor->name() != nullptr) { + tensor_name = schema_tensor->name()->str(); + } + return MSTensor::CreateRefTensor(tensor_name, (DataType)schema_tensor->dataType(), shape, + schema_tensor_wrapper->data(), schema_tensor_wrapper->length()); +} + +Status HostCacheModel::LoadCache(const std::string &model_path) { + cache_model_ = lite::LiteImportFromPath(model_path.c_str()); + if (cache_model_ == nullptr) { + MS_LOG(ERROR) << "Import model failed"; + return kLiteGraphFileError; + } + + auto allTensors = cache_model_->graph_.all_tensors_; + for (auto node : cache_model_->graph_.all_nodes_) { + // only support embedding cache + if (node == nullptr || node->node_type_ != schema::PrimitiveType_Gather) { + continue; + } + + auto input_index = node->input_indices_[0]; + if (input_index > allTensors.size() - 1) { + MS_LOG(ERROR) << "invalid kernel input, input_index " << input_index << ",allTensors.size() " + << allTensors.size(); + return kLiteOutOfTensorRange; + } + auto schema_tensor_wrapper = cache_model_->GetSchemaTensor(input_index); + if (schema_tensor_wrapper == nullptr) { + MS_LOG(ERROR) << "invalid kernel input, input_index " << input_index; + return kLiteOutOfTensorRange; + } + + auto schema_tensor = allTensors[input_index]; + if (schema_tensor != nullptr && schema_tensor_wrapper->data() != nullptr) { + auto tensor = SchemaTensorToMSTensor(schema_tensor_wrapper, schema_tensor); + if (tensor == nullptr) { + return kLiteMemoryFailed; + } + cache_tensor_[tensor->Name()] = *tensor; + MS_LOG(INFO) << tensor->Name() << " is cache tensor, and the node is [" << node->name_ << "]"; + delete tensor; + } + } + return kSuccess; +} + +size_t GetVocabSize(kernel::Kernel *kernel) { + size_t vocab_size = 0; + auto cache_config = kernel->GetConfig(lite::kMSCache); + auto vocab_size_iter = cache_config.find(lite::kMSCacheVocabSize); + if (vocab_size_iter == cache_config.end()) { + return vocab_size; + } + + auto vocab_size_opt = lite::GenericParseValue(vocab_size_iter->second); + if (!vocab_size_opt.IsNone()) { + vocab_size = vocab_size_opt.Get(); + } + return vocab_size; +} + +Status HostCacheModel::LoadCache(DelegateModel *model) { + KernelIter from, end; + for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) { + kernel::Kernel *kernel = *iter; + // only support embedding cache + if (kernel->type() != schema::PrimitiveType_Gather) { + continue; + } + MS_ASSERT(kernel->inputs().size() == kGatherInputsSize); + auto tensor = kernel->inputs()[0]; + if (tensor.Data() == nullptr) { + continue; + } + + size_t vocab_size = GetVocabSize(kernel); + if (vocab_size == 0) { + continue; + } + + cache_tensor_[tensor.Name()] = tensor; + } + return mindspore::kSuccess; +} + +bool 
HostCacheModel::CheckIsCacheKernel(kernel::Kernel *kernel) { + if (GetHostCacheTensor(kernel) == nullptr) { + return false; + } + return true; +} + +MSTensor HostCacheModel::GetHostCacheTensor(kernel::Kernel *kernel) { + if (kernel != nullptr && kernel->inputs().size() > 0) { + auto iter = cache_tensor_.find(kernel->inputs()[0].Name()); + if (iter != cache_tensor_.end()) { + return iter->second; + } + } + return MSTensor(nullptr); +} +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.h b/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.h new file mode 100644 index 00000000000..52b22eea0d0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.h @@ -0,0 +1,48 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LOAD_HOST_CACHE_MODEL_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LOAD_HOST_CACHE_MODEL_H_ + +#include +#include +#include "include/api/status.h" +#include "include/api/data_type.h" +#include "include/api/types.h" +#include "include/api/kernel.h" +#include "include/api/delegate.h" +#include "src/runtime/lite_model.h" + +namespace mindspore { +namespace cache { +class HostCacheModel { + public: + HostCacheModel() = default; + ~HostCacheModel(); + Status LoadCache(const std::string &model_path); + Status LoadCache(DelegateModel *model); + bool CheckIsCacheKernel(kernel::Kernel *kernel); + MSTensor GetHostCacheTensor(kernel::Kernel *kernel); + + private: + std::map cache_tensor_; + mindspore::lite::LiteModel *cache_model_{nullptr}; + char *model_buf_{nullptr}; + size_t model_size_; +}; +} // namespace cache +} // namespace mindspore +#endif // MINDSPORE_LITE_EMBEDDING_CACHE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/CMakeLists.txt b/mindspore/lite/src/runtime/delegate/tensorrt/CMakeLists.txt new file mode 100644 index 00000000000..3e5d613b9da --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/CMakeLists.txt @@ -0,0 +1,95 @@ +include_directories(${TENSORRT_PATH}/include) +include_directories(${CUDA_PATH}/include) +include_directories(${CUDA_PATH}) +include_directories($(CCSRC_DIR)/plugin/device/cpu/kernel) +include_directories(${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops) + +if(DEFINED ENV{MS_ENABLE_CUDA_DISTRIBUTION}) + set(MS_ENABLE_CUDA_DISTRIBUTION $ENV{MS_ENABLE_CUDA_DISTRIBUTION}) +else() + set(MS_ENABLE_CUDA_DISTRIBUTION "off") +endif() + +set(NCCL_MPI_SRC_STUB + ${CMAKE_CURRENT_SOURCE_DIR}/distribution/distribution_collective.cc + ${CMAKE_CURRENT_SOURCE_DIR}/distribution/distribution_base.cc +) + +# nccl mpi +if(MS_ENABLE_CUDA_DISTRIBUTION STREQUAL "on") + message("enable cuda gpu distribution collective") + file(GLOB NCCL_MPI_SRC LIST_DIRECTORIES false + ${CMAKE_CURRENT_SOURCE_DIR}/distribution/*.cc + 
${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/collective_wrapper.cc + ${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/mpi_wrapper.cc + ${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/nccl_wrapper.cc + ) + list(REMOVE_ITEM NCCL_MPI_SRC ${NCCL_MPI_SRC_STUB}) + + add_compile_definitions(LITE_CUDA_DISTRIBUTION) + include(${TOP_DIR}/cmake/external_libs/ompi.cmake) + include(${TOP_DIR}/cmake/external_libs/nccl.cmake) + + add_library(gpu_distribution_collective OBJECT ${NCCL_MPI_SRC}) + add_library(mindspore::nccl ALIAS nccl::nccl) + add_library(mindspore::ompi ALIAS ompi::mpi) + target_link_libraries(gpu_distribution_collective PRIVATE mindspore::ompi mindspore::nccl) +else() + add_library(gpu_distribution_collective OBJECT ${NCCL_MPI_SRC_STUB}) +endif() +add_dependencies(gpu_distribution_collective fbs_src) + +file(GLOB TENSORRT_RUNTIME_SRC LIST_DIRECTORIES false + ${CMAKE_CURRENT_SOURCE_DIR}/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/op/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_impl/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime/delegate/delegate_utils.cc + ${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.cc +) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache) + +set(TENSORRT_RUNTIME_SRC + ${TENSORRT_RUNTIME_SRC} + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/embedding_cache_manager.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/load_host_cache_model.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/lfu_cache.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/embedding_cache.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/gpu/gpu_cache_mem.cc + ) + +link_libraries(${CUDA_LIB_PATH}/libcudnn.so) +link_libraries(${CUDA_LIB_PATH}/libnvrtc.so) +link_libraries(${CUDA_LIB_PATH}/libcublasLt.so) + +add_library(libcudart SHARED IMPORTED) +set_target_properties(libcudart PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcudart.so) + +add_library(libnvinfer SHARED IMPORTED) +set_target_properties(libnvinfer PROPERTIES IMPORTED_LOCATION ${TENSORRT_LIB_PATH}/libnvinfer.so) + +add_library(libcublas SHARED IMPORTED) +set_target_properties(libcublas PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcublas.so) +add_library(tensorrt_kernel_mid OBJECT ${TENSORRT_RUNTIME_SRC}) + +add_dependencies(tensorrt_kernel_mid fbs_src) + +target_link_libraries( + tensorrt_kernel_mid + libcudart + libcublas + libnvinfer +) + +# cuda +find_package(CUDA) +file(GLOB_RECURSE CUDA_KERNEL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_impl/*.cu + ${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu + ${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/swish_impl.cu +) + +set_source_files_properties(${CUDA_KERNEL_SRC} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGES} -std=c++14 -fPIC") +SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++14;) +cuda_add_library(cuda_kernel_mid STATIC ${CUDA_KERNEL_SRC}) diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cu new file mode 100644 index 00000000000..ce412e6fcb4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cu @@ -0,0 +1,56 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/activation.cuh" +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +template +__global__ void SigmoidKernel(const T *input1, T *output, int element_cnt) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) { + output[pos] = static_cast(1) / (static_cast(1) + exp(-input1[pos])); + } +} + +template +__global__ void GeluKernel(const T *input_addr, T *output_addr, int size) { + // formula: + // gelu(x) = 0.5 * x * (1.0 + tanh(y)) + // tanh(y) = 2 / (1 + exp(-2y)) - 1) + // y = sqrt(2/pi) * (x + 0.044715 * x^3) + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + float x = input_addr[pos]; + float tanh_res = tanh(0.7978845608f * (x + 0.044715f * x * x * x)); + output_addr[pos] = 0.5f * x * (1.0f + tanh_res); + } +} + +template +void Sigmoid(const T *input1, T *output, int element_cnt, cudaStream_t stream) { + SigmoidKernel<<>>(input1, output, element_cnt); + return; +} + +template +void Gelu(const T *input1, T *output, int element_cnt, cudaStream_t stream) { + GeluKernel<<>>(input1, output, element_cnt); + return; +} + +template void Sigmoid(const float *input1, float *output, int element_cnt, cudaStream_t stream); + +template void Gelu(const float *input1, float *output, int element_cnt, cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cuh new file mode 100644 index 00000000000..81d187674bd --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
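A scalar CPU reference of the tanh-based GELU approximation used by GeluKernel above, handy for spot-checking kernel output; the helper name is illustrative and not part of the patch.

#include <cmath>

// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)));
// 0.7978845608f is sqrt(2/pi), matching the constant in GeluKernel.
float GeluReference(float x) {
  const float kSqrt2OverPi = 0.7978845608f;
  float y = kSqrt2OverPi * (x + 0.044715f * x * x * x);
  return 0.5f * x * (1.0f + std::tanh(y));
}
// Example: GeluReference(1.0f) is roughly 0.841, GeluReference(-1.0f) roughly -0.159.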
+ */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_ACTIVATION_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_ACTIVATION_H_ + +template +void Sigmoid(const T *input1, T *output, int element_cnt, cudaStream_t stream); + +template +void Gelu(const T *input1, T *output, int element_cnt, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_ACTIVATION_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cu new file mode 100644 index 00000000000..a1e90b16d48 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cu @@ -0,0 +1,49 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh" +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +// Generic cast +template +__device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) { + *output_addr = static_cast((*input_addr)); +} + +template +__global__ void CastKernel(const int input_size, const S *input_addr, T *output_addr) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < input_size; pos += blockDim.x * gridDim.x) { + CastBase(input_addr + pos, output_addr + pos); + } +} + +template +void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream) { + CastKernel<<>>(input_size, input_addr, output_addr); +} + +template void Cast(const int input_size, const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int8_t *input_addr, int32_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int8_t *input_addr, float *output_addr, cudaStream_t stream); + +template void Cast(const int input_size, const int32_t *input_addr, int8_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int32_t *input_addr, float *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int32_t *input_addr, bool *output_addr, cudaStream_t stream); + +template void Cast(const int input_size, const float *input_addr, int8_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const float *input_addr, int32_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const float *input_addr, float *output_addr, cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cuh new file mode 100644 index 00000000000..59d7ab82793 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cuh @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_CAST_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_CAST_H_ + +template +void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_CAST_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.cc b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.cc new file mode 100644 index 00000000000..cd50b470ef4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.cc @@ -0,0 +1,70 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h" + +namespace mindspore::lite { +void Cublas2DTranspose(const float *in_addr, float *out_addr, const int *params, cublasHandle_t cublas_handle) { + const int m = params[0]; + const int n = params[1]; + const float alpha = 1.0f; + const float beta = 0.0f; + CUBLAS_CHECK_VOID( + cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, &alpha, in_addr, n, &beta, out_addr, m, out_addr, m)); +} + +void CublasMM1Batch(const void *a_addr, const void *b_addr, void *c_addr, const int *params, + const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle) { + const int m = params[0]; + const int n = params[1]; + const int k = params[2]; + cublasOperation_t trans_a = operations[0]; + cublasOperation_t trans_b = operations[1]; + const int lda = (trans_a == CUBLAS_OP_N) ? k : m; + const int ldb = (trans_b == CUBLAS_OP_N) ? 
n : k; + const int ldc = n; + cudaDataType type_a = data_types[0]; + cudaDataType type_b = data_types[1]; + cudaDataType type_c = data_types[2]; + cudaDataType compute_type = data_types[3]; + const float alpha = 1.0f; + const float beta = 0.0f; + CUBLAS_CHECK_VOID(cublasGemmEx(cublas_handle, trans_b, trans_a, n, m, k, &alpha, b_addr, type_b, ldb, a_addr, type_a, + lda, &beta, c_addr, type_c, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} +void CublasMMBatched(void **a_addrs, void **b_addrs, void **c_addrs, const int *params, + const cublasOperation_t *operations, const cudaDataType *data_types, + cublasHandle_t cublas_handle) { + cublasOperation_t trans_a = operations[0]; + cublasOperation_t trans_b = operations[1]; + const int m = params[0]; + const int n = params[1]; + const int k = params[2]; + const int batch = params[3]; + const int lda = (trans_a == CUBLAS_OP_N) ? k : m; + const int ldb = (trans_b == CUBLAS_OP_N) ? n : k; + const int ldc = n; + cudaDataType type_a = data_types[0]; + cudaDataType type_b = data_types[1]; + cudaDataType type_c = data_types[2]; + cudaDataType compute_type = data_types[3]; + const float alpha = 1.0f; + const float beta = 0.0f; + CUBLAS_CHECK_VOID(cublasGemmBatchedEx(cublas_handle, trans_b, trans_a, n, m, k, &alpha, b_addrs, type_b, ldb, a_addrs, + type_a, lda, &beta, c_addrs, type_c, ldc, batch, compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h new file mode 100644 index 00000000000..4a7f4eb0576 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h @@ -0,0 +1,62 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_ + +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "src/common/log_util.h" + +// cublas API error checking +#define CUBLAS_CHECK_VOID(err) \ + do { \ + cublasStatus_t cublas_err = (err); \ + if (cublas_err != CUBLAS_STATUS_SUCCESS) { \ + MS_LOG(ERROR) << "cublas error " << cublas_err; \ + return; \ + } \ + } while (0) + +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t cublas_err = (err); \ + if (cublas_err != CUBLAS_STATUS_SUCCESS) { \ + MS_LOG(ERROR) << "cublas error " << cublas_err; \ + return -1; \ + } \ + } while (0) + +namespace mindspore::lite { +// a: m * n +// params order: m, n +void Cublas2DTranspose(const float *in_addr, float *out_addr, const int *params, cublasHandle_t cublas_handle); + +// a: m * k, b: k * n, c: m * n +// params order: m, n, k +// operations order: trans_a, trans_b +// data_types: type_a, type_b, type_c, compute type +void CublasMM1Batch(const void *a_addr, const void *b_addr, void *c_addr, const int *params, + const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle); + +// a: batch * m * k, b: batch * k * n, c: batch * m * n +// params order: m, n, k, batch +// operations order: trans_a, trans_b +// data_types: type_a, type_b, type_c, compute type +void CublasMMBatched(void **a_addrs, void **b_addrs, void **c_addrs, const int *params, + const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.cc b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.cc new file mode 100644 index 00000000000..54f5738aeb8 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.cc @@ -0,0 +1,48 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
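An illustrative call of CublasMM1Batch declared above for a row-major float GEMM C = A * B with A of shape m x k and B of shape k x n; the wrapper hands the operands to cuBLAS in swapped order so the column-major GEMM yields a row-major C without an explicit transpose. The function name and the handle setup are assumptions.

#include <cublas_v2.h>
#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h"

void MatMulSketch(const float *a, const float *b, float *c, int m, int n, int k, cublasHandle_t handle) {
  const int params[] = {m, n, k};                              // m, n, k
  const cublasOperation_t ops[] = {CUBLAS_OP_N, CUBLAS_OP_N};  // trans_a, trans_b
  const cudaDataType data_types[] = {CUDA_R_32F, CUDA_R_32F,   // type_a, type_b
                                     CUDA_R_32F, CUDA_R_32F};  // type_c, compute type
  mindspore::lite::CublasMM1Batch(a, b, c, params, ops, data_types, handle);
}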
+ */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include +#include "src/common/log_util.h" + +CudaHelper &CudaHelper::GetInstance() { + static CudaHelper instance; + return instance; +} +int CudaHelper::GetThreadNum() const { return threads_per_block_; } +int CudaHelper::GetThreadNum(const int block_size) const { + return std::min(threads_per_block_, ((block_size - 1) / 32 + 1) * 32); +} +int CudaHelper::GetBlocksNum(const int total_threads) const { + return std::min(((total_threads - 1) / threads_per_block_) + 1, max_blocks_); +} +int CudaHelper::GetBlocksNum(const int total_threads, const int block_size) const { + int valid_block_size = std::min(block_size, threads_per_block_); + if (valid_block_size == 0) { + MS_LOG(ERROR) << "invalid input of block_size: " << block_size; + return 0; + } + return std::min(((total_threads - 1) / valid_block_size) + 1, max_blocks_); +} + +CudaHelper::CudaHelper() { + int device_id = 0; + (void)cudaGetDevice(&device_id); + cudaDeviceProp prop; + (void)cudaGetDeviceProperties(&prop, device_id); + threads_per_block_ = prop.maxThreadsPerBlock; + max_blocks_ = prop.multiProcessorCount; +} diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h new file mode 100644 index 00000000000..dc7cc93afa7 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h @@ -0,0 +1,63 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_ + +#include +#include + +class CudaHelper { + public: + int GetThreadNum() const; + int GetThreadNum(const int block_size) const; + int GetBlocksNum(const int total_threads) const; + int GetBlocksNum(const int total_threads, const int block_size) const; + static CudaHelper &GetInstance(); + + private: + CudaHelper(); + ~CudaHelper() = default; + CudaHelper(const CudaHelper &) = delete; + CudaHelper &operator=(const CudaHelper &) = delete; + + int max_blocks_; + int threads_per_block_; +}; + +#define GET_BLOCKS(total_threads) CudaHelper::GetInstance().GetBlocksNum(total_threads) +#define GET_BLOCKS_CAL(total_threads, block_size) CudaHelper::GetInstance().GetBlocksNum(total_threads, block_size) + +#define GET_THREADS CudaHelper::GetInstance().GetThreadNum() +#define GET_THREADS_CAL(block_size) CudaHelper::GetInstance().GetThreadNum(block_size) + +#define CUDA_CHECK(ret) \ + do { \ + cudaError_t cuda_ret = (ret); \ + if ((cuda_ret) != cudaSuccess) { \ + return -1; \ + } \ + } while (0) + +#define CUDA_CHECK_VOID(ret) \ + do { \ + cudaError_t cuda_ret = (ret); \ + if ((cuda_ret) != cudaSuccess) { \ + return; \ + } \ + } while (0) + +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.cc b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.cc new file mode 100644 index 00000000000..1590560f697 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.cc @@ -0,0 +1,41 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h" +#include + +namespace mindspore::lite { +cudnnDataType_t ConvertCudnnDataType(nvinfer1::DataType trt_datatype) { + std::unordered_map data_types = {{nvinfer1::DataType::kFLOAT, CUDNN_DATA_FLOAT}, + {nvinfer1::DataType::kHALF, CUDNN_DATA_HALF}, + {nvinfer1::DataType::kINT32, CUDNN_DATA_INT32}, + {nvinfer1::DataType::kINT8, CUDNN_DATA_INT8}}; + if (data_types.find(trt_datatype) != data_types.end()) { + return data_types[trt_datatype]; + } else { + MS_LOG(ERROR) << "invalid datatype for cudnn: " << static_cast(trt_datatype); + } + return CUDNN_DATA_FLOAT; +} + +int CudnnActivation(cudnnHandle_t handle, cudnnActivationDescriptor_t activation_desc, + const cudnnTensorDescriptor_t x_dsc, const void *x, const cudnnTensorDescriptor_t y_dsc, void *y) { + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_CHECK(cudnnActivationForward(handle, activation_desc, &alpha, x_dsc, x, &beta, y_dsc, y)); + return 0; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h new file mode 100644 index 00000000000..d3202e05e00 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h @@ -0,0 +1,48 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_ + +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "src/common/log_util.h" + +#define CUDNN_CHECK_VOID(err) \ + do { \ + cudnnStatus_t cudnn_err = (err); \ + if (cudnn_err != CUDNN_STATUS_SUCCESS) { \ + MS_LOG(ERROR) << "cudnn error " << cudnnGetErrorString(cudnn_err); \ + return; \ + } \ + } while (0) + +#define CUDNN_CHECK(err) \ + do { \ + cudnnStatus_t cudnn_err = (err); \ + if (cudnn_err != CUDNN_STATUS_SUCCESS) { \ + MS_LOG(ERROR) << "cudnn error " << cudnnGetErrorString(cudnn_err); \ + return -1; \ + } \ + } while (0) +namespace mindspore::lite { +cudnnDataType_t ConvertCudnnDataType(nvinfer1::DataType trt_datatype); + +int CudnnActivation(cudnnHandle_t handle, cudnnActivationDescriptor_t activation_desc, + const cudnnTensorDescriptor_t x_esc, const void *x, const cudnnTensorDescriptor_t y_dsc, void *y); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cu new file mode 100644 index 00000000000..7d4840e9fea --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cu @@ -0,0 +1,35 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/equal.cuh" +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +template +__global__ void EqualKernel(const T *input1, const T *input2, T *output, int element_cnt) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) { + output[pos] = (input1[pos] - input2[pos] < 1e-6 && input1[pos] - input2[pos] > -1e-6); + } +} + +template +void Equal(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) { + EqualKernel<<>>(input1, input2, output, element_cnt); + return; +} + +template void Equal(const float *input1, const float *input2, float *output, int element_cnt, cudaStream_t stream); +template void Equal(const int *input1, const int *input2, int *output, int element_cnt, cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cuh new file mode 100644 index 00000000000..69551308a97 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cuh @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_EQUAL_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_EQUAL_H_ + +template +void Equal(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_EQUAL_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cu new file mode 100755 index 00000000000..27c626bc5fe --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cu @@ -0,0 +1,64 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/hash.cuh" +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +template +__global__ void HashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size, + const int hash_dim) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < index_size; i += blockDim.x * gridDim.x) { + int hash_index = swap_out_index[i]; + for (int j = 0; j < hash_dim; j++) { + swap_out_value[i * hash_dim + j] = hash_table[hash_index * hash_dim + j]; + } + } + return; +} + +template +__global__ void HashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size, + const int hash_dim) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < index_size; i += blockDim.x * gridDim.x) { + int hash_index = swap_in_index[i]; + for (int j = 0; j < hash_dim; j++) { + hash_table[hash_index * hash_dim + j] = swap_in_value[i * hash_dim + j]; + } + } + return; +} + +template +void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size, + const int hash_dim, cudaStream_t cuda_stream) { + HashSwapOut<<>>(hash_table, swap_out_value, swap_out_index, + index_size, hash_dim); + return; +} + +template +void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size, + const int hash_dim, cudaStream_t cuda_stream) { + HashSwapIn<<>>(hash_table, swap_in_value, swap_in_index, + index_size, hash_dim); + return; +} + +template void DoHashSwapOut(const float *hash_table, float *swap_out_value, const int *swap_out_index, + const int index_size, const int hash_dim, cudaStream_t cuda_stream); + +template void DoHashSwapIn(float *hash_table, const float *swap_in_value, const int *swap_in_index, + const int index_size, const int hash_dim, cudaStream_t cuda_stream); diff --git 
a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cuh new file mode 100755 index 00000000000..779abba36b1 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_HASH_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_HASH_H_ + +template +void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size, + const int hash_dim, cudaStream_t cuda_stream); + +template +void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size, + const int hash_dim, cudaStream_t cuda_stream); +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_HASH_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cu new file mode 100644 index 00000000000..7c28811db26 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cu @@ -0,0 +1,63 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh" +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +template +__global__ void LogicalNotKernel(const T *input1, T *output, int element_cnt) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) { + output[pos] = static_cast(input1[pos] == 0); + } +} + +template +__global__ void LogicalAndKernel(const T *input_addr1, const T *input_addr2, T *output, int size) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + output[pos] = input_addr1[pos] * input_addr2[pos]; + } +} + +template +__global__ void LogicalOrKernel(const T *input_addr1, const T *input_addr2, T *output, int size) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + T sum = input_addr1[pos] + input_addr2[pos]; + output[pos] = static_cast(sum > 0); + } +} + +template +void LogicalNot(const T *input1, T *output, int element_cnt, cudaStream_t stream) { + LogicalNotKernel<<>>(input1, output, element_cnt); +} + +template +void LogicalAnd(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) { + LogicalAndKernel<<>>(input1, input2, output, element_cnt); +} + +template +void LogicalOr(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) { + LogicalOrKernel<<>>(input1, input2, output, element_cnt); +} + +template void LogicalNot(const int32_t *input1, int32_t *output, int element_cnt, cudaStream_t stream); + +template void LogicalAnd(const int32_t *input1, const int32_t *input2, int32_t *output, int element_cnt, + cudaStream_t stream); + +template void LogicalOr(const int32_t *input1, const int32_t *input2, int32_t *output, int element_cnt, + cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cuh new file mode 100644 index 00000000000..e2a18187aab --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_LOGICAL_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_LOGICAL_H_ + +template +void LogicalAnd(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream); + +template +void LogicalOr(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream); + +template +void LogicalNot(const T *input1, T *output, int element_cnt, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_LOGICAL_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cu new file mode 100644 index 00000000000..b8005a98334 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cu @@ -0,0 +1,98 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh" +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/utils.cuh" + +template +__global__ void NormalizeKernel(const T *input, const T *gamma, const T *beta, T *output, size_t n, float epsilion, + int dim_before_axis) { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int block_loop = (dim_before_axis - 1) / gridDim.x + 1; + const int element_cnt = dim_before_axis * n; + + __shared__ float s_mean[2048]; + __shared__ float s_variance[2048]; + float sum = 0.0f; + float variance = 0.0f; + + for (int block = 0; block < block_loop; block++) { + float local_sum = 0.0f; + int mean_index = bid + block * gridDim.x; + int num_index = bid * n + block * gridDim.x * blockDim.x; + for (int i = tid; i < n; i += blockDim.x) { + if (num_index + i >= element_cnt) { + break; + } + local_sum += static_cast(input[num_index + i]); + } + sum = blockReduceSum(local_sum); + if (tid == 0) { + s_mean[mean_index] = sum / n; + } + } + __syncthreads(); + + for (int block = 0; block < block_loop; block++) { + float local_var_sum = 0.0f; + int var_index = bid + block * gridDim.x; + int num_index = bid * n + block * gridDim.x * blockDim.x; + for (int i = tid; i < n; i += blockDim.x) { + if (num_index + i >= element_cnt) { + break; + } + float diff = static_cast(input[num_index + i]) - s_mean[var_index]; + local_var_sum += diff * diff; + } + variance = blockReduceSum(local_var_sum); + if (tid == 0) { + s_variance[var_index] = rsqrtf(variance / n + epsilion); + } + } + __syncthreads(); + for (int block = 0; block < block_loop; block++) { + int var_index = bid + block * gridDim.x; + int num_index = bid * n + block * gridDim.x * blockDim.x; + for (int i = tid; i < n; i += blockDim.x) { + if (num_index + i >= element_cnt) { + break; + } + float beta_val = (beta == nullptr) ? 
0.0f : static_cast(beta[i]); + output[num_index + i] = + static_cast(((static_cast(input[num_index + i]) - s_mean[var_index]) * s_variance[var_index]) * + static_cast(gamma[i]) + + beta_val); + } + } +} + +template +void Normalize(const T *input, const T *gamma, const T *beta, T *output, size_t dim_at_axis, float epsilion, + int element_cnt, cudaStream_t stream) { + int thread_num = GET_THREADS_CAL(dim_at_axis); + int block_num = GET_BLOCKS_CAL(element_cnt, thread_num); + int dim_before_axis = element_cnt / dim_at_axis; + NormalizeKernel<<>>(input, gamma, beta, output, dim_at_axis, epsilion, + dim_before_axis); + return; +} + +template void Normalize(const float *input, const float *gamma, const float *beta, float *output, size_t dim_at_axis, + float epsilion, int element_cnt, cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh new file mode 100644 index 00000000000..03eada9f3b4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh @@ -0,0 +1,24 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_NORMALIZE_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_NORMALIZE_H_ + +template +void Normalize(const T *input, const T *gamma, const T *beta, T *output, size_t dim_at_axis, float epsilion, + int element_cnt, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_NORMALIZE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/utils.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/utils.cuh new file mode 100644 index 00000000000..8d957877db9 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/utils.cuh @@ -0,0 +1,41 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#define FINAL_MASK 0xffffffff + +template +__device__ T warpedReduceSum(T val) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + val += __shfl_xor_sync(FINAL_MASK, val, mask, 32); + } + return val; +} + +template +__device__ T blockReduceSum(T val) { + static __shared__ T shared[32]; + int warped = threadIdx.x & 0x1f; + val = warpedReduceSum(val); + if (warped == 0) shared[threadIdx.x >> 5] = val; + __syncthreads(); + val = (threadIdx.x < (blockDim.x / 32.f)) ? 
shared[warped] : static_cast(0.0); + val = warpedReduceSum(val); + return val; +} diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.cc new file mode 100644 index 00000000000..48f49e688d1 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.cc @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" + +namespace mindspore::lite { +int GetGPUGroupSize() { return 1; } + +int GetRankID() { return 0; } +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.h b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.h new file mode 100644 index 00000000000..4feddaadb1e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.h @@ -0,0 +1,31 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_ + +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" + +namespace mindspore::lite { +constexpr char NCCL_WORLD_GROUP[] = "nccl_world_group"; + +int GetGPUGroupSize(); + +int GetRankID(); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base_impl.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base_impl.cc new file mode 100644 index 00000000000..760952e89b2 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base_impl.cc @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
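// NOTE (illustrative sketch, not part of this patch): the Normalize launcher and the
// blockReduceSum/warpedReduceSum helpers earlier in this patch implement a row-wise layer
// normalization, output[i] = (input[i] - mean) * rsqrt(variance + epsilion) * gamma[i] + beta[i],
// with one block reducing each row of length dim_at_axis (looping when the grid is smaller than
// the row count). A minimal host-side call could look like the following; the device buffers,
// row count and epsilon value are assumptions made only for this example.
void LaunchNormalizeExample(const float *d_input, const float *d_gamma, const float *d_beta, float *d_output,
                            cudaStream_t stream) {
  const size_t dim_at_axis = 256;    // length of the normalized (last) axis
  const int element_cnt = 32 * 256;  // 32 rows of 256 elements each
  const float epsilon = 1e-5f;
  Normalize(d_input, d_gamma, d_beta, d_output, dim_at_axis, epsilon, element_cnt, stream);
}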
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" +#include +#include +#include +#include "plugin/device/gpu/hal/device/distribution/collective_wrapper.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int GetGPUGroupSize() { return GetGroupSize(NCCL_WORLD_GROUP); } + +int GetRankID() { return GetRankIDByGroup(NCCL_WORLD_GROUP); } +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.cc new file mode 100644 index 00000000000..e3cc692de7b --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.cc @@ -0,0 +1,38 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" + +namespace mindspore::lite { +DistributionCollective::DistributionCollective() {} + +DistributionCollective &DistributionCollective::instance() { + static DistributionCollective instance; + return instance; +} + +int DistributionCollective::ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count, + nvinfer1::DataType data_type, schema::ReduceMode reduce_type, + cudaStream_t stream, const std::string &group) { + return RET_OK; +} + +int DistributionCollective::AllGatherWrapper(const void *input_addr, void *output_addr, size_t count, + nvinfer1::DataType data_type, cudaStream_t stream, + const std::string &group_name) { + return RET_OK; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.h b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.h new file mode 100644 index 00000000000..43ac1acbfa7 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.h @@ -0,0 +1,45 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_ + +#include +#include "NvInfer.h" +#include "schema/ops_types_generated.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" + +namespace mindspore::lite { +class DistributionCollective { + public: + DistributionCollective(DistributionCollective const &) = delete; + + DistributionCollective &operator=(const DistributionCollective &) = delete; + + static DistributionCollective &instance(); + + int ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count, nvinfer1::DataType data_type, + schema::ReduceMode reduce_type, cudaStream_t stream, const std::string &group); + + int AllGatherWrapper(const void *input_addr, void *output_addr, size_t count, nvinfer1::DataType data_type, + cudaStream_t stream, const std::string &group_name); + + private: + DistributionCollective(); + + ~DistributionCollective() = default; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective_impl.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective_impl.cc new file mode 100644 index 00000000000..e524db6a6f9 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective_impl.cc @@ -0,0 +1,72 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" +#include +#include +#include +#include "plugin/device/gpu/hal/device/distribution/collective_wrapper.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_utils.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" + +namespace mindspore::lite { +DistributionCollective::DistributionCollective() { + InitMPI(); + InitNCCLComm(); +} + +DistributionCollective &DistributionCollective::instance() { + static DistributionCollective instance; + return instance; +} + +int DistributionCollective::ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count, + nvinfer1::DataType data_type, schema::ReduceMode reduce_type, + cudaStream_t stream, const std::string &group) { + int rank_id = GetRankID(); + MS_LOG(DEBUG) << "ReduceScatter on rank: " << rank_id; + ncclResult_t ret = ReduceScatter(input_addr, output_addr, count, ConvertNCCLDataType(data_type), + ConvertNCCLReduceMode(reduce_type), stream, group); + if (ret != ncclSuccess) { + MS_LOG(ERROR) << "ReduceScatter failed: " << static_cast(ret); + return RET_ERROR; + } + auto cuda_ret = cudaStreamSynchronize(stream); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaStreamSynchronize failed: " << static_cast(cuda_ret); + return RET_ERROR; + } + return RET_OK; +} + +int DistributionCollective::AllGatherWrapper(const void *input_addr, void *output_addr, size_t count, + nvinfer1::DataType data_type, cudaStream_t stream, + const std::string &group_name) { + int rank_id = GetRankID(); + MS_LOG(DEBUG) << "AllGather on rank: " << rank_id; + ncclResult_t ret = AllGather(input_addr, output_addr, count, ConvertNCCLDataType(data_type), stream, group_name); + if (ret != ncclSuccess) { + MS_LOG(ERROR) << "AllGather failed: " << static_cast(ret); + return RET_ERROR; + } + auto cuda_ret = cudaStreamSynchronize(stream); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaStreamSynchronize failed: " << static_cast(cuda_ret); + return RET_ERROR; + } + return RET_OK; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.cc new file mode 100644 index 00000000000..8f45360c1b4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.cc @@ -0,0 +1,58 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
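// NOTE (illustrative sketch, not part of this patch): how a plugin's enqueue() typically drives
// the NCCL wrappers above; the device pointers and element count are assumptions made only for
// this example.
int AllGatherExample(const void *device_in, void *device_out, size_t send_count, cudaStream_t stream) {
  // Gathers send_count elements from every rank into device_out on each rank; the wrapper
  // synchronizes the stream itself before returning.
  return DistributionCollective::instance().AllGatherWrapper(device_in, device_out, send_count,
                                                             nvinfer1::DataType::kFLOAT, stream, NCCL_WORLD_GROUP);
}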
+ */
+
+#include "src/runtime/delegate/tensorrt/distribution/distribution_utils.h"
+#include <unordered_map>
+#include "src/common/log_adapter.h"
+
+namespace mindspore::lite {
+ncclDataType_t ConvertNCCLDataType(nvinfer1::DataType type_id) {
+  std::unordered_map<nvinfer1::DataType, ncclDataType_t> data_type_map = {
+    {nvinfer1::DataType::kINT8, ncclInt8},
+    {nvinfer1::DataType::kINT32, ncclInt32},
+    {nvinfer1::DataType::kFLOAT, ncclFloat32},
+    {nvinfer1::DataType::kHALF, ncclHalf},
+  };
+  auto iter = data_type_map.find(type_id);
+  ncclDataType_t data_type;
+  if (iter != data_type_map.end()) {
+    data_type = iter->second;
+  } else {
+    data_type = ncclFloat32;
+    MS_LOG(WARNING) << "invalid data_type for NCCL, need check: " << static_cast<int>(type_id);
+  }
+  return data_type;
+}
+
+ncclRedOp_t ConvertNCCLReduceMode(schema::ReduceMode mode) {
+  std::unordered_map<schema::ReduceMode, ncclRedOp_t> reduce_ops_ = {
+    // higher version support mean {schema::ReduceMode::ReduceMode_ReduceMean, ncclAvg},
+    {schema::ReduceMode::ReduceMode_ReduceMax, ncclMax},
+    {schema::ReduceMode::ReduceMode_ReduceMin, ncclMin},
+    {schema::ReduceMode::ReduceMode_ReduceProd, ncclProd},
+    {schema::ReduceMode::ReduceMode_ReduceSum, ncclSum},
+  };
+  auto iter = reduce_ops_.find(mode);
+  ncclRedOp_t nccl_mode;
+  if (iter != reduce_ops_.end()) {
+    nccl_mode = iter->second;
+  } else {
+    nccl_mode = ncclSum;
+    MS_LOG(WARNING) << "invalid reduce for NCCL, need check: " << static_cast<int>(mode);
+  }
+  return nccl_mode;
+}
+} // namespace mindspore::lite
diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.h b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.h
new file mode 100644
index 00000000000..e38b3a10691
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.h
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
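// NOTE (illustrative, not part of this patch): types and modes missing from the two tables above
// fall back with a warning rather than failing, e.g. ConvertNCCLDataType(nvinfer1::DataType::kBOOL)
// returns ncclFloat32, and ConvertNCCLReduceMode(schema::ReduceMode_ReduceMean) returns ncclSum
// because ncclAvg requires a newer NCCL release than the commented-out entry targets.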
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_ + +#include +#include "include/errorcode.h" +#include "NvInfer.h" +#include "schema/ops_types_generated.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::lite { +ncclDataType_t ConvertNCCLDataType(nvinfer1::DataType type_id); + +ncclRedOp_t ConvertNCCLReduceMode(schema::ReduceMode mode); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.cc new file mode 100644 index 00000000000..6ef4682d4be --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.cc @@ -0,0 +1,116 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "NvInferRuntimeCommon.h" +#include "src/runtime/delegate/tensorrt/op/activation_opt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/activation.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/swish_impl.cuh" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(ActivationOptPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int ActivationOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) noexcept { + return RunCudaActivation(inputDesc, inputs, outputs, stream); +} + +bool ActivationOptPlugin::needResize(const int *current_dims, const int *last_dims) { + for (int i = 0; i < infer_dims_cnt_; i++) { + if (current_dims[i] != last_dims[i]) { + return true; + } + } + return false; +} + +int ActivationOptPlugin::RunCuDNNActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, + void *const *outputs, cudaStream_t stream) { + if (needResize(infer_dims_, inputDesc[0].dims.d)) { + if (input_desc_ != nullptr) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_)); + input_desc_ = nullptr; + } + CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc_)); + for (int i = 0; i < inputDesc[0].dims.nbDims; i++) { + infer_dims_[i] = inputDesc[0].dims.d[i]; + } + CUDNN_CHECK(cudnnSetTensorNdDescriptor(input_desc_, ConvertCudnnDataType(inputDesc[0].type), infer_dims_cnt_, + infer_dims_, infer_stride_)); + } + CHECK_NULL_RETURN(cudnn_handle_); + CHECK_NULL_RETURN(activation_desc_); + CHECK_NULL_RETURN(input_desc_); + CUDNN_CHECK(cudnnSetStream(cudnn_handle_, stream)); + 
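// Descriptors are rebuilt above only when needResize() reports a shape change, and the plugin's
// cuDNN handle is bound to TensorRT's execution stream right before launch, so the call below runs
// asynchronously on the same stream as the rest of the engine. CudnnActivation() is the thin
// wrapper from cuda_impl/cudnn_utils.h; it presumably forwards to cudnnActivationForward with the
// prepared activation and tensor descriptors.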
auto ret = CudnnActivation(cudnn_handle_, activation_desc_, input_desc_, inputs[0], input_desc_, outputs[0]); + if (ret != RET_OK) { + MS_LOG(ERROR) << "cudnn activation func call failed " << layer_name_; + return ret; + } + return RET_OK; +} + +int ActivationOptPlugin::RunCudaActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, + void *const *outputs, cudaStream_t stream) { + switch (activation_type_) { + case (schema::ActivationType::ActivationType_SIGMOID): { + Sigmoid(static_cast(inputs[0]), static_cast(outputs[0]), GetDimsVolume(inputDesc[0].dims), + stream); + break; + } + case (schema::ActivationType::ActivationType_GELU): { + Gelu(static_cast(inputs[0]), static_cast(outputs[0]), GetDimsVolume(inputDesc[0].dims), + stream); + break; + } + case (schema::ActivationType::ActivationType_SWISH): { + CalSwish(GetDimsVolume(inputDesc[0].dims), static_cast(inputs[0]), + static_cast(outputs[0]), stream, device_id_); + break; + } + default: { + MS_LOG(ERROR) << "invalid activation type: " << static_cast(activation_type_); + return RET_ERROR; + } + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *ActivationOptPlugin::clone() const noexcept { + auto *plugin = new ActivationOptPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +size_t ActivationOptPlugin::getSerializationSize() const noexcept { return sizeof(schema::ActivationType); } + +void ActivationOptPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &activation_type_, sizeof(schema::ActivationType)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.h new file mode 100644 index 00000000000..9e3b5dfd952 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.h @@ -0,0 +1,72 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
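// NOTE (illustrative sketch, not part of this patch): the plugin state above is a single enum, so
// the TensorRT serialize/deserialize round trip reduces to copying sizeof(schema::ActivationType)
// bytes; the buffer and plugin name below are assumptions made only for this example.
void ActivationPluginRoundTripExample(const ActivationOptPlugin &plugin) {
  std::vector<char> buffer(plugin.getSerializationSize());
  plugin.serialize(buffer.data());
  // The (name, serialData, serialLength) constructor re-reads activation_type_ via DeserializeValue.
  ActivationOptPlugin restored("activation_opt_example", buffer.data(), buffer.size());
  (void)restored;
}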
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_ + +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h" + +namespace mindspore::lite { +constexpr char *ACTIVATION_OPT_PLUGIN_NAME{"ActivationOptPlugin"}; +class ActivationOptPlugin : public TensorRTPlugin { + public: + ActivationOptPlugin(const std::string name, schema::ActivationType activation_type, uint32_t device_id) + : TensorRTPlugin(name, std::string(ACTIVATION_OPT_PLUGIN_NAME), device_id), activation_type_(activation_type) {} + + ActivationOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(ACTIVATION_OPT_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + activation_type_ = static_cast(fields[0].data)[0]; + } + + ActivationOptPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(ACTIVATION_OPT_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &activation_type_, sizeof(schema::ActivationType)); + } + + ActivationOptPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + bool needResize(const int *current_dims, const int *last_dims); + int RunCudaActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs, + cudaStream_t stream); + int RunCuDNNActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs, + cudaStream_t stream); + const std::string layer_name_; + std::string name_space_; + schema::ActivationType activation_type_; + cudnnHandle_t cudnn_handle_{nullptr}; + cudnnActivationDescriptor_t activation_desc_{nullptr}; + cudnnTensorDescriptor_t input_desc_{nullptr}; + int infer_dims_[5]{1, 1, 1, 1, 1}; + int infer_stride_[5]{1, 1, 1, 1, 1}; + int infer_dims_cnt_{0}; +}; +class ActivationOptPluginCreater : public TensorRTPluginCreater { + public: + ActivationOptPluginCreater() : TensorRTPluginCreater(std::string(ACTIVATION_OPT_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.cc new file mode 100644 index 00000000000..e78ec89dddc --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.cc @@ -0,0 +1,153 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/cast_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_opt_plugin.h" + +namespace mindspore::lite { +namespace { +bool HasCustomActivationPlugin(schema::ActivationType type) { + std::unordered_set plugin_activation = {schema::ActivationType::ActivationType_SIGMOID, + schema::ActivationType::ActivationType_GELU, + schema::ActivationType::ActivationType_SWISH}; + return plugin_activation.find(type) != plugin_activation.end(); +} +} // namespace + +int ActivationTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + auto activation_op = this->op_primitive_->value_as_Activation(); + if (activation_op == nullptr) { + MS_LOG(ERROR) << "op convert failed"; + return RET_ERROR; + } + auto activation_params_opt = TryConvertActivationType(activation_op->activation_type()); + bool has_custom_plugin = HasCustomActivationPlugin(activation_op->activation_type()); + if (!activation_params_opt && !has_custom_plugin) { + MS_LOG(ERROR) << "Unsupported op action type for TensorRT: " << activation_op->activation_type(); + return RET_ERROR; + } + return RET_OK; +} +int ActivationTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx->network() == nullptr) { + MS_LOG(ERROR) << "network is invalid"; + return RET_ERROR; + } + auto activation_op = this->op_primitive_->value_as_Activation(); + if (activation_op == nullptr) { + MS_LOG(ERROR) << "op convert failed"; + return RET_ERROR; + } + float alpha = activation_op->alpha(); + nvinfer1::ITensor *activation_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getType() == nvinfer1::DataType::kINT32) { + activation_input = + TRTTensorCast(ctx, tensorrt_in_tensors_[0].trt_tensor_, nvinfer1::DataType::kFLOAT, op_name_ + "_cast_in"); + } + + auto activation_layer = + ActivationTensorRT::AddActivation(ctx, activation_op->activation_type(), alpha, + std::isfinite(activation_op->min_val()) ? activation_op->min_val() : FLT_MIN, + std::isfinite(activation_op->max_val()) ? 
activation_op->max_val() : FLT_MAX, + activation_input, device_id_, quant_type_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "add activation op failed for TensorRT."; + return RET_ERROR; + } + + activation_layer->setName(op_name_.c_str()); + // cast to origin type + nvinfer1::ITensor *out_tensor = activation_layer->getOutput(0); + if (out_tensor->getType() != ConvertDataType(out_tensors_[0].DataType())) { + out_tensor = TRTTensorCast(ctx, activation_layer->getOutput(0), ConvertDataType(out_tensors_[0].DataType()), + op_name_ + "_cast_out"); + } + out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + this->layer_ = activation_layer; + return RET_OK; +} +nvinfer1::ILayer *ActivationTensorRT::AddActivation(TensorRTContext *ctx, schema::ActivationType activation_type, + float alpha, float min_value, float max_value, + nvinfer1::ITensor *trt_in_tensor, uint32_t device_id, + schema::QuantType quant_type) { + bool has_custom_plugin = HasCustomActivationPlugin(activation_type); + // sigmoid precision is wrong for trt + if (quant_type == schema::QuantType_QUANT_NONE && has_custom_plugin) { + std::string layer_name = std::string(trt_in_tensor->getName()) + "_activation"; + auto plugin = std::make_shared(layer_name.c_str(), activation_type, device_id); + MS_LOG(INFO) << "using opt plugin for " << layer_name; + if (plugin == nullptr) { + MS_LOG(ERROR) << "create ActivationOptPlugin failed for " << layer_name; + return nullptr; + } + nvinfer1::ITensor *inputTensors[] = {trt_in_tensor}; + nvinfer1::IPluginV2Layer *activation_opt_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); + activation_opt_layer->setName(layer_name.c_str()); + return activation_opt_layer; + } + + // Just some action_code correct, unfind code is set to default relu. need double check. + auto action_param_opt = TryConvertActivationType(activation_type); + if (!action_param_opt) { + MS_LOG(ERROR) << "Unsupported op action type for TensorRT: " << activation_type; + return nullptr; + } + auto action_param = action_param_opt.value(); + nvinfer1::IActivationLayer *activation_layer = + ctx->network()->addActivation(*trt_in_tensor, action_param.activation_type); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "add activation op failed for TensorRT."; + return nullptr; + } + + if (activation_type == schema::ActivationType_HARD_TANH) { + activation_layer->setAlpha(min_value); + activation_layer->setBeta(max_value); + return activation_layer; + } + + if (action_param.has_alpha) { + activation_layer->setAlpha(alpha); + } + + if (action_param.has_beta) { + activation_layer->setBeta(action_param.beta); + } + + return activation_layer; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Activation, ActivationTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.h new file mode 100644 index 00000000000..81292b520c5 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
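// NOTE (illustrative, not part of this patch): in AddActivation() above, ActivationType_HARD_TANH
// is the one case that consumes min_val/max_val directly, parameterizing the TensorRT activation
// (presumably nvinfer1::ActivationType::kCLIP) as
//   y = max(min_val, min(max_val, x)),
// so a HardTanh with finite bounds becomes a single clip layer instead of a custom plugin.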
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ActivationTensorRT : public TensorRTOp { + public: + ActivationTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ActivationTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + static nvinfer1::ILayer *AddActivation(TensorRTContext *ctx, schema::ActivationType activation_type, float alpha, + float min_value, float max_value, nvinfer1::ITensor *trt_in_tensor, + uint32_t device_id = 0, + schema::QuantType quant_type = schema::QuantType_QUANT_NONE); +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.cc new file mode 100644 index 00000000000..7869766e197 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.cc @@ -0,0 +1,113 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/allgather_tensorrt.h" +#include +#include "NvInferRuntimeCommon.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(AllGatherPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int AllGatherTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { +#ifndef LITE_CUDA_DISTRIBUTION + MS_LOG(ERROR) + << "Unsupported package for gpu distribution feature, please recompile with MS_ENABLE_CUDA_DISTRIBUTION set to on."; + return RET_ERROR; +#else + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +#endif +} + +int AllGatherTensorRT::AddInnerOp(TensorRTContext *ctx) { + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_}; + auto allgather_op = op_primitive_->value_as_AllGather(); + if (allgather_op == nullptr) { + MS_LOG(ERROR) << "convert failed for " << op_name_; + return RET_ERROR; + } + int rank = GetGPUGroupSize(); + auto plugin = std::make_shared(op_name_, rank, device_id_); + MS_LOG(INFO) << op_name_ << " group size: " << rank << ", rank id: " << GetRankID(); + nvinfer1::IPluginV2Layer *allgather_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); + if (allgather_layer == nullptr) { + MS_LOG(ERROR) << "create AllGather layer failed for: " << op_name_; + return RET_ERROR; + } + nvinfer1::ITensor *allgather_out = allgather_layer->getOutput(0); + allgather_layer->setName(op_name_.c_str()); + allgather_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{allgather_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + this->layer_ = allgather_layer; + return RET_OK; +} + +// AllGatherPlugin +int AllGatherPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + MS_LOG(INFO) << "all gather run at rank id: " << GetRankID() << " stream: " << stream; + nvinfer1::Dims input_dims = inputDesc[0].dims; + int send_element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies()); + const void *input = inputs[0]; + void *output = outputs[0]; + auto ret = DistributionCollective::instance().AllGatherWrapper(input, output, send_element_cnt, inputDesc->type, + stream, NCCL_WORLD_GROUP); + if (ret != RET_OK) { + MS_LOG(ERROR) << "AllGather nccl run failed for " << layer_name_; + return ret; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *AllGatherPlugin::clone() const noexcept { + auto *plugin = new AllGatherPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs AllGatherPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, + int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs out_dims{}; + out_dims.nbDims = inputs->nbDims; + auto rank_dim = 
exprBuilder.constant(rank_); + out_dims.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs->d[0], *rank_dim); + for (int i = 1; i < inputs->nbDims; i++) { + out_dims.d[i] = inputs->d[i]; + } + return out_dims; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AllGather, AllGatherTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.h new file mode 100644 index 00000000000..a8e266e526e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.h @@ -0,0 +1,75 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_ +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" + +namespace mindspore::lite { +constexpr char *ALLGATHER_PLUGIN_NAME{"AllGatherPlugin"}; +class AllGatherTensorRT : public TensorRTOp { + public: + AllGatherTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~AllGatherTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +class AllGatherPlugin : public TensorRTPlugin { + public: + AllGatherPlugin(const std::string name, int rank, uint32_t device_id) + : TensorRTPlugin(name, std::string(ALLGATHER_PLUGIN_NAME), device_id), rank_(rank) {} + + AllGatherPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(ALLGATHER_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + rank_ = static_cast(fields[0].data)[0]; + } + + AllGatherPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(ALLGATHER_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &rank_, sizeof(int)); + } + + AllGatherPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + + private: + int rank_{0}; +}; +class 
AllGatherPluginCreater : public TensorRTPluginCreater { + public: + AllGatherPluginCreater() : TensorRTPluginCreater(std::string(ALLGATHER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.cc new file mode 100644 index 00000000000..9fde14fb2e4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.cc @@ -0,0 +1,83 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/cast_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh" +#include +#include +#include +#include + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(CastPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int CastPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + nvinfer1::Dims input_dims = inputDesc[0].dims; + int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies()); + + if (inputDesc->type == outputDesc->type) { + int element_size = (outputDesc->type == nvinfer1::DataType::kFLOAT) + ? sizeof(float) + : ((outputDesc->type == nvinfer1::DataType::kINT32) ? 
sizeof(int) : 0); + auto cuda_ret = cudaMemcpy(outputs[0], inputs[0], element_cnt * element_size, cudaMemcpyDeviceToDevice); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "copy mem failed for " << layer_name_; + return RET_ERROR; + } + return RET_OK; + } + if (inputDesc->type == nvinfer1::DataType::kINT32 && dest_datatype_ == nvinfer1::DataType::kFLOAT) { + auto input = static_cast(inputs[0]); + auto output = static_cast(outputs[0]); + Cast(element_cnt, input, output, stream); + } else if (inputDesc->type == nvinfer1::DataType::kFLOAT && dest_datatype_ == nvinfer1::DataType::kINT32) { + auto input = static_cast(inputs[0]); + auto output = static_cast(outputs[0]); + Cast(element_cnt, input, output, stream); + } else { + MS_LOG(ERROR) << "unsupported data type cast " << layer_name_; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *CastPlugin::clone() const noexcept { + auto *plugin = new CastPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DataType CastPlugin::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const + noexcept { + return dest_datatype_; +} + +size_t CastPlugin::getSerializationSize() const noexcept { + // origin_datatype_ and dest_datatype_ + return sizeof(nvinfer1::DataType) * 2; +} + +void CastPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &origin_datatype_, sizeof(nvinfer1::DataType)); + SerializeValue(&buffer, &dest_datatype_, sizeof(nvinfer1::DataType)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.h new file mode 100644 index 00000000000..100e142c990 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.h @@ -0,0 +1,67 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
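// NOTE (illustrative sketch, not part of this patch): CastPlugin::enqueue() above only converts
// between kFLOAT and kINT32 and degrades to a device-to-device memcpy when source and destination
// types already match. Constructing the plugin directly looks like this; the layer name is an
// assumption made only for this example, and device_id defaults to 0.
std::shared_ptr<CastPlugin> MakeCastPluginExample() {
  return std::make_shared<CastPlugin>("cast_example", nvinfer1::DataType::kINT32, nvinfer1::DataType::kFLOAT);
}
// getOutputDataType() then reports kFLOAT for the single output, regardless of the input types.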
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" + +namespace mindspore::lite { +constexpr char *CAST_PLUGIN_NAME{"CastPluginCreater"}; +class CastPlugin : public TensorRTPlugin { + public: + CastPlugin(const std::string name, nvinfer1::DataType origin_datatype, nvinfer1::DataType dest_datatype, + uint32_t device_id = 0) + : TensorRTPlugin(name, std::string(CAST_PLUGIN_NAME), device_id), + origin_datatype_(origin_datatype), + dest_datatype_(dest_datatype) {} + + CastPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(CAST_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + origin_datatype_ = static_cast(fields[0].data)[0]; + dest_datatype_ = static_cast(fields[1].data)[0]; + } + + CastPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(CAST_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &origin_datatype_, sizeof(nvinfer1::DataType)); + DeserializeValue(&serialData, &serialLength, &dest_datatype_, sizeof(nvinfer1::DataType)); + } + + CastPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const + noexcept override; + + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + nvinfer1::DataType origin_datatype_; + nvinfer1::DataType dest_datatype_; +}; +class CastPluginCreater : public TensorRTPluginCreater { + public: + CastPluginCreater() : TensorRTPluginCreater(std::string(CAST_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.cc new file mode 100644 index 00000000000..d9490408076 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/cast_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/cast_plugin.h" +#include +#include +#include +#include + +namespace mindspore::lite { +int CastTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int CastTensorRT::AddInnerOp(TensorRTContext *ctx) { + // cast to type tensor + auto type_tensor = in_tensors_[1]; + if (type_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "unknown cast type of " << op_name_; + return RET_ERROR; + } + auto type_data = static_cast(type_tensor.Data().get()); + DataType data_type = static_cast(type_data[0]); + MS_LOG(DEBUG) << op_name_ << " cast to data type(43 float): " << type_data[0]; + nvinfer1::DataType dest_datatype = ConvertDataType(data_type); + auto trt_tensor = tensorrt_in_tensors_[0].trt_tensor_; + +#if TRT_VERSION_GE(7, 2) + dest_datatype = (dest_datatype == nvinfer1::DataType::kBOOL ? nvinfer1::DataType::kINT32 : dest_datatype); + auto cast_layer = ctx->network()->addIdentity(*trt_tensor); +#else + auto plugin = std::make_shared(op_name_, trt_tensor->getType(), dest_datatype); + nvinfer1::ITensor *inputTensors[] = {trt_tensor}; + nvinfer1::IPluginV2Layer *cast_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); +#endif + if (cast_layer == nullptr) { + MS_LOG(ERROR) << "create cast layer failed for: " << op_name_; + return RET_ERROR; + } +#if TRT_VERSION_GE(7, 2) + cast_layer->setOutputType(0, dest_datatype); +#endif + cast_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *cast_out = cast_layer->getOutput(0); + cast_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{cast_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + this->layer_ = cast_layer; + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Cast, CastTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.h new file mode 100644 index 00000000000..e96d9477568 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
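// NOTE (illustrative, not part of this patch): CastTensorRT::AddInnerOp() above picks between two
// strategies. On TensorRT >= 7.2 it uses the identity-layer cast idiom, roughly
//   auto cast_layer = ctx->network()->addIdentity(*trt_tensor);
//   cast_layer->setOutputType(0, dest_datatype);  // requested output precision
// (with kBOOL demoted to kINT32 first), while older versions fall back to the CastPlugin from
// cast_plugin.h, which performs the conversion in its own CUDA kernel.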
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh" + +namespace mindspore::lite { +class CastTensorRT : public TensorRTOp { + public: + CastTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~CastTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + // CastTensorRT +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.cc new file mode 100644 index 00000000000..e1b1eba8aa3 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.cc @@ -0,0 +1,158 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/concate_tensorrt.h" +#include +#include + +namespace mindspore::lite { +int ConcateTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (type_ != schema::PrimitiveType_Stack && type_ != schema::PrimitiveType_Concat) { + MS_LOG(ERROR) << "Unsupported op :" << op_name_ << " , type: " << type_; + return RET_ERROR; + } + if (in_tensors.size() == 0 || in_tensors.size() < INPUT_SIZE2 && type_ != schema::PrimitiveType_Stack) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + + int input_nbDims = in_tensors_[0].Shape().size(); + if (axis_ == -1) { + axis_ = input_nbDims - 1; + } + if (axis_ < 0 || axis_ > input_nbDims || axis_ == input_nbDims && type_ != schema::PrimitiveType_Stack) { + MS_LOG(ERROR) << "concate_op valid axis : " << axis_ << " , input dims : " << input_nbDims; + return RET_ERROR; + } + return RET_OK; +} +int ConcateTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + + if (tensorrt_in_tensors_.size() != in_tensors_.size()) { + MS_LOG(ERROR) << "concate_op in tensor is invalid, trt tensor has " << tensorrt_in_tensors_.size() + << ", but origin ms tensor has " << in_tensors_.size(); + return RET_ERROR; + } + + nvinfer1::ITensor *trt_input_tensors[tensorrt_in_tensors_.size()]; + int ret = PreProcessInputs(ctx, trt_input_tensors); + if (ret != RET_OK) { + MS_LOG(ERROR) << "PreProcessInputs failed for " << op_name_; + return ret; + } + + if (!same_format_) { + if (trt_input_tensors[0]->getDimensions().nbDims == DIMENSION_4D && out_format_ == Format::NCHW) { + // when inputs all NCHW, change axis + axis_ = ConvertAxisFromNHWC2NCHW(axis_); + MS_LOG(DEBUG) << "concate axis change to " << axis_ << " when using NCHW format."; + } else { + MS_LOG(WARNING) << "input tensor format needs check, convert concat axis failed for " << op_name_; + } + } + + if (type_ == schema::PrimitiveType_Stack) { + for (size_t i = 0; i != tensorrt_in_tensors_.size(); ++i) { + auto shuffle_layer = ctx->network()->addShuffle(*trt_input_tensors[i]); + if (shuffle_layer == nullptr) { + MS_LOG(ERROR) << "addShuffle failed for TensorRT."; + return RET_ERROR; + } + auto shuffer_dims_opt = UnsqueezeDims(trt_input_tensors[i]->getDimensions(), axis_, 1); + if (!shuffer_dims_opt) { + MS_LOG(ERROR) << "UnsqueezeDims failed."; + return RET_ERROR; + } + shuffle_layer->setReshapeDimensions(shuffer_dims_opt.value()); + trt_input_tensors[i] = shuffle_layer->getOutput(0); + } + } + nvinfer1::IConcatenationLayer *concate_layer = + ctx->network()->addConcatenation(trt_input_tensors, static_cast(tensorrt_in_tensors_.size())); + if (concate_layer == nullptr) { + MS_LOG(ERROR) << "addConcatenation failed for TensorRT."; + return RET_ERROR; + } + + if (axis_ != RET_INVALID_OP_ATTR) { + concate_layer->setAxis(axis_); + } + concate_layer->setName(op_name_.c_str()); + auto concat_output = concate_layer->getOutput(0); + concat_output->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{concat_output, out_format_, same_format_}); + this->layer_ = 
concate_layer; + return RET_OK; +} + +int ConcateTensorRT::PreProcessInputs(TensorRTContext *ctx, nvinfer1::ITensor *trt_input_tensors[]) { + int input_nbDims = tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims; + out_format_ = tensorrt_in_tensors_[0].format_; + same_format_ = tensorrt_in_tensors_[0].same_format_; + + for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) { + if (tensorrt_in_tensors_[i].trt_tensor_->getDimensions().nbDims != input_nbDims) { + MS_LOG(ERROR) << "dims of inputs is invalid for " << op_name_; + return RET_ERROR; + } + // keep origin format if all input format are the same + if (input_nbDims == DIMENSION_4D && tensorrt_in_tensors_[i].format_ != out_format_) { + out_format_ = Format::NHWC; + } + } + + // make sure all inputs are same format + if (input_nbDims == DIMENSION_4D) { + for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) { + if (tensorrt_in_tensors_[i].format_ == out_format_) { + trt_input_tensors[i] = tensorrt_in_tensors_[i].trt_tensor_; + MS_LOG(DEBUG) << "concate input " << GetTensorFormat(tensorrt_in_tensors_[i]); + } else { + nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *tensorrt_in_tensors_[i].trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + trt_input_tensors[i] = transpose_layer->getOutput(0); + this->transpose_layer_ = transpose_layer; + same_format_ = true; + MS_LOG(DEBUG) << "concate input " << GetTensorFormat(trt_input_tensors[i], Format::NHWC, true); + } + } + } else { + for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) { + trt_input_tensors[i] = tensorrt_in_tensors_[i].trt_tensor_; + MS_LOG(DEBUG) << "concate input " << GetTensorFormat(tensorrt_in_tensors_[i]); + } + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Concat, ConcateTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Stack, ConcateTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.h new file mode 100644 index 00000000000..351f4abf17b --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.h @@ -0,0 +1,50 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ConcateTensorRT : public TensorRTOp { + public: + ConcateTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) { + type_ = primitive->value_type(); + axis_ = (type_ == schema::PrimitiveType_Concat ? 
primitive->value_as_Concat()->axis() + : primitive->value_as_Stack()->axis()); + } + + ~ConcateTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int PreProcessInputs(TensorRTContext *ctx, nvinfer1::ITensor *trt_input_tensors[]); + + Format out_format_{Format::NHWC}; + bool same_format_{true}; + schema::PrimitiveType type_; + int axis_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.cc new file mode 100644 index 00000000000..28e3215ebcf --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.cc @@ -0,0 +1,187 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/convolution_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" + +namespace mindspore::lite { +constexpr int BIAS_INDEX = 2; + +int ConvolutionTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) { + MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format(); + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + const schema::Conv2DFusion *conv_op = this->op_primitive_->value_as_Conv2DFusion(); + if (conv_op == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + + nvinfer1::ITensor *conv_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // transpose: NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NHWC->NCHW failed"; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + this->transpose_layer_ = transpose_layer_in; + conv_input = 
transpose_layer_in->getOutput(0); + } + + // transpose weight + const mindspore::MSTensor &weight_tensor = in_tensors_[1]; + nvinfer1::Weights kernelWeights = lite::TransposeWeight4D(weight_tensor, &pack_weight_); + + // conv + int nbOutputMaps = weight_tensor.Shape()[0]; + if (nbOutputMaps <= 0) { + MS_LOG(ERROR) << "out_channel is invalid"; + return RET_ERROR; + } + + auto kernel_size = conv_op->kernel_size(); + if (kernel_size == nullptr) { + MS_LOG(ERROR) << "kernel_size is null"; + return RET_ERROR; + } + nvinfer1::Dims kernelSize = lite::ConvertCudaDims(std::vector(kernel_size->begin(), kernel_size->end())); + if (kernelSize.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + // bias + nvinfer1::Weights biasWeights{}; + if (in_tensors_.size() >= INPUT_SIZE3) { + biasWeights = lite::ConvertWeight(in_tensors_[BIAS_INDEX]); + } else { + biasWeights.type = ConvertDataType(weight_tensor.DataType()); + biasWeights.count = 0; + biasWeights.values = nullptr; + } + + nvinfer1::IConvolutionLayer *conv_layer = + ctx->network()->addConvolutionNd(*conv_input, nbOutputMaps, kernelSize, kernelWeights, biasWeights); + + if (conv_layer == nullptr) { + MS_LOG(ERROR) << "ConvolutionLayer failed"; + return RET_ERROR; + } + conv_layer->setName((op_name_ + "_conv").c_str()); + this->layer_ = conv_layer; + + // add params + SetAttributes(conv_op, conv_layer); + + // add activation + nvinfer1::ILayer *activation_layer = nullptr; + if (conv_op->activation_type() == schema::ActivationType::ActivationType_NO_ACTIVATION) { + activation_layer = conv_layer; + } else { + activation_layer = + ActivationTensorRT::AddActivation(ctx, conv_op->activation_type(), 0, 0, 0, conv_layer->getOutput(0), device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for conv failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + } + activation_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW, false}); + return RET_OK; +} + +void ConvolutionTensorRT::SetAttributes(const schema::Conv2DFusion *conv_op, nvinfer1::IConvolutionLayer *conv_layer) { + auto stride = conv_op->stride(); + if (stride != nullptr) { + auto stride_val = std::vector(stride->begin(), stride->end()); + auto dims = ConvertCudaDims(stride_val); + if (dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + conv_layer->setStrideNd(dims); + } + + auto dilation = conv_op->dilation(); + if (dilation != nullptr) { + auto dilation_val = std::vector(dilation->begin(), dilation->end()); + auto dims = ConvertCudaDims(dilation_val); + if (dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + conv_layer->setDilationNd(dims); + } + int nbGroups = conv_op->group(); + if (nbGroups > 0) { + conv_layer->setNbGroups(nbGroups); + } + + schema::PadMode pad_mode = conv_op->pad_mode(); + if (pad_mode == schema::PadMode::PadMode_SAME) { + conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } else { + auto padding = conv_op->pad_list(); + if (padding != nullptr && padding->size() == DIMENSION_4D) { + auto padding_val = std::vector(padding->begin(), padding->end()); + if (padding_val[0] != padding_val[1] || padding_val[DIMENSION_2D] != padding_val[DIMENSION_3D]) { + MS_LOG(WARNING) << op_name_ << " has different up and down padding value"; + } + nvinfer1::Dims2 
dims(padding_val[0], padding_val[DIMENSION_2D]); + conv_layer->setPaddingNd(dims); + } else if (padding == nullptr || padding->size() == 0) { + nvinfer1::Dims2 dims; + conv_layer->setPaddingNd(dims); + } else { + MS_LOG(WARNING) << "pad list is invalid for " << op_name_; + } + } +} + +ConvolutionTensorRT::~ConvolutionTensorRT() { + if (pack_weight_ != nullptr) { + free(pack_weight_); + pack_weight_ = nullptr; + } +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Conv2DFusion, ConvolutionTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.h new file mode 100644 index 00000000000..cfeb755a579 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ConvolutionTensorRT : public TensorRTOp { + public: + ConvolutionTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ConvolutionTensorRT() override; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + void SetAttributes(const schema::Conv2DFusion *ms_op, nvinfer1::IConvolutionLayer *current_layer_); + + void *pack_weight_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.cc new file mode 100644 index 00000000000..08e96ed6662 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.cc @@ -0,0 +1,199 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include "nnacl/pack.h" + +namespace mindspore::lite { +int DeconvolutionTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) { + MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format(); + return RET_ERROR; + } + return RET_OK; +} +int DeconvolutionTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + const schema::Conv2dTransposeFusion *deconv_op = this->op_primitive_->value_as_Conv2dTransposeFusion(); + if (deconv_op == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + nvinfer1::ITensor *deconv_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // transpose: NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NHWC->NCHW failed"; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + this->transpose_layer_ = transpose_layer_in; + deconv_input = transpose_layer_in->getOutput(0); + } + + // transpose weight + const mindspore::MSTensor &weight_tensor = in_tensors_[1]; + nvinfer1::Weights kernelWeights = lite::TransposeWeight4D(weight_tensor, &pack_weight_); + + // deconv basic params + int nbOutputMaps = weight_tensor.Shape()[0]; + if (nbOutputMaps <= 0) { + MS_LOG(ERROR) << "out_channel is invalid"; + return RET_ERROR; + } + + auto kernel_size = deconv_op->kernel_size(); + if (kernel_size == nullptr) { + MS_LOG(ERROR) << "kernel_size is null"; + return RET_ERROR; + } + nvinfer1::Dims kernelSize = lite::ConvertCudaDims(std::vector(kernel_size->begin(), kernel_size->end())); + if (kernelSize.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + // bias + nvinfer1::Weights biasWeights{}; + if (in_tensors_.size() >= INPUT_SIZE3) { + biasWeights = lite::ConvertWeight(in_tensors_[INPUT_SIZE3 - 1]); + } else { + biasWeights.type = ConvertDataType(weight_tensor.DataType()); + biasWeights.count = 0; + biasWeights.values = nullptr; + } + + nvinfer1::IDeconvolutionLayer *deconv_layer = + ctx->network()->addDeconvolutionNd(*deconv_input, nbOutputMaps, kernelSize, kernelWeights, biasWeights); + + if (deconv_layer == nullptr) { + MS_LOG(ERROR) << "DeconvolutionLayer failed"; + return RET_ERROR; + } + deconv_layer->setName((op_name_ + "_deconv").c_str()); + this->layer_ = deconv_layer; + // set extra params + SetAttributes(deconv_op, deconv_layer); + + // add activation + nvinfer1::ILayer *activation_layer = nullptr; + if 
(deconv_op->activation_type() == schema::ActivationType::ActivationType_NO_ACTIVATION) { + activation_layer = deconv_layer; + } else { + activation_layer = ActivationTensorRT::AddActivation(ctx, deconv_op->activation_type(), 0, 0, 0, + deconv_layer->getOutput(0), device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for conv failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + } + activation_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW, false}); + return RET_OK; +} + +void DeconvolutionTensorRT::SetAttributes(const schema::Conv2dTransposeFusion *ms_op, + nvinfer1::IDeconvolutionLayer *decon_layer) { + // kernel_size + auto kernel_size = ms_op->kernel_size(); + if (kernel_size != nullptr) { + auto kernel_size_val = std::vector(kernel_size->begin(), kernel_size->end()); + nvinfer1::Dims kernel_size_dims = lite::ConvertCudaDims(kernel_size_val); + if (kernel_size_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + decon_layer->setKernelSizeNd(kernel_size_dims); + } + + // nbOutputMaps + int32_t nbOutputMaps = static_cast(ms_op->out_channel()); + decon_layer->setNbOutputMaps(nbOutputMaps); + + // stride + auto stride = ms_op->stride(); + if (stride != nullptr) { + auto stride_val = std::vector(stride->begin(), stride->end()); + nvinfer1::Dims stride_dims = lite::ConvertCudaDims(stride_val); + if (stride_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + decon_layer->setStrideNd(stride_dims); + } + + // nbGroups + int32_t nbGroups = static_cast(ms_op->group()); + decon_layer->setNbGroups(nbGroups); + + // padding + schema::PadMode pad_mode = ms_op->pad_mode(); + if (pad_mode == schema::PadMode::PadMode_SAME) { + decon_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } else { + auto padding = ms_op->pad_list(); + auto out_pad = ms_op->output_paddings(); + if (padding == nullptr || out_pad == nullptr) { + MS_LOG(WARNING) << "on pad value of " << op_name_; + return; + } + auto padding_val = std::vector(padding->begin(), padding->end()); + auto out_pad_val = std::vector(out_pad->begin(), out_pad->end()); // h, w + if (out_pad_val.size() != DIMENSION_2D || padding_val.size() != DIMENSION_4D) { + MS_LOG(ERROR) << "invalid size of pad " << op_name_; + return; + } + nvinfer1::Dims dims_pre{}; + dims_pre.nbDims = DIMENSION_2D; + dims_pre.d[0] = padding_val[0]; // up + dims_pre.d[1] = padding_val[2]; // left + decon_layer->setPrePadding(dims_pre); + nvinfer1::Dims dims_post{}; + dims_post.nbDims = DIMENSION_2D; + dims_post.d[0] = padding_val[1] - out_pad_val[0]; // down + dims_post.d[1] = padding_val[3] - out_pad_val[1]; // right + decon_layer->setPostPadding(dims_post); + } +} + +DeconvolutionTensorRT::~DeconvolutionTensorRT() { + if (pack_weight_ != nullptr) { + free(pack_weight_); + pack_weight_ = nullptr; + } +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Conv2dTransposeFusion, DeconvolutionTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h new file mode 100644 index 00000000000..e7cfe233816 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class DeconvolutionTensorRT : public TensorRTOp { + public: + DeconvolutionTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~DeconvolutionTensorRT() override; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + void SetAttributes(const schema::Conv2dTransposeFusion *ms_op, nvinfer1::IDeconvolutionLayer *decon_layer); + + void *pack_weight_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.cc new file mode 100644 index 00000000000..05ac5ceaefd --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.cc @@ -0,0 +1,312 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" + +namespace mindspore::lite { +namespace { +std::unordered_map NOT_BOOL_PRIM2NV_ELEM_OP = { +#if TRT_VERSION_GE(7, 2) + {schema::PrimitiveType_Less, nvinfer1::ElementWiseOperation::kLESS}, + {schema::PrimitiveType_Greater, nvinfer1::ElementWiseOperation::kGREATER}, +#endif + {schema::PrimitiveType_AddFusion, nvinfer1::ElementWiseOperation::kSUM}, + {schema::PrimitiveType_PowFusion, nvinfer1::ElementWiseOperation::kPOW}, + {schema::PrimitiveType_DivFusion, nvinfer1::ElementWiseOperation::kDIV}, + {schema::PrimitiveType_RealDiv, nvinfer1::ElementWiseOperation::kDIV}, + {schema::PrimitiveType_FloorDiv, nvinfer1::ElementWiseOperation::kFLOOR_DIV}, + {schema::PrimitiveType_SubFusion, nvinfer1::ElementWiseOperation::kSUB}, + {schema::PrimitiveType_MulFusion, nvinfer1::ElementWiseOperation::kPROD}, + {schema::PrimitiveType_Minimum, nvinfer1::ElementWiseOperation::kMIN}, + {schema::PrimitiveType_Maximum, nvinfer1::ElementWiseOperation::kMAX}, + {schema::PrimitiveType_BiasAdd, nvinfer1::ElementWiseOperation::kSUM}, +#if TRT_VERSION_GE(7, 2) + {schema::PrimitiveType_Equal, nvinfer1::ElementWiseOperation::kEQUAL}, +#endif +}; +} // namespace + +int ElementWiseTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input tensort size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensort size: " << out_tensors.size(); + return RET_ERROR; + } + + // if constant tensor is scalar, it needs to know another input tensor's shape to broadcast + if ((in_tensors[0].Shape().size() > 0 && in_tensors[0].Shape()[0] == -1 && in_tensors[1].Shape().size() == 0) || + (in_tensors[1].Shape().size() > 0 && in_tensors[1].Shape()[0] == -1 && in_tensors[0].Shape().size() == 0)) { + MS_LOG(ERROR) << "invalid all input tensor shape unknown for: " << op_name_; + return RET_ERROR; + } + + bool is_not_bool_arith = NOT_BOOL_PRIM2NV_ELEM_OP.find(type_) != NOT_BOOL_PRIM2NV_ELEM_OP.end(); + if (is_not_bool_arith) { + if (std::any_of(in_tensors.begin(), in_tensors.end(), + [](const mindspore::MSTensor &tensor) { return tensor.DataType() == DataType::kNumberTypeBool; })) { + MS_LOG(ERROR) << "invalid input type for : " << op_name_; + return RET_ERROR; + } + element_wise_op_ = NOT_BOOL_PRIM2NV_ELEM_OP[type_]; + } + if (!is_not_bool_arith) { + // PrimitiveType_Eltwise + auto eltwise_op = op_primitive_->value_as_Eltwise(); + if (eltwise_op == nullptr) { + MS_LOG(ERROR) << "convert to Eltwise failed: " << op_name_; + return RET_ERROR; + } + schema::EltwiseMode eltwiseMode = eltwise_op->mode(); + std::map eltwise_modes = { + {schema::EltwiseMode::EltwiseMode_SUM, nvinfer1::ElementWiseOperation::kSUM}, + {schema::EltwiseMode::EltwiseMode_PROD, nvinfer1::ElementWiseOperation::kPROD}, + {schema::EltwiseMode::EltwiseMode_MAXIMUM, nvinfer1::ElementWiseOperation::kMAX}, + }; + auto iter_mode = eltwise_modes.find(eltwiseMode); + if (iter_mode != eltwise_modes.end()) { + element_wise_op_ = iter_mode->second; + } else { + MS_LOG(ERROR) << "unsupported type for ElementWise op" << op_name_; + return 
RET_ERROR; + } + } + return RET_OK; +} + +int ElementWiseTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "network or input tensor size is invalid"; + return RET_ERROR; + } + ITensorHelper x_input; + ITensorHelper y_input; + int ret = PreprocessInputTensors(ctx, &x_input, &y_input); + if (ret != RET_OK) { + MS_LOG(ERROR) << "PreprocessInputTensors failed."; + return RET_ERROR; + } + nvinfer1::IElementWiseLayer *cal_layer = + ctx->network()->addElementWise(*x_input.trt_tensor_, *y_input.trt_tensor_, element_wise_op_); + + if (cal_layer == nullptr) { + MS_LOG(ERROR) << "addElementWise failed for TensorRT."; + return RET_ERROR; + } + cal_layer->setName(op_name_.c_str()); + this->layer_ = cal_layer; + + nvinfer1::ITensor *op_out_tensor = cal_layer->getOutput(0); + if (op_out_tensor == nullptr) { + MS_LOG(ERROR) << "addElementWise out tensor is nullptr."; + return RET_ERROR; + } + // add activation + nvinfer1::ITensor *activation_out_tensor = AddActivation(ctx, op_out_tensor); + op_out_tensor = (activation_out_tensor == nullptr) ? op_out_tensor : activation_out_tensor; + + // scale and shift + if (type_ == schema::PrimitiveType_PowFusion) { + auto pow_op = op_primitive_->value_as_PowFusion(); + if (pow_op == nullptr) { + MS_LOG(ERROR) << "PowFusion convert failed."; + return RET_ERROR; + } + float scale = pow_op->scale(); + float shift = pow_op->shift(); + if (abs(scale - 1) >= 1.0e-05 || abs(shift - 0) >= 1.0e-05) { + MS_LOG(WARNING) << "deal with scale and shift for pow op"; + } + } +#if TRT_VERSION_GE(7, 2) + std::unordered_set bool_producer_ops = { + schema::PrimitiveType_Equal, schema::PrimitiveType_Greater, schema::PrimitiveType_Less}; + if (bool_producer_ops.find(type_) != bool_producer_ops.end()) { + auto cast_layer = ctx->network()->addIdentity(*op_out_tensor); + if (cast_layer == nullptr) { + MS_LOG(ERROR) << "create cast layer failed for: " << op_name_; + return RET_ERROR; + } + cast_layer->setOutputType(0, nvinfer1::DataType::kINT32); + op_out_tensor = cast_layer->getOutput(0); + MS_LOG(INFO) << "bool result cast to int32" << op_name_; + } +#endif + op_out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{op_out_tensor, x_input.format_, x_input.same_format_}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +int ElementWiseTensorRT::PreprocessInputTensors(TensorRTContext *ctx, ITensorHelper *x_input, ITensorHelper *y_input) { + int input_x_index = SameTensor(tensorrt_in_tensors_[0].trt_tensor_, &in_tensors_[0]) ? 0 : 1; + if (in_tensors_[0].Shape() == in_tensors_[1].Shape() && in_tensors_[0].IsConst()) { + input_x_index = 1; + } + + if (this->tensorrt_in_tensors_.size() != INPUT_SIZE2) { + int ret = AddConstTensor(ctx); + if (ret != RET_OK) { + return ret; + } + } + *x_input = tensorrt_in_tensors_[input_x_index]; + *y_input = tensorrt_in_tensors_[1 - input_x_index]; + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(*x_input); + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(*y_input); + + if (x_input->trt_tensor_->getDimensions().nbDims == DIMENSION_4D && x_input->format_ != y_input->format_) { + // when inputs format are different, change to NHWC + auto need_trans = x_input->format_ == Format::NCHW ? 
x_input : y_input; + nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *need_trans->trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_input_transpose2NHWC").c_str()); + need_trans->trt_tensor_ = transpose_layer->getOutput(0); + need_trans->format_ = Format::NHWC; + need_trans->same_format_ = true; + } + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(*x_input); + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(*y_input); + if (GetDimsVolume(x_input->trt_tensor_->getDimensions()) == GetDimsVolume(y_input->trt_tensor_->getDimensions()) && + x_input->trt_tensor_->getDimensions().nbDims != y_input->trt_tensor_->getDimensions().nbDims) { + bool x_large = x_input->trt_tensor_->getDimensions().nbDims > y_input->trt_tensor_->getDimensions().nbDims; + auto input_tensor = x_large ? y_input : x_input; + auto output_dim = x_large ? x_input->trt_tensor_->getDimensions() : y_input->trt_tensor_->getDimensions(); + auto reshape_layer = ctx->network()->addShuffle(*input_tensor->trt_tensor_); + if (reshape_layer == nullptr) { + MS_LOG(ERROR) << "add reshape failed for " << op_name_; + return RET_ERROR; + } + reshape_layer->setReshapeDimensions(output_dim); + input_tensor->trt_tensor_ = reshape_layer->getOutput(0); + } + return RET_OK; +} + +nvinfer1::ITensor *ElementWiseTensorRT::AddActivation(TensorRTContext *ctx, nvinfer1::ITensor *in_tensor) { + schema::ActivationType activation = schema::ActivationType::ActivationType_NO_ACTIVATION; + switch (type_) { + case schema::PrimitiveType_AddFusion: { + auto sum_op = op_primitive_->value_as_AddFusion(); + if (sum_op == nullptr) { + MS_LOG(ERROR) << "AddFusion convert failed."; + return nullptr; + } + activation = sum_op->activation_type(); + break; + } + case schema::PrimitiveType_DivFusion: { + auto div_op = op_primitive_->value_as_DivFusion(); + if (div_op == nullptr) { + MS_LOG(ERROR) << "DivFusion convert failed."; + return nullptr; + } + activation = div_op->activation_type(); + break; + } + case schema::PrimitiveType_SubFusion: { + auto sub_op = op_primitive_->value_as_SubFusion(); + if (sub_op == nullptr) { + MS_LOG(ERROR) << "SubFusion convert failed."; + return nullptr; + } + activation = sub_op->activation_type(); + break; + } + case schema::PrimitiveType_MulFusion: { + auto mul_op = op_primitive_->value_as_MulFusion(); + if (mul_op == nullptr) { + MS_LOG(ERROR) << "MulFusion convert failed."; + return nullptr; + } + activation = mul_op->activation_type(); + break; + } + default: + MS_LOG(DEBUG) << "no activation need for: " << op_name_; + } + nvinfer1::ITensor *activation_out_tensor = nullptr; + if (activation != schema::ActivationType::ActivationType_NO_ACTIVATION) { + auto activation_layer = ActivationTensorRT::AddActivation(ctx, activation, 0, 0, 0, in_tensor, device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for element wise failed"; + return nullptr; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + activation_out_tensor = activation_layer->getOutput(0); + } + return activation_out_tensor; +} +int ElementWiseTensorRT::AddConstTensor(TensorRTContext *ctx) { + int const_tensor_index = (in_tensors_[0].Data() != nullptr && in_tensors_[0].IsConst()) ? 
0 : 1; + nvinfer1::ITensor *constant_input = ConvertConstantTensorWithDims( + ctx, in_tensors_[const_tensor_index], in_tensors_[1 - const_tensor_index].Shape(), op_name_); + CHECK_NULL_RETURN(constant_input); + AddInnerInTensors(ITensorHelper{constant_input, tensorrt_in_tensors_[0].format_, true}); + return RET_OK; +} +bool ElementWiseTensorRT::SameTensor(nvinfer1::ITensor *trt_tensor, mindspore::MSTensor *ms_tensor) { + if (SameDims(trt_tensor->getDimensions(), ms_tensor->Shape())) { + return true; + } + if (ms_tensor->Shape().size() == DIMENSION_4D) { + // nhwc nchw + auto nchw_shape = NHWC2NCHW(ms_tensor->Shape()); + if (SameDims(trt_tensor->getDimensions(), nchw_shape)) { + return true; + } + } + auto str_name = strstr(trt_tensor->getName(), ms_tensor->Name().c_str()); + if (str_name != nullptr) { + return true; + } + str_name = strstr(ms_tensor->Name().c_str(), trt_tensor->getName()); + if (str_name != nullptr) { + return true; + } + return false; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_SubFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_DivFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_RealDiv, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PowFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AddFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MulFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Eltwise, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Minimum, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Maximum, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_BiasAdd, ElementWiseTensorRT) +#if TRT_VERSION_GE(7, 2) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Equal, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Less, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Greater, ElementWiseTensorRT) +#endif +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h new file mode 100644 index 00000000000..ece6aeaa62c --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h @@ -0,0 +1,50 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ElementWiseTensorRT : public TensorRTOp { + public: + ElementWiseTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ElementWiseTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ITensor *AddActivation(TensorRTContext *ctx, nvinfer1::ITensor *in_tensor); + + int AddConstTensor(TensorRTContext *ctx); + + bool SameTensor(nvinfer1::ITensor *trt_tensor, mindspore::MSTensor *ms_tensor); + + int PreprocessInputTensors(TensorRTContext *ctx, ITensorHelper *x_input, ITensorHelper *y_input); + + nvinfer1::ElementWiseOperation element_wise_op_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.cc new file mode 100644 index 00000000000..2b817274d78 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.cc @@ -0,0 +1,96 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/equal_tensorrt.h" +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "NvInferRuntimeCommon.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(EqualPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int EqualTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int EqualTensorRT::AddInnerOp(TensorRTContext *ctx) { + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[1].trt_tensor_}; + auto plugin = std::make_shared(op_name_, device_id_); + nvinfer1::IPluginV2Layer *equal_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin); + if (equal_layer == nullptr) { + MS_LOG(ERROR) << "create equal layer failed for: " << op_name_; + return RET_ERROR; + } + layer_ = equal_layer; + nvinfer1::ITensor *equal_out = equal_layer->getOutput(0); + equal_layer->setName(op_name_.c_str()); + equal_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{equal_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} + +int EqualPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + nvinfer1::Dims input_dims = inputDesc[0].dims; + int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies()); + + if (inputDesc->type == nvinfer1::DataType::kINT32) { + const int *input1 = static_cast(inputs[0]); + const int *input2 = static_cast(inputs[1]); + int *output = static_cast(outputs[0]); + Equal(input1, input2, output, element_cnt, stream); + } else if (inputDesc->type == nvinfer1::DataType::kFLOAT) { + const float *input1 = static_cast(inputs[0]); + const float *input2 = static_cast(inputs[1]); + float *output = static_cast(outputs[0]); + Equal(input1, input2, output, element_cnt, stream); + } else { + MS_LOG(ERROR) << "unsupported equal data type"; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *EqualPlugin::clone() const noexcept { + auto *plugin = new EqualPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} +#if TRT_VERSION_LS(7, 2) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Equal, EqualTensorRT) +#endif +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.h new file mode 100644 index 00000000000..35e5d2259b5 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.h @@ -0,0 +1,63 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/equal.cuh" + +namespace mindspore::lite { +constexpr char *EQUAL_PLUGIN_NAME{"EqualPlugin"}; +class EqualTensorRT : public TensorRTOp { + public: + EqualTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~EqualTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +class EqualPlugin : public TensorRTPlugin { + public: + EqualPlugin(const std::string name, uint32_t device_id) + : TensorRTPlugin(name, std::string(EQUAL_PLUGIN_NAME), device_id) {} + + EqualPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(EQUAL_PLUGIN_NAME)) {} + + EqualPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(EQUAL_PLUGIN_NAME)) {} + + EqualPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; +}; +class EqualPluginCreater : public TensorRTPluginCreater { + public: + EqualPluginCreater() : TensorRTPluginCreater(std::string(EQUAL_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.cc new file mode 100644 index 00000000000..a0ea8f40f6a --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.cc @@ -0,0 +1,106 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h"
+#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
+#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
+
+namespace mindspore::lite {
+constexpr int BIAS_INDEX = 2;
+
+int FullyConnectedTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
+                                      const std::vector<mindspore::MSTensor> &in_tensors,
+                                      const std::vector<mindspore::MSTensor> &out_tensors) {
+  if (!IsShapeKnown()) {
+    MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
+    return RET_ERROR;
+  }
+  if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) {
+    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int FullyConnectedTensorRT::AddInnerOp(TensorRTContext *ctx) {
+  auto primitive = op_primitive_->value_as_FullConnection();
+  CHECK_NULL_RETURN(primitive);
+  activation_ = primitive->activation_type();
+  int axis = primitive->axis();
+  if (axis < 0 || axis >= static_cast<int>(out_tensors_[0].Shape().size())) {
+    MS_LOG(ERROR) << "axis: " << axis << " is invalid for " << op_name_;
+    return RET_ERROR;
+  }
+  ITensorHelper fc_input;
+  auto ret = PreprocessInputs(ctx, &fc_input);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "PreprocessInputs failed for " << op_name_;
+    return ret;
+  }
+  // the kernel weight is whichever input tensor actually carries constant data
+  auto kernel_weight = ConvertWeight(in_tensors_[1].Data().get() == nullptr ? in_tensors_[0] : in_tensors_[1]);
+  nvinfer1::Weights bias_weight{};
+  if (primitive->has_bias()) {
+    bias_weight = ConvertWeight(in_tensors_[BIAS_INDEX]);
+  }
+  nvinfer1::IFullyConnectedLayer *fc_layer = ctx->network()->addFullyConnected(
+    *(fc_input.trt_tensor_), out_tensors_[0].Shape()[axis], kernel_weight, bias_weight);
+  if (fc_layer == nullptr) {
+    MS_LOG(ERROR) << "addFullyConnected failed for " << op_name_;
+    return RET_ERROR;
+  }
+  this->layer_ = fc_layer;
+  fc_layer->setName(op_name_.c_str());
+  nvinfer1::ITensor *out_tensor = fc_layer->getOutput(0);
+
+  if (out_tensor->getDimensions().nbDims != static_cast<int>(out_tensors_[0].Shape().size())) {
+    // reshape the fully connected output (which carries trailing 1x1 dims) back to the expected output rank
+    std::vector<int64_t> squeeze_dim(out_tensors_[0].Shape());
+    squeeze_dim[0] = out_tensor->getDimensions().d[0] == -1 ?
-1 : squeeze_dim[0]; + out_tensor = Reshape(ctx, out_tensor, squeeze_dim); + } + // add activation + if (activation_ != schema::ActivationType::ActivationType_NO_ACTIVATION) { + nvinfer1::ILayer *activation_layer = + ActivationTensorRT::AddActivation(ctx, activation_, 0, 0, 0, out_tensor, device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for matmul failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + out_tensor = activation_layer->getOutput(0); + } + + out_tensor->setName((op_name_ + "_output").c_str()); + MS_LOG(DEBUG) << "output " << GetTensorFormat(out_tensor); + this->AddInnerOutTensors(ITensorHelper{out_tensor, fc_input.format_}); + return RET_OK; +} + +int FullyConnectedTensorRT::PreprocessInputs(TensorRTContext *ctx, ITensorHelper *fc_input) { + auto ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], fc_input); + if (ret != RET_OK) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim failed for " << op_name_; + return ret; + } + auto origin_dims = fc_input->trt_tensor_->getDimensions(); + if (origin_dims.nbDims != DIMENSION_4D) { + std::vector expand_dim(origin_dims.d, origin_dims.d + origin_dims.nbDims); + for (int i = 0; i < DIMENSION_4D - origin_dims.nbDims; i++) { + expand_dim.push_back(1); + } + fc_input->trt_tensor_ = Reshape(ctx, fc_input->trt_tensor_, expand_dim); + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_FullConnection, FullyConnectedTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h new file mode 100644 index 00000000000..f98c543a565 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h @@ -0,0 +1,45 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_ + +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class FullyConnectedTensorRT : public TensorRTOp { + public: + FullyConnectedTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~FullyConnectedTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int PreprocessInputs(TensorRTContext *ctx, ITensorHelper *fc_input); + + schema::ActivationType activation_{schema::ActivationType::ActivationType_NO_ACTIVATION}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.cc new file mode 100644 index 00000000000..7c9b5938b22 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.cc @@ -0,0 +1,139 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h" +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(GatherDPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int GatherDTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported gatherd input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "invalid gatherd input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid gatherd output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int GatherDTensorRT::AddInnerOp(TensorRTContext *ctx) { + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[2].trt_tensor_}; + auto dim_tensor = static_cast(in_tensors_[1].Data().get()); + if (dim_tensor == nullptr) { + MS_LOG(ERROR) << op_name_ << " gatherd dim_tensor is null!"; + return RET_ERROR; + } + size_t dim = static_cast(dim_tensor[0]); + + auto plugin = std::make_shared(op_name_, dim, device_id_); + nvinfer1::IPluginV2Layer *gatherd_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin); + if (gatherd_layer == nullptr) { + MS_LOG(ERROR) << "create gatherd failed for: " << op_name_; + return RET_ERROR; + } + nvinfer1::ITensor *gatherd_out = gatherd_layer->getOutput(0); + gatherd_layer->setName(op_name_.c_str()); + gatherd_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{gatherd_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + this->layer_ = gatherd_layer; + return RET_OK; +} + +int GatherDPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + nvinfer1::Dims input_dims = inputDesc[0].dims; + int dims = input_dims.nbDims; + if (axis_ < 0) { + axis_ += dims; + } + + if (inputDesc->type == nvinfer1::DataType::kINT32) { + auto input = static_cast(inputs[0]); + auto index = static_cast(inputs[1]); + auto output = static_cast(outputs[0]); + Reshape(inputDesc, outputDesc); + Gather(input, index, output, dim_before_axis_, dim_at_axis_input_, dim_at_axis_output_, dim_after_axis_, + stream, device_id_); + } else if (inputDesc->type == nvinfer1::DataType::kFLOAT) { + auto input = static_cast(inputs[0]); + auto index = static_cast(inputs[1]); + auto output = static_cast(outputs[0]); + Reshape(inputDesc, outputDesc); + Gather(input, index, output, dim_before_axis_, dim_at_axis_input_, dim_at_axis_output_, dim_after_axis_, + stream, device_id_); + } else { + MS_LOG(ERROR) << "unsupported data type gatherd" << layer_name_; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *GatherDPlugin::clone() const noexcept { + auto *plugin = new GatherDPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs GatherDPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs 
out_dims{}; + out_dims.nbDims = inputs[1].nbDims; + for (int i = 0; i < inputs[1].nbDims; i++) { + out_dims.d[i] = inputs[1].d[i]; + } + return out_dims; +} + +void GatherDPlugin::Reshape(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc) { + nvinfer1::Dims input_dims = inputDesc[0].dims; + nvinfer1::Dims output_dims = outputDesc[0].dims; + size_t dim_before_axis = 1; + for (size_t i = 0; i < IntToSize(axis_); i++) { + dim_before_axis *= output_dims.d[i]; + } + size_t dim_at_axis_input = input_dims.d[IntToSize(axis_)]; + size_t dim_at_axis_output = output_dims.d[IntToSize(axis_)]; + size_t dim_after_axis = 1; + for (size_t i = IntToSize(axis_) + 1; i < output_dims.nbDims; i++) { + dim_after_axis *= output_dims.d[i]; + } + + dim_before_axis_ = dim_before_axis; + dim_at_axis_input_ = dim_at_axis_input; + dim_at_axis_output_ = dim_at_axis_output; + dim_after_axis_ = dim_after_axis; + return; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_GatherD, GatherDTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h new file mode 100644 index 00000000000..714e6c89819 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h @@ -0,0 +1,80 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +constexpr char *GATHER_D_PLUGIN_NAME{"GatherDPluginCreater"}; +class GatherDTensorRT : public TensorRTOp { + public: + GatherDTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~GatherDTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +class GatherDPlugin : public TensorRTPlugin { + public: + GatherDPlugin(const std::string name, size_t dim, uint32_t device_id) + : TensorRTPlugin(name, std::string(GATHER_D_PLUGIN_NAME), device_id), axis_(dim) {} + + GatherDPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(GATHER_D_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + axis_ = static_cast(fields[0].data)[0]; + } + + GatherDPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(GATHER_D_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &axis_, sizeof(int)); + } + + GatherDPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + + private: + int axis_; + size_t dim_before_axis_; + size_t dim_at_axis_input_; + size_t dim_at_axis_output_; + size_t dim_after_axis_; + void Reshape(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc); +}; + +class GatherDPluginCreater : public TensorRTPluginCreater { + public: + GatherDPluginCreater() : TensorRTPluginCreater(std::string(GATHER_D_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.cc new file mode 100644 index 00000000000..38bc8bf1861 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.cc @@ -0,0 +1,108 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/gather_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +constexpr int AXIS_INDEX = 2; + +int GatherTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors[1].DataType() != DataType::kNumberTypeInt32) { + MS_LOG(ERROR) << "Gather indices only support Int32"; + return RET_ERROR; + } + if (in_tensors[AXIS_INDEX].ElementNum() == 1) { + MS_ASSERT(in_tensors[AXIS_INDEX].Data().get()); + axis_ = static_cast(in_tensors[AXIS_INDEX].Data().get())[0]; + } else { + MS_LOG(ERROR) << "TensorRT axis is attribute."; + return RET_ERROR; + } + return RET_OK; +} + +int GatherTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + if (tensorrt_in_tensors_.size() < INPUT_SIZE2 && in_tensors_.size() >= INPUT_SIZE2) { + int const_ms_tensor_index = in_tensors_[0].IsConst() ? 0 : 1; + auto const_input = ConvertConstantTensor(ctx, in_tensors_[const_ms_tensor_index], op_name_); + if (const_input == nullptr) { + MS_LOG(ERROR) << "add const input tensor failed for " << op_name_; + return RET_ERROR; + } + tensorrt_in_tensors_.push_back(ITensorHelper{const_input}); + } + + int indices_tensor_index = tensorrt_in_tensors_[0].trt_tensor_->getType() == nvinfer1::DataType::kINT32 ? 
0 : 1; + ITensorHelper gather_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - indices_tensor_index], &gather_input); + if (ret != RET_OK || gather_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim gather failed for " << op_name_; + return RET_ERROR; + } + ITensorHelper indices_tensor; + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[indices_tensor_index], &indices_tensor); + if (ret != RET_OK || indices_tensor.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim indices failed for " << op_name_; + return RET_ERROR; + } + + nvinfer1::IGatherLayer *gather_layer = + ctx->network()->addGather(*gather_input.trt_tensor_, *indices_tensor.trt_tensor_, axis_); + if (gather_layer == nullptr) { + MS_LOG(ERROR) << "addGather failed for TensorRT."; + return RET_ERROR; + } + + this->layer_ = gather_layer; + gather_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *op_output = gather_layer->getOutput(0); + // keep shape + if (in_tensors_[1].Shape().empty()) { + auto squeeze = ctx->network()->addShuffle(*op_output); + if (squeeze == nullptr) { + MS_LOG(ERROR) << "add output squeeze failed for " << op_name_; + return RET_ERROR; + } + squeeze->setName((op_name_ + "_squeeze_out").c_str()); + auto old_shape = ConvertMSShape(op_output->getDimensions()); + old_shape.erase(old_shape.begin() + axis_); + squeeze->setReshapeDimensions(ConvertCudaDims(old_shape)); + op_output = squeeze->getOutput(0); + } + op_output->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{op_output, gather_input.format_, gather_input.same_format_}); + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Gather, GatherTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.h new file mode 100644 index 00000000000..1bd44af8a1f --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.h @@ -0,0 +1,42 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
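The squeeze branch at the end of GatherTensorRT::AddInnerOp above only fires when the MindSpore indices tensor is a scalar: because the indices are materialized as a one-element TensorRT tensor, the gather output keeps a length-1 dimension at the gather axis that MindSpore expects to be removed. A shape-level sketch of that bookkeeping (sizes are illustrative):

#include <cstdint>
#include <vector>

// e.g. data shape (4, 5, 6), scalar index, axis = 1:
// TensorRT gather output is (4, 1, 6); the squeeze shuffle restores (4, 6).
std::vector<int64_t> SqueezeGatherAxis(std::vector<int64_t> gather_out_shape, int axis) {
  gather_out_shape.erase(gather_out_shape.begin() + axis);
  return gather_out_shape;
}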
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class GatherTensorRT : public TensorRTOp { + public: + GatherTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~GatherTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int axis_{0}; + mindspore::MSTensor indices_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.cc new file mode 100644 index 00000000000..8546a5143f7 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.cc @@ -0,0 +1,119 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <cuda_runtime.h>
+#include <numeric>
+#include <memory>
+#include <string>
+#include <vector>
+#include <functional>
+#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
+#include "NvInferRuntimeCommon.h"
+#include "src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h"
+#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh"
+
+namespace mindspore::lite {
+int LogicalNotTensorRT::IsSupport(const schema::Primitive *primitive,
+                                  const std::vector<mindspore::MSTensor> &in_tensors,
+                                  const std::vector<mindspore::MSTensor> &out_tensors) {
+  if (!IsShapeKnown()) {
+    MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
+    return RET_ERROR;
+  }
+  if (in_tensors.size() != 1) {
+    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
+    return RET_ERROR;
+  }
+  if (out_tensors.size() != 1) {
+    MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int LogicalNotTensorRT::AddInnerOp(TensorRTContext *ctx) {
+  if (ctx == nullptr || ctx->network() == nullptr || this->tensorrt_in_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "network or input tensor is invalid";
+    return RET_ERROR;
+  }
+  if (tensorrt_in_tensors_[0].trt_tensor_->getType() != nvinfer1::DataType::kINT32) {
+    auto cast_layer = ctx->network()->addIdentity(*tensorrt_in_tensors_[0].trt_tensor_);
+    if (cast_layer == nullptr) {
+      MS_LOG(ERROR) << "create cast layer failed for: " << op_name_;
+      return RET_ERROR;
+    }
+    cast_layer->setOutputType(0, nvinfer1::DataType::kINT32);
+    tensorrt_in_tensors_[0].trt_tensor_ = cast_layer->getOutput(0);
+  }
+  auto plugin = std::make_shared<LogicalNotPlugin>(op_name_, op_primitive_->value_type());
+  if (plugin == nullptr) {
+    MS_LOG(ERROR) << "create LogicalNotPlugin failed for " << op_name_;
+    return RET_ERROR;
+  }
+  nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_};
+  nvinfer1::IPluginV2Layer *logical_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin);
+  this->layer_ = logical_layer;
+  nvinfer1::ITensor *op_out_tensor = logical_layer->getOutput(0);
+  if (op_out_tensor == nullptr) {
+    MS_LOG(ERROR) << "logical not plugin out tensor is nullptr.";
+    return RET_ERROR;
+  }
+  op_out_tensor->setName((op_name_ + "_output").c_str());
+  this->AddInnerOutTensors(
+    ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
+  return RET_OK;
+}
+
+REGISTER_TENSORRT_PLUGIN(LogicalNotPluginCreater);
+template class TensorRTPluginCreater<LogicalNotPlugin>;
+template <class T>
+nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
+template <class T>
+std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
+
+int LogicalNotPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
+                              const void *const *inputs, void *const *outputs, void *workspace,
+                              cudaStream_t stream) noexcept {
+  return RunCudaLogical(inputDesc, inputs, outputs, stream);
+}
+
+int LogicalNotPlugin::RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs,
+                                     void *const *outputs, cudaStream_t stream) {
+  switch (primitive_type_) {
+    case (schema::PrimitiveType_LogicalNot): {
+      LogicalNot(static_cast<const int32_t *>(inputs[0]), static_cast<int32_t *>(outputs[0]),
+                 GetDimsVolume(inputDesc[0].dims), stream);
+      break;
+    }
+    default: {
+      MS_LOG(ERROR) << "invalid logical type: " << static_cast<int>(primitive_type_);
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
+nvinfer1::IPluginV2DynamicExt *LogicalNotPlugin::clone() const noexcept {
+  auto *plugin = new LogicalNotPlugin(*this);
+  plugin->setPluginNamespace(name_space_.c_str());
+  return plugin;
+}
+
+size_t
LogicalNotPlugin::getSerializationSize() const noexcept { return sizeof(schema::PrimitiveType); } + +void LogicalNotPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &primitive_type_, sizeof(schema::PrimitiveType)); +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalNot, LogicalNotTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h new file mode 100644 index 00000000000..09c2582bf22 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h @@ -0,0 +1,78 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class LogicalNotTensorRT : public TensorRTOp { + public: + LogicalNotTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~LogicalNotTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +constexpr char *LOGICAL_NOT_PLUGIN_NAME{"LogicalNotPlugin"}; +class LogicalNotPlugin : public TensorRTPlugin { + public: + LogicalNotPlugin(const std::string name, schema::PrimitiveType primitive_type) + : TensorRTPlugin(name, std::string(LOGICAL_NOT_PLUGIN_NAME)), primitive_type_(primitive_type) {} + + LogicalNotPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(LOGICAL_NOT_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + primitive_type_ = static_cast(fields[0].data)[0]; + } + + LogicalNotPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(LOGICAL_NOT_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &primitive_type_, sizeof(schema::PrimitiveType)); + } + + LogicalNotPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + int RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, 
void *const *outputs, cudaStream_t stream);
+  const std::string layer_name_;
+  std::string name_space_;
+  schema::PrimitiveType primitive_type_;
+};
+class LogicalNotPluginCreater : public TensorRTPluginCreater<LogicalNotPlugin> {
+ public:
+  LogicalNotPluginCreater() : TensorRTPluginCreater(std::string(LOGICAL_NOT_PLUGIN_NAME)) {}
+};
+} // namespace mindspore::lite
+#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_
diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.cc
new file mode 100644
index 00000000000..653c9431df9
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.cc
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_runtime.h>
+#include <numeric>
+#include <memory>
+#include <string>
+#include <vector>
+#include <functional>
+#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
+#include "NvInferRuntimeCommon.h"
+#include "src/runtime/delegate/tensorrt/op/logical_tensorrt.h"
+#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh"
+
+namespace mindspore::lite {
+int LogicalTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
+                               const std::vector<mindspore::MSTensor> &out_tensors) {
+  if (!IsShapeKnown()) {
+    MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
+    return RET_ERROR;
+  }
+  if (in_tensors.size() != INPUT_SIZE2) {
+    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
+    return RET_ERROR;
+  }
+  if (out_tensors.size() != 1) {
+    MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int LogicalTensorRT::AddInnerOp(TensorRTContext *ctx) {
+  if (ctx == nullptr || ctx->network() == nullptr) {
+    MS_LOG(ERROR) << "network or input tensor is invalid";
+    return RET_ERROR;
+  }
+  for (size_t i = 0; i != tensorrt_in_tensors_.size(); ++i) {
+    if (tensorrt_in_tensors_[i].trt_tensor_->getType() != nvinfer1::DataType::kINT32) {
+      auto cast_layer = ctx->network()->addIdentity(*tensorrt_in_tensors_[i].trt_tensor_);
+      if (cast_layer == nullptr) {
+        MS_LOG(ERROR) << "create cast layer failed for: " << op_name_;
+        return RET_ERROR;
+      }
+      cast_layer->setOutputType(0, nvinfer1::DataType::kINT32);
+      tensorrt_in_tensors_[i].trt_tensor_ = cast_layer->getOutput(0);
+    }
+  }
+  auto plugin = std::make_shared<LogicalPlugin>(op_name_, op_primitive_->value_type());
+  if (plugin == nullptr) {
+    MS_LOG(ERROR) << "create LogicalPlugin failed for " << op_name_;
+    return RET_ERROR;
+  }
+  nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[1].trt_tensor_};
+  nvinfer1::IPluginV2Layer *logical_layer = ctx->network()->addPluginV2(inputTensors, 2, *plugin);
+  this->layer_ = logical_layer;
+  nvinfer1::ITensor *op_out_tensor = logical_layer->getOutput(0);
+  if (op_out_tensor == nullptr) {
+    MS_LOG(ERROR) << "logical plugin out tensor is nullptr.";
+    return
RET_ERROR; + } + op_out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} + +REGISTER_TENSORRT_PLUGIN(LogicalPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int LogicalPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + return RunCudaLogical(inputDesc, inputs, outputs, stream); +} + +int LogicalPlugin::RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, + void *const *outputs, cudaStream_t stream) { + switch (primitive_type_) { + case (schema::PrimitiveType_LogicalAnd): { + LogicalAnd(static_cast(inputs[0]), static_cast(inputs[1]), + static_cast(outputs[0]), GetDimsVolume(inputDesc[0].dims), stream); + break; + } + case (schema::PrimitiveType_LogicalOr): { + LogicalOr(static_cast(inputs[0]), static_cast(inputs[1]), + static_cast(outputs[0]), GetDimsVolume(inputDesc[0].dims), stream); + break; + } + default: { + MS_LOG(ERROR) << "invalid logical type: " << static_cast(primitive_type_); + return RET_ERROR; + } + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *LogicalPlugin::clone() const noexcept { + auto *plugin = new LogicalPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +size_t LogicalPlugin::getSerializationSize() const noexcept { return sizeof(schema::PrimitiveType); } + +void LogicalPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &primitive_type_, sizeof(schema::PrimitiveType)); +} + +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalOr, LogicalTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalAnd, LogicalTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.h new file mode 100644 index 00000000000..9ec52e43bc0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.h @@ -0,0 +1,78 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
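Since both operands are cast to int32 before the plugin runs, the LogicalAnd/LogicalOr device kernels in cuda_impl/logical.cu only need elementwise integer logic. A host-side reference of the expected semantics (a sketch, not the actual CUDA implementation):

#include <cstddef>
#include <cstdint>

// Elementwise reference for the semantics the LogicalAnd / LogicalOr kernels are expected to follow.
void LogicalAndReference(const int32_t *a, const int32_t *b, int32_t *out, size_t element_cnt) {
  for (size_t i = 0; i < element_cnt; ++i) {
    out[i] = (a[i] != 0 && b[i] != 0) ? 1 : 0;
  }
}

void LogicalOrReference(const int32_t *a, const int32_t *b, int32_t *out, size_t element_cnt) {
  for (size_t i = 0; i < element_cnt; ++i) {
    out[i] = (a[i] != 0 || b[i] != 0) ? 1 : 0;
  }
}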
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" + +namespace mindspore::lite { +class LogicalTensorRT : public TensorRTOp { + public: + LogicalTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~LogicalTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +constexpr char *LOGICAL_PLUGIN_NAME{"LogicalPlugin"}; +class LogicalPlugin : public TensorRTPlugin { + public: + LogicalPlugin(const std::string name, schema::PrimitiveType primitive_type) + : TensorRTPlugin(name, std::string(LOGICAL_PLUGIN_NAME)), primitive_type_(primitive_type) {} + + LogicalPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(LOGICAL_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + primitive_type_ = static_cast(fields[0].data)[0]; + } + + LogicalPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(LOGICAL_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &primitive_type_, sizeof(schema::PrimitiveType)); + } + + LogicalPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + int RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs, + cudaStream_t stream); + const std::string layer_name_; + std::string name_space_; + schema::PrimitiveType primitive_type_; +}; +class LogicalPluginCreater : public TensorRTPluginCreater { + public: + LogicalPluginCreater() : TensorRTPluginCreater(std::string(LOGICAL_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.cc new file mode 100644 index 00000000000..3f0c80dc764 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.cc @@ -0,0 +1,493 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
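LogicalPlugin above follows the same creator pattern as the other plugins in this patch: REGISTER_TENSORRT_PLUGIN exposes LogicalPluginCreater to TensorRT, which then calls back into it along two paths. A minimal sketch of those call paths using only stock nvinfer1::IPluginCreator entry points (the wrapper functions are illustrative, not part of this patch):

#include <cstddef>
#include "NvInferRuntimeCommon.h"

// Build path: the PluginFieldCollection carries the schema::PrimitiveType field,
// which the (name, fc) constructor of LogicalPlugin reads.
nvinfer1::IPluginV2 *CreateLogicalFromFields(nvinfer1::IPluginCreator *creator, const char *name,
                                             const nvinfer1::PluginFieldCollection *fc) {
  return creator->createPlugin(name, fc);
}

// Engine-deserialization path: the blob written by LogicalPlugin::serialize()
// is handed back to the (name, serialData, serialLength) constructor.
nvinfer1::IPluginV2 *CreateLogicalFromBlob(nvinfer1::IPluginCreator *creator, const char *name, const void *data,
                                           size_t length) {
  return creator->deserializePlugin(name, data, length);
}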
+ */ + +#include "src/runtime/delegate/tensorrt/op/lstm_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" + +namespace mindspore::lite { +int LSTMTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { +#if TRT_VERSION_GE(7, 0) + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() < INPUT_TENSOR_SIZE) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != OUTPUT_TENSOR_SIZE) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT]; + hidden_init_name_ = hidden_in_init.Name() + "_hidden_init"; + mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT]; + cell_init_name_ = cell_in_init.Name() + "_cell_init"; + + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +#else + MS_LOG(WARNING) << "low TensorRT version don't support LSTM op, please upgrade TensorRT version to 7 or higher"; + return RET_ERROR; +#endif +} + +int LSTMTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + int input_data_dims_cnt = tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims; + if (input_data_dims_cnt != DIMENSION_3D) { + MS_LOG(ERROR) << "invalid input data shape dims for " << op_name_; + return RET_ERROR; + } + network_ = ctx->network(); + int ret = PreProcess(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "PreProcess for " << op_name_; + return ret; + } + + ret = AddLSTMLayers(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "AddLSTMLayers for " << op_name_; + return RET_ERROR; + } + + if (op_data_out_ == nullptr) { + MS_LOG(ERROR) << "layers final output tensor is invalid for " << op_name_; + return RET_ERROR; + } + op_data_out_->setName((op_name_ + "_output").c_str()); + MS_LOG(DEBUG) << "lstm op_data_out_ " << GetTensorFormat(op_data_out_); + MS_LOG(DEBUG) << "lstm op_hidden_out_ " << GetTensorFormat(op_hidden_out_); + MS_LOG(DEBUG) << "lstm op_cell_out_ " << GetTensorFormat(op_cell_out_); + this->AddInnerOutTensors(ITensorHelper{op_data_out_}); + this->AddInnerOutTensors(ITensorHelper{op_hidden_out_}); + this->AddInnerOutTensors(ITensorHelper{op_cell_out_}); + return RET_OK; +} + +int LSTMTensorRT::PreProcess() { + auto ms_input_shape = in_tensors_[0].Shape(); + params_.sequence_size_ = ms_input_shape[0]; + params_.batch_size_ = ms_input_shape[1]; + params_.input_data_size_ = ms_input_shape[INPUT_SIZE_INDEX]; + if (params_.batch_size_ != 1) { + MS_LOG(WARNING) << op_name_ << " lstm has batchsize " << params_.batch_size_ << ", needs further verify"; + } + // ms: 0 sequence size, 1 batch size, 2 input size -> tensorrt: 0 batch size, 1 sequence size, 2 input size + auto transpose_in_layer = network_->addShuffle(*tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_in_layer == nullptr) { + MS_LOG(ERROR) << "create transpose_in_layer failed for " << op_name_; + return RET_ERROR; + } + nvinfer1::Permutation transpose_perm{{1, 0, INPUT_SIZE_INDEX}}; + transpose_in_layer->setFirstTranspose(transpose_perm); + transpose_in_layer->setName((op_name_ + "transpose_in").c_str()); + input_data_ = transpose_in_layer->getOutput(0); + 
MS_LOG(DEBUG) << "lstm input " << GetTensorFormat(input_data_); + + auto lstm_op = op_primitive_->value_as_LSTM(); + params_.layer_count_ = lstm_op->num_layers() == 0 ? 1 : lstm_op->num_layers(); + params_.hidden_size_ = lstm_op->hidden_size(); + params_.directional_cnt_ = lstm_op->bidirectional() ? BIDIRECTIONAL : 1; + params_.data_type_ = ConvertDataType(in_tensors_[1].DataType()); + return RET_OK; +} + +int LSTMTensorRT::AddLSTMLayers() { + mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT]; + mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT]; + + nvinfer1::ITensor *data_out{nullptr}; + nvinfer1::ITensor *hidden_init = network_->addInput( + hidden_init_name_.c_str(), nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(params_.layer_count_ * params_.directional_cnt_, params_.batch_size_, params_.hidden_size_)); + if (hidden_init == nullptr) { + MS_LOG(ERROR) << "add hidden_init input tensor failed for " << op_name_; + return RET_ERROR; + } + op_binding_tensor_.push_back(BindingHelper{hidden_init_name_, hidden_in_init.MutableData(), + nvinfer1::DataType::kFLOAT, hidden_in_init.DataSize()}); + nvinfer1::ITensor *cell_init = network_->addInput( + cell_init_name_.c_str(), nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(params_.layer_count_ * params_.directional_cnt_, params_.batch_size_, params_.hidden_size_)); + if (cell_init == nullptr) { + MS_LOG(ERROR) << "add cell_init input tensor failed for " << op_name_; + return RET_ERROR; + } + op_binding_tensor_.push_back( + BindingHelper{cell_init_name_, cell_in_init.MutableData(), nvinfer1::DataType::kFLOAT, cell_in_init.DataSize()}); + + sequence_size_input_ = + network_->addInput((op_name_ + "_seq_input").c_str(), nvinfer1::DataType::kINT32, nvinfer1::Dims{}); + if (sequence_size_input_ == nullptr) { + MS_LOG(ERROR) << "add sequence_size_input_ input tensor failed for " << op_name_; + return RET_ERROR; + } + op_binding_tensor_.push_back( + BindingHelper{(op_name_ + "_seq_input"), ¶ms_.sequence_size_, nvinfer1::DataType::kINT32, sizeof(int)}); + + nvinfer1::ITensor *max_sequence_size = + network_->addConstant(nvinfer1::Dims{}, nvinfer1::Weights{nvinfer1::DataType::kINT32, ¶ms_.sequence_size_, 1}) + ->getOutput(0); + if (max_sequence_size == nullptr) { + MS_LOG(ERROR) << "add max_sequence_size constant tensor failed for " << op_name_; + return RET_ERROR; + } + LstmState next_state{input_data_, nullptr, nullptr}; // init states + std::vector hidden_outputs; + std::vector cell_outputs; + int input_weight_offset = 0; + int state_weight_offset = 0; + int bias_offset = 0; + + if (params_.layer_count_ != 1) { + MS_LOG(WARNING) << op_name_ << " needs verify for layer cnt: " << params_.layer_count_; + } + for (int i = 0; i < params_.layer_count_; i++) { + LstmState layer_input_states[BIDIRECTIONAL]; + LstmWeights layer_weights[BIDIRECTIONAL]; + layer_weights[0].max_seq_size_ = max_sequence_size; + int ret = ParseLSTMCellInputs(i, hidden_init, cell_init, layer_input_states, &input_weight_offset, + &state_weight_offset, &bias_offset, layer_weights, next_state); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ParseLSTMCellInputs failed for " << op_name_; + return RET_ERROR; + } + data_out = AddLSTMCell(layer_input_states, layer_weights, &next_state); + hidden_outputs.push_back(next_state.hidden_); + cell_outputs.push_back(next_state.cell_); + if (data_out == nullptr || next_state.hidden_ == nullptr || next_state.cell_ == nullptr) { + MS_LOG(ERROR) << "AddLSTMCell failed for " << op_name_; + return RET_ERROR; + } + } + + 
op_hidden_out_ = ConcateAll(hidden_outputs); + if (op_hidden_out_ == nullptr) { + MS_LOG(ERROR) << "concat hidden output failed for " << op_name_; + return RET_ERROR; + } + op_hidden_out_->setName(out_tensors_[OUTPUT_HIDDEN_INDEX].Name().c_str()); + op_cell_out_ = ConcateAll(cell_outputs); + if (op_cell_out_ == nullptr) { + MS_LOG(ERROR) << "concat cell output failed for " << op_name_; + return RET_ERROR; + } + op_cell_out_->setName(out_tensors_[OUTPUT_CELL_INDEX].Name().c_str()); + op_data_out_ = data_out; + return RET_OK; +} + +int LSTMTensorRT::ParseLSTMCellInputs(int layer_index, nvinfer1::ITensor *hidden_init, nvinfer1::ITensor *cell_init, + LstmState *layer_input_states, int *input_weight_offset, int *state_weight_offset, + int *bias_offset, LstmWeights *layer_weights, const LstmState &next_state) { + nvinfer1::Dims2 dim_input_weight(LSTM_GATE_NUM * params_.hidden_size_, params_.input_data_size_); + nvinfer1::Dims2 dim_state_weight(LSTM_GATE_NUM * params_.hidden_size_, params_.hidden_size_); + nvinfer1::Dims dim_bias{1, {LSTM_GATE_NUM * params_.hidden_size_}}; + + mindspore::MSTensor &input_weight = in_tensors_[INPUT_WEIGHT]; + mindspore::MSTensor &state_weight = in_tensors_[STATE_WEIGHT]; + mindspore::MSTensor &bias = in_tensors_[BIAS]; + + nvinfer1::Dims dimW = layer_index == 0 ? dim_input_weight : dim_state_weight; + + for (int direction_index = 0; direction_index < params_.directional_cnt_; direction_index++) { + nvinfer1::ITensor *index = + network_ + ->addConstant(nvinfer1::Dims{}, + nvinfer1::Weights{nvinfer1::DataType::kINT32, + &INDICES[layer_index * params_.directional_cnt_ + direction_index], 1}) + ->getOutput(0); + MS_ASSERT(index); + layer_input_states[direction_index].data_ = next_state.data_; + layer_input_states[direction_index].hidden_ = network_->addGather(*hidden_init, *index, 0)->getOutput(0); + layer_input_states[direction_index].cell_ = network_->addGather(*cell_init, *index, 0)->getOutput(0); + MS_ASSERT(layer_input_states[direction_index].hidden_); + MS_ASSERT(layer_input_states[direction_index].cell_); + + // weight order: input, output, forget, cell + if (params_.data_type_ != nvinfer1::DataType::kFLOAT) { + MS_LOG(WARNING) << "more data type need to be done"; + return RET_ERROR; + } + const float *input_weight_ptr = static_cast(input_weight.Data().get()); + const float *state_weight_ptr = static_cast(state_weight.Data().get()); + const float *bias_ptr = static_cast(bias.Data().get()); + nvinfer1::Weights slice_input_weight{params_.data_type_, input_weight_ptr + *input_weight_offset, + GetDimsVolume(dimW)}; + (*input_weight_offset) += slice_input_weight.count; + nvinfer1::Weights slice_state_weight{params_.data_type_, state_weight_ptr + *state_weight_offset, + GetDimsVolume(dim_state_weight)}; + (*state_weight_offset) += slice_state_weight.count; + layer_weights[direction_index].input_weights_ = network_->addConstant(dimW, slice_input_weight)->getOutput(0); + layer_weights[direction_index].state_weights_ = + network_->addConstant(dim_state_weight, slice_state_weight)->getOutput(0); + MS_ASSERT(layer_weights[direction_index].input_weights_); + MS_ASSERT(layer_weights[direction_index].state_weights_); + + // bias + nvinfer1::Weights slice_input_bias{params_.data_type_, bias_ptr + *bias_offset, GetDimsVolume(dim_bias)}; + (*bias_offset) += slice_input_bias.count; + nvinfer1::Weights slice_state_bias{params_.data_type_, bias_ptr + *bias_offset, GetDimsVolume(dim_bias)}; + (*bias_offset) += slice_state_bias.count; + layer_weights[direction_index].input_bias_ = 
network_->addConstant(dim_bias, slice_input_bias)->getOutput(0); + layer_weights[direction_index].state_bias_ = network_->addConstant(dim_bias, slice_state_bias)->getOutput(0); + MS_ASSERT(layer_weights[direction_index].input_bias_); + MS_ASSERT(layer_weights[direction_index].state_bias_); + } + if (params_.directional_cnt_ == BIDIRECTIONAL) { + layer_weights[1].max_seq_size_ = layer_weights[0].max_seq_size_; + } + return RET_OK; +} + +nvinfer1::ITensor *LSTMTensorRT::Reshape(nvinfer1::ITensor *tensor, nvinfer1::Dims dims) { + nvinfer1::IShuffleLayer *shuffle = network_->addShuffle(*tensor); + shuffle->setReshapeDimensions(dims); + return shuffle->getOutput(0); +} + +nvinfer1::ITensor *LSTMTensorRT::ConcateAll(std::vector all_tensor, int axis) { + if (all_tensor.size() == 1) { + return all_tensor[0]; + } + nvinfer1::IConcatenationLayer *concat = network_->addConcatenation(all_tensor.data(), all_tensor.size()); + if (concat == nullptr) { + MS_LOG(ERROR) << "addConcatenation failed for " << op_name_; + return nullptr; + } + if (axis >= all_tensor[0]->getDimensions().nbDims) { + MS_LOG(ERROR) << op_name_ << " concat axis is " << axis << ", larger than tensor dims " + << all_tensor[0]->getDimensions().nbDims; + return nullptr; + } + concat->setAxis(axis); + return concat->getOutput(0); +} + +nvinfer1::ITensor *LSTMTensorRT::AddLSTMCell(const LstmState *layer_input_states, const LstmWeights *layer_weights, + LstmState *next_state) { + nvinfer1::ITensor *backward_output = nullptr; + nvinfer1::ITensor *backward_hidden_out = nullptr; + nvinfer1::ITensor *backward_cell_out = nullptr; + nvinfer1::ITensor *forward_hidden_out = nullptr; + nvinfer1::ITensor *forward_cell_out = nullptr; + + nvinfer1::ITensor *forward_output = + AddLSTMCalculation(layer_input_states[0], layer_weights[0], &forward_hidden_out, &forward_cell_out); + if (params_.directional_cnt_ == BIDIRECTIONAL) { + backward_output = + AddLSTMCalculation(layer_input_states[1], layer_weights[1], &backward_hidden_out, &backward_cell_out, true); + } + + // concate forward and backward + nvinfer1::ITensor *output_tensor = forward_output; + nvinfer1::ITensor *cell_out = forward_cell_out; + nvinfer1::ITensor *hidden_out = forward_hidden_out; + if (backward_output != nullptr && backward_hidden_out != nullptr && backward_cell_out != nullptr) { + nvinfer1::ITensor *output_concat_input[BIDIRECTIONAL] = {forward_output, backward_output}; + auto ouput_out_layer = network_->addConcatenation(output_concat_input, BIDIRECTIONAL); + this->layer_ = ouput_out_layer; + if (ouput_out_layer == nullptr) { + MS_LOG(ERROR) << "create one loop output concat failed for " << op_name_; + return nullptr; + } + ouput_out_layer->setAxis(1); // ms: 0 sequence size, 1 layer * direction, 2 batchsize, 3 hidden + output_tensor = ouput_out_layer->getOutput(0); + + nvinfer1::ITensor *hidden_concat_input[BIDIRECTIONAL] = {forward_hidden_out, backward_hidden_out}; + auto hidden_out_layer = network_->addConcatenation(hidden_concat_input, BIDIRECTIONAL); + hidden_out_layer->setAxis(0); + hidden_out = hidden_out_layer->getOutput(0); + + nvinfer1::ITensor *cell_concat_input[BIDIRECTIONAL] = {forward_cell_out, backward_cell_out}; + auto cell_out_layer = network_->addConcatenation(cell_concat_input, BIDIRECTIONAL); + cell_out_layer->setAxis(0); + cell_out = cell_out_layer->getOutput(0); + } + if (hidden_out == nullptr || cell_out == nullptr) { + MS_LOG(ERROR) << "get one loop hidden_out and cell_out failed for " << op_name_; + return nullptr; + } + *next_state = LstmState{output_tensor, 
hidden_out, cell_out}; + return output_tensor; +} +nvinfer1::ITensor *LSTMTensorRT::AddLSTMCalculation(const LstmState &input_state, const LstmWeights &lstm_weights, + nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out, + bool is_backward) { + std::vector all_batch_outputs; + std::vector all_batch_hidden; + std::vector all_batch_cell; + for (int batch_index = 0; batch_index < params_.batch_size_; batch_index++) { + LstmState one_batch_input_state; + nvinfer1::ITensor *batch_index_tensor = + network_->addConstant(nvinfer1::Dims{}, nvinfer1::Weights{nvinfer1::DataType::kINT32, &INDICES[batch_index], 1}) + ->getOutput(0); + one_batch_input_state.data_ = network_->addGather(*input_state.data_, *batch_index_tensor, 0)->getOutput(0); + one_batch_input_state.hidden_ = network_->addGather(*input_state.hidden_, *batch_index_tensor, 0)->getOutput(0); + one_batch_input_state.cell_ = network_->addGather(*input_state.cell_, *batch_index_tensor, 0)->getOutput(0); + nvinfer1::ITensor *one_batch_hidden = nullptr; + nvinfer1::ITensor *one_batch_cell = nullptr; + nvinfer1::ITensor *one_batch_output = + AddLSTMOneLoop(one_batch_input_state, lstm_weights, &one_batch_hidden, &one_batch_cell, is_backward); + if (one_batch_output == nullptr || one_batch_cell == nullptr || one_batch_hidden == nullptr) { + MS_LOG(ERROR) << "AddLSTMOneLoop failed for " << op_name_ << " at batch index " << batch_index; + return nullptr; + } + all_batch_outputs.push_back(one_batch_output); + all_batch_hidden.push_back(one_batch_hidden); + all_batch_cell.push_back(one_batch_cell); + } + *hidden_out = ConcateAll(all_batch_hidden, 1); + *cell_out = ConcateAll(all_batch_cell, 1); + return ConcateAll(all_batch_outputs, BATCH_SIZE_INDEX); +} + +nvinfer1::ITensor *LSTMTensorRT::AddLSTMOneLoop(const LstmState &input_state, const LstmWeights &lstm_weights, + nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out, + bool is_backward) { +#if TRT_VERSION_GE(7, 0) + nvinfer1::ILoop *sequence_loop = network_->addLoop(); + if (sequence_loop == nullptr) { + MS_LOG(ERROR) << "add sequence_loop layer failed for " << op_name_; + return nullptr; + } + std::string loop_name = op_name_ + "_loop" + (is_backward ? 
"_backward" : "_forward"); + sequence_loop->setName(loop_name.c_str()); + sequence_loop->addTripLimit(*sequence_size_input_, nvinfer1::TripLimit::kCOUNT); + nvinfer1::ITensor *input = sequence_loop->addIterator(*input_state.data_, 0, is_backward)->getOutput(0); + + nvinfer1::ILayer *hidden_mid = sequence_loop->addRecurrence(*input_state.hidden_); + if (hidden_mid == nullptr) { + MS_LOG(ERROR) << "add hidden layer failed for " << op_name_; + return nullptr; + } + nvinfer1::ILayer *cell_mid = sequence_loop->addRecurrence(*input_state.cell_); + if (cell_mid == nullptr) { + MS_LOG(ERROR) << "add cell layer failed for " << op_name_; + return nullptr; + } + + nvinfer1::ITensor *input_matmul = + network_ + ->addMatrixMultiply(*input, nvinfer1::MatrixOperation::kVECTOR, *lstm_weights.input_weights_, + nvinfer1::MatrixOperation::kTRANSPOSE) + ->getOutput(0); + + nvinfer1::ITensor *hidden_matmul = + network_ + ->addMatrixMultiply(*hidden_mid->getOutput(0), nvinfer1::MatrixOperation::kVECTOR, *lstm_weights.state_weights_, + nvinfer1::MatrixOperation::kTRANSPOSE) + ->getOutput(0); + + nvinfer1::ITensor *weights_add = + network_->addElementWise(*input_matmul, *hidden_matmul, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + + nvinfer1::ITensor *bias = + network_->addElementWise(*lstm_weights.input_bias_, *lstm_weights.state_bias_, nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + + nvinfer1::ITensor *gates_calculate = + network_->addElementWise(*weights_add, *bias, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + + const auto isolateGate = [&](nvinfer1::ITensor &gates, int gateIndex) -> nvinfer1::ITensor * { + nvinfer1::ISliceLayer *slice = + network_->addSlice(gates, nvinfer1::Dims{1, {gateIndex * params_.hidden_size_}}, + nvinfer1::Dims{1, {params_.hidden_size_}}, nvinfer1::Dims{1, {1}}); + return Reshape(slice->getOutput(0), nvinfer1::Dims{1, {params_.hidden_size_}}); + }; + // weight order: input, output, forget, cell + nvinfer1::ITensor *i = + network_->addActivation(*isolateGate(*gates_calculate, 0), nvinfer1::ActivationType::kSIGMOID)->getOutput(0); + + nvinfer1::ITensor *o = + network_->addActivation(*isolateGate(*gates_calculate, 1), nvinfer1::ActivationType::kSIGMOID)->getOutput(0); + + nvinfer1::ITensor *f = + network_->addActivation(*isolateGate(*gates_calculate, FORGET_GATE), nvinfer1::ActivationType::kSIGMOID) + ->getOutput(0); + + nvinfer1::ITensor *c = + network_->addActivation(*isolateGate(*gates_calculate, CELL_GATE), nvinfer1::ActivationType::kTANH)->getOutput(0); + + nvinfer1::ITensor *C = + network_ + ->addElementWise( + *network_->addElementWise(*f, *cell_mid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD)->getOutput(0), + *network_->addElementWise(*i, *c, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + nvinfer1::ITensor *H = + network_ + ->addElementWise(*o, *network_->addActivation(*C, nvinfer1::ActivationType::kTANH)->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + + // Recurrent backedge input for hidden and cell. + cell_mid->setInput(1, *C); + hidden_mid->setInput(1, *H); + // outputs + nvinfer1::LoopOutput output_mode = is_backward ? 
nvinfer1::LoopOutput::kREVERSE : nvinfer1::LoopOutput::kCONCATENATE; + nvinfer1::ILoopOutputLayer *output_layer = sequence_loop->addLoopOutput(*H, output_mode); + output_layer->setInput(1, *lstm_weights.max_seq_size_); + *hidden_out = + Reshape(sequence_loop->addLoopOutput(*hidden_mid->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0), + nvinfer1::Dims3(1, 1, params_.hidden_size_)); + *cell_out = + Reshape(sequence_loop->addLoopOutput(*cell_mid->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0), + nvinfer1::Dims3(1, 1, params_.hidden_size_)); + return Reshape(output_layer->getOutput(0), nvinfer1::Dims4(params_.sequence_size_, 1, 1, params_.hidden_size_)); +#else + MS_LOG(ERROR) << "low TensorRT version don't support LSTM op, please upgrade TensorRT version to 7 or higher"; + return nullptr; +#endif +} + +int LSTMTensorRT::Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine) { + if (op_binding_tensor_.size() == 0) { + MS_LOG(DEBUG) << "unsing serialized engine, add input tensor for " << op_name_; + mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT]; + mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT]; + + op_binding_tensor_.push_back(BindingHelper{hidden_init_name_, hidden_in_init.MutableData(), + nvinfer1::DataType::kFLOAT, hidden_in_init.DataSize()}); + op_binding_tensor_.push_back( + BindingHelper{cell_init_name_, cell_in_init.MutableData(), nvinfer1::DataType::kFLOAT, cell_in_init.DataSize()}); + params_.sequence_size_ = in_tensors_[0].Shape()[0]; + op_binding_tensor_.push_back( + BindingHelper{(op_name_ + "_seq_input"), ¶ms_.sequence_size_, nvinfer1::DataType::kINT32, sizeof(int)}); + } + for (auto tensor : op_binding_tensor_) { + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor.name_, tensor.size_, tensor.data_type_); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "malloc for inputs tensor device memory failed " << tensor.name_; + return RET_ERROR; + } + int index = engine->getBindingIndex(tensor.name_.c_str()); + network_tensor_bindings[index] = device_ptr; + runtime_->GetAllocator()->SyncMemInHostAndDevice(tensor.data_, tensor.name_, tensor.size_, true); + runtime_->GetAllocator()->MarkMemValid(tensor.name_, true); + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LSTM, LSTMTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.h new file mode 100644 index 00000000000..962bf778ff4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.h @@ -0,0 +1,115 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
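The gate wiring that AddLSTMOneLoop builds out of TensorRT slice, activation, and elementwise layers above corresponds to a standard LSTM step with gate order input, output, forget, cell (matching FORGET_GATE == 2 and CELL_GATE == 3). A scalar reference of one step, assuming gates already holds W_x*x + W_h*h plus both bias slices (names and container types are illustrative):

#include <cmath>
#include <vector>

struct LstmStepOut {
  std::vector<float> h;
  std::vector<float> c;
};

// One LSTM time step over pre-activation gates of size 4 * hidden_size.
LstmStepOut LstmStepReference(const std::vector<float> &gates, const std::vector<float> &c_prev, int hidden_size) {
  auto sigmoid = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };
  LstmStepOut out{std::vector<float>(hidden_size), std::vector<float>(hidden_size)};
  for (int j = 0; j < hidden_size; ++j) {
    float i = sigmoid(gates[0 * hidden_size + j]);    // input gate
    float o = sigmoid(gates[1 * hidden_size + j]);    // output gate
    float f = sigmoid(gates[2 * hidden_size + j]);    // forget gate (FORGET_GATE)
    float c = std::tanh(gates[3 * hidden_size + j]);  // cell candidate (CELL_GATE)
    out.c[j] = f * c_prev[j] + i * c;                 // C = f * C_prev + i * c
    out.h[j] = o * std::tanh(out.c[j]);               // H = o * tanh(C)
  }
  return out;
}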
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +constexpr int INPUT_TENSOR_SIZE = 6; +constexpr int OUTPUT_TENSOR_SIZE = 3; +constexpr int INPUT_WEIGHT = 1; +constexpr int STATE_WEIGHT = 2; +constexpr int BIAS = 3; +constexpr int HIDDEN_IN_TENSOR_INIT = 4; +constexpr int CELL_IN_TENSOR_INIT = 5; +constexpr int LSTM_GATE_NUM = 4; +constexpr int BIDIRECTIONAL = 2; +constexpr int OUTPUT_HIDDEN_INDEX = 1; +constexpr int OUTPUT_CELL_INDEX = 2; +constexpr int INPUT_SIZE_INDEX = 2; +constexpr int FORGET_GATE = 2; +constexpr int CELL_GATE = 3; +constexpr int BATCH_SIZE_INDEX = 2; +static const std::array INDICES{0, 1, 2, 3}; + +struct LSTMParams { + int sequence_size_; + int input_data_size_; + int batch_size_; + int layer_count_; + int hidden_size_; + nvinfer1::DataType data_type_; + int directional_cnt_; +}; + +struct LstmState { + nvinfer1::ITensor *data_{nullptr}; + nvinfer1::ITensor *hidden_{nullptr}; + nvinfer1::ITensor *cell_{nullptr}; +}; + +struct LstmWeights { + nvinfer1::ITensor *input_weights_{nullptr}; + nvinfer1::ITensor *state_weights_{nullptr}; + nvinfer1::ITensor *input_bias_{nullptr}; + nvinfer1::ITensor *state_bias_{nullptr}; + nvinfer1::ITensor *max_seq_size_{nullptr}; +}; + +class LSTMTensorRT : public TensorRTOp { + public: + LSTMTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~LSTMTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + int Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine) override; + + private: + int PreProcess(); + + int AddLSTMLayers(); + + nvinfer1::ITensor *AddLSTMCell(const LstmState *layer_input_states, const LstmWeights *layer_weights, + LstmState *next_state); + + nvinfer1::ITensor *Reshape(nvinfer1::ITensor *tensor, nvinfer1::Dims dims); + + nvinfer1::ITensor *ConcateAll(std::vector all_tensort, int axis = 0); + + nvinfer1::ITensor *AddLSTMCalculation(const LstmState &input_state, const LstmWeights &lstm_weights, + nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out, + bool is_backward = false); + nvinfer1::ITensor *AddLSTMOneLoop(const LstmState &input_state, const LstmWeights &lstm_weights, + nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out, + bool is_backward = false); + + int ParseLSTMCellInputs(int layer_index, nvinfer1::ITensor *hidden_init, nvinfer1::ITensor *cell_init, + LstmState *input_state, int *input_weight_offset, int *state_weight_offset, int *bias_offset, + LstmWeights *lstm_weights, const LstmState &next_state); + + nvinfer1::INetworkDefinition *network_{nullptr}; + nvinfer1::ITensor *input_data_{nullptr}; + nvinfer1::ITensor *sequence_size_input_{nullptr}; + nvinfer1::ITensor *op_data_out_{nullptr}; + nvinfer1::ITensor *op_hidden_out_{nullptr}; + nvinfer1::ITensor *op_cell_out_{nullptr}; + LSTMParams params_; + std::string hidden_init_name_; + std::string cell_init_name_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_ diff --git 
a/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.cc new file mode 100644 index 00000000000..e5b610eb120 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.cc @@ -0,0 +1,202 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h" +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "NvInferRuntimeCommon.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(MatmulOptPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +// MatmulOptPlugin +int MatmulOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + CHECK_NULL_RETURN(cublas_handle_); + CUBLAS_CHECK(cublasSetStream(cublas_handle_, stream)); + const nvinfer1::PluginTensorDesc desc_a = inputDesc[0]; + const nvinfer1::PluginTensorDesc desc_b = inputDesc[1]; + const nvinfer1::PluginTensorDesc desc_c = outputDesc[0]; + + if (desc_a.dims.nbDims == DIMENSION_2D) { + // a: m * k, b: k * n, c: m * n + int m = desc_c.dims.d[0]; + int n = desc_c.dims.d[1]; + int k = b_trans_ ? desc_b.dims.d[1] : desc_b.dims.d[0]; + const int mm_params[]{m, n, k}; + CublasMM1Batch(inputs[0], inputs[1], outputs[0], mm_params, operations_, data_types_, cublas_handle_); + } else if (desc_a.dims.nbDims == DIMENSION_3D) { + return RunBatchedMatmul(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } else { + MS_LOG(ERROR) << layer_name_ << " input dims needs check a: " << desc_a.dims.nbDims; + return RET_ERROR; + } + return RET_OK; +} + +int MatmulOptPlugin::RunBatchedMatmul(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) { + const nvinfer1::PluginTensorDesc desc_b = inputDesc[1]; + const nvinfer1::PluginTensorDesc desc_c = outputDesc[0]; + int batch = desc_c.dims.d[0]; + int m = desc_c.dims.d[1]; + int n = desc_c.dims.d[DIMENSION_2D]; + int k = b_trans_ ? 
desc_b.dims.d[DIMENSION_2D] : desc_b.dims.d[1]; + const int mm_params[]{m, n, k, batch}; + for (int i = 0; i < batch; i++) { + a_addrs_[i] = inputs[0] + i * m * k * sizeof(float); + b_addrs_[i] = inputs[1] + i * k * n * sizeof(float); + c_addrs_[i] = outputs[0] + i * m * n * sizeof(float); + } + int data_size = batch * sizeof(void *); + int max_batchsize = a_addrs_.size(); + if (a_device_addrs_ == nullptr) { + CUDA_CHECK(cudaMalloc(&a_device_addrs_, sizeof(void *) * max_batchsize)); + } + if (b_device_addrs_ == nullptr) { + CUDA_CHECK(cudaMalloc(&b_device_addrs_, sizeof(void *) * max_batchsize)); + } + if (c_device_addrs_ == nullptr) { + CUDA_CHECK(cudaMalloc(&c_device_addrs_, sizeof(void *) * max_batchsize)); + } + CUDA_CHECK(cudaMemcpy(a_device_addrs_, a_addrs_.data(), data_size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(b_device_addrs_, b_addrs_.data(), data_size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(c_device_addrs_, c_addrs_.data(), data_size, cudaMemcpyHostToDevice)); + + CublasMMBatched(a_device_addrs_, b_device_addrs_, c_device_addrs_, mm_params, operations_, data_types_, + cublas_handle_); + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *MatmulOptPlugin::clone() const noexcept { + auto *plugin = new MatmulOptPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs MatmulOptPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, + int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs out_dims{}; + if (nbInputs != INPUT_SIZE2 && nbInputs != INPUT_SIZE3) { + MS_LOG(ERROR) << "invalid input size " << nbInputs << " of " << layer_name_; + return out_dims; + } + out_dims.nbDims = inputs[0].nbDims; + if (out_dims.nbDims == DIMENSION_2D) { + out_dims.d[0] = a_trans_ ? inputs[0].d[1] : inputs[0].d[0]; + out_dims.d[1] = b_trans_ ? inputs[1].d[0] : inputs[1].d[1]; + return out_dims; + } else if (out_dims.nbDims == DIMENSION_3D) { + out_dims.d[0] = inputs[0].d[0]; + out_dims.d[1] = a_trans_ ? inputs[0].d[DIMENSION_2D] : inputs[0].d[1]; + out_dims.d[DIMENSION_2D] = b_trans_ ? inputs[1].d[1] : inputs[1].d[DIMENSION_2D]; + return out_dims; + } + MS_LOG(ERROR) << "invalid input dims " << out_dims.nbDims << " of " << layer_name_; + return out_dims; +} + +void MatmulOptPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { + operations_[0] = a_trans_ ? CUBLAS_OP_T : CUBLAS_OP_N; + operations_[1] = b_trans_ ? CUBLAS_OP_T : CUBLAS_OP_N; + data_types_[0] = ConvertDataType(in[0].desc.type); // input a + data_types_[1] = ConvertDataType(in[1].desc.type); // input b + data_types_[THIRD_INPUT] = ConvertDataType(out[0].desc.type); // output c + data_types_[FOURTH_INPUT] = + (in[0].desc.type == nvinfer1::DataType::kHALF || in[1].desc.type == nvinfer1::DataType::kHALF) + ? 
CUDA_R_16F + : CUDA_R_32F; // compute type + if (in[0].max.nbDims == DIMENSION_3D) { + int max_batchsize = in[0].max.d[0]; + a_addrs_.resize(max_batchsize); + b_addrs_.resize(max_batchsize); + c_addrs_.resize(max_batchsize); + if (a_device_addrs_ == nullptr) { + CUDA_CHECK_VOID(cudaMalloc(&a_device_addrs_, sizeof(void *) * max_batchsize)); + } + if (b_device_addrs_ == nullptr) { + CUDA_CHECK_VOID(cudaMalloc(&b_device_addrs_, sizeof(void *) * max_batchsize)); + } + if (c_device_addrs_ == nullptr) { + CUDA_CHECK_VOID(cudaMalloc(&c_device_addrs_, sizeof(void *) * max_batchsize)); + } + } +} + +int MatmulOptPlugin::initialize() noexcept { + if (cublas_handle_ == nullptr) { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + } + for (int i = 0; i < DIMENSION_4D; i++) { + if (data_types_[i] != CUDA_R_32F) { + MS_LOG(ERROR) << layer_name_ << " only support fp32"; + return RET_ERROR; + } + } +} + +void MatmulOptPlugin::terminate() noexcept { + if (cublas_handle_ != nullptr) { + auto cublas_ret = cublasDestroy(cublas_handle_); + if (cublas_ret != CUBLAS_STATUS_SUCCESS) { + MS_LOG(ERROR) << "cublasDestroy failed: " << cublas_ret; + } else { + cublas_handle_ = nullptr; + } + } + cudaError_t err; + if (a_device_addrs_ != nullptr) { + err = cudaFree(a_device_addrs_); + if (err != cudaSuccess) { + MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err; + } + a_device_addrs_ = nullptr; + } + if (b_device_addrs_ != nullptr) { + err = cudaFree(b_device_addrs_); + if (err != cudaSuccess) { + MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err; + } + b_device_addrs_ = nullptr; + } + if (c_device_addrs_ != nullptr) { + err = cudaFree(c_device_addrs_); + if (err != cudaSuccess) { + MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err; + } + c_device_addrs_ = nullptr; + } +} + +size_t MatmulOptPlugin::getSerializationSize() const noexcept { return 2 * sizeof(bool); } + +void MatmulOptPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &a_trans_, sizeof(bool)); + SerializeValue(&buffer, &b_trans_, sizeof(bool)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h new file mode 100644 index 00000000000..bc5559f6591 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h @@ -0,0 +1,80 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
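MatmulOptPlugin::getOutputDimensions above applies the usual GEMM shape rule, with the leading dimension treated as the batch in the 3D case and the transpose flags deciding which operand dimension supplies m and n. A small shape helper restating that rule (illustrative only):

#include <array>
#include <cstdint>

// 3D batched case: A(batch, m, k) x B(batch, k, n) -> C(batch, m, n);
// a_trans / b_trans swap which trailing dimension of A / B contributes m / n.
std::array<int64_t, 3> BatchedMatmulOutShape(const std::array<int64_t, 3> &a, const std::array<int64_t, 3> &b,
                                             bool a_trans, bool b_trans) {
  const int64_t batch = a[0];
  const int64_t m = a_trans ? a[2] : a[1];
  const int64_t n = b_trans ? b[1] : b[2];
  return {batch, m, n};  // e.g. (8, 32, 64) x (8, 64, 16) -> (8, 32, 16)
}

The per-batch pointer staging in RunBatchedMatmul (copying a_addrs_/b_addrs_/c_addrs_ to device memory) matches the batched cuBLAS GEMM convention of passing device-resident arrays of per-matrix device pointers.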
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h" + +namespace mindspore::lite { +constexpr char *MATMUL_OPT_PLUGIN_NAME{"MatmulOptPlugin"}; +class MatmulOptPlugin : public TensorRTPlugin { + public: + MatmulOptPlugin(const std::string name, bool a_trans, bool b_trans, uint32_t device_id) + : TensorRTPlugin(name, std::string(MATMUL_OPT_PLUGIN_NAME), device_id), a_trans_(a_trans), b_trans_(b_trans) {} + + MatmulOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(MATMUL_OPT_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + a_trans_ = static_cast(fields[0].data)[0]; + b_trans_ = static_cast(fields[1].data)[0]; + } + + MatmulOptPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(MATMUL_OPT_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &a_trans_, sizeof(bool)); + DeserializeValue(&serialData, &serialLength, &b_trans_, sizeof(bool)); + } + + MatmulOptPlugin() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + int RunBatchedMatmul(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream); + + bool a_trans_{false}; + bool b_trans_{false}; + cublasHandle_t cublas_handle_{nullptr}; + cublasOperation_t operations_[2]{CUBLAS_OP_N, CUBLAS_OP_N}; + cudaDataType data_types_[4]{CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; + std::vector a_addrs_; + std::vector b_addrs_; + std::vector c_addrs_; + void **a_device_addrs_{nullptr}; + void **b_device_addrs_{nullptr}; + void **c_device_addrs_{nullptr}; +}; +class MatmulOptPluginCreater : public TensorRTPluginCreater { + public: + MatmulOptPluginCreater() : TensorRTPluginCreater(std::string(MATMUL_OPT_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.cc new file mode 100644 index 00000000000..b12b8457a02 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.cc @@ -0,0 +1,310 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you 
may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/matmul_tensorrt.h" +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" + +namespace mindspore::lite { +MatMulTensorRT::~MatMulTensorRT() { + if (weight_ptr_ != nullptr) { + free(weight_ptr_); + weight_ptr_ = nullptr; + } +} +int MatMulTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int MatMulTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (type_ == schema::PrimitiveType_MatMulFusion) { + auto primitive = this->GetPrimitive()->value_as_MatMulFusion(); + if (primitive == nullptr) { + MS_LOG(ERROR) << "convert to primitive matmul failed for " << op_name_; + return RET_ERROR; + } + transpose_a_ = primitive->transpose_a(); + transpose_b_ = primitive->transpose_b(); + activation_ = primitive->activation_type(); + } + nvinfer1::ITensor *out_tensor = nullptr; + if (RunOptPlugin()) { + out_tensor = AddAsOptPlugin(ctx); + } else if (RunFullConnect()) { + MS_LOG(DEBUG) << "use fully connected instead of matmul for " << op_name_; + out_tensor = AddAsFullConnect(ctx); + } else { + MS_LOG(DEBUG) << "use origin tensorrt matmul for " << op_name_; + out_tensor = AddAsMatmul(ctx); + } + if (out_tensor == nullptr) { + MS_LOG(ERROR) << "add matmul failed for " << op_name_; + return RET_ERROR; + } + + // add activation + if (activation_ != schema::ActivationType::ActivationType_NO_ACTIVATION) { + nvinfer1::ILayer *activation_layer = + ActivationTensorRT::AddActivation(ctx, activation_, 0, 0, 0, out_tensor, device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for matmul failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + out_tensor = activation_layer->getOutput(0); + } + + out_tensor->setName((op_name_ + "_output").c_str()); + MS_LOG(DEBUG) << "output " << GetTensorFormat(out_tensor, out_format_, true); + this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format_}); + return RET_OK; +} + +int MatMulTensorRT::PreprocessMatMulInputs(TensorRTContext *ctx, ITensorHelper *matmul_a, ITensorHelper *matmul_b) { + if (tensorrt_in_tensors_.size() == INPUT_SIZE2) { + int a_index = + GetDimsVolume(tensorrt_in_tensors_[0].trt_tensor_->getDimensions()) == GetDimsVolume(in_tensors_[0].Shape()) ? 
0 + : 1; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[a_index], matmul_a); + ret += PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - a_index], matmul_b); + if (ret != RET_OK || matmul_a->trt_tensor_ == nullptr || matmul_b->trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim of matmul inputs failed for " << op_name_; + return ret; + } + out_format_ = matmul_a->format_; + if (matmul_a->format_ != matmul_b->format_) { + MS_LOG(WARNING) << "matmul input tensor has different format " << op_name_; + out_format_ = Format::NHWC; + } + } else if (tensorrt_in_tensors_.size() == 1) { + auto weight = ProcessWeightTensor(ctx); + if (weight == nullptr) { + MS_LOG(ERROR) << "create constant weight tensor failed for " << op_name_; + return RET_ERROR; + } + int weight_index = in_tensors_[1].Data() != nullptr ? 1 : 0; + ITensorHelper *weight_helper = (weight_index == 1) ? matmul_b : matmul_a; + ITensorHelper *var_helper = (weight_index == 1) ? matmul_a : matmul_b; + weight_helper->trt_tensor_ = weight; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - weight_index], var_helper); + if (ret != RET_OK || var_helper->trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim of matmul input var_helper failed for " << op_name_; + return ret; + } + out_format_ = var_helper->format_; + } else { + MS_LOG(ERROR) << op_name_ << " tensorrt in tensor size is invalid " << tensorrt_in_tensors_.size(); + return RET_ERROR; + } + return RET_OK; +} + +nvinfer1::ITensor *MatMulTensorRT::ProcessWeightTensor(TensorRTContext *ctx) { + nvinfer1::ITensor *weight = nullptr; + int weight_index = in_tensors_[1].Data() != nullptr ? 1 : 0; + if (in_tensors_[weight_index].Shape().size() < + static_cast(tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims)) { + std::vector expect_shape(in_tensors_[1 - weight_index].Shape().size(), 1); + auto origin_shape = in_tensors_[weight_index].Shape(); + for (int i = 0; i < origin_shape.size(); i++) { + expect_shape[expect_shape.size() - 1 - i] = origin_shape[origin_shape.size() - 1 - i]; + } + weight = ConvertTensorWithExpandDims(ctx, in_tensors_[weight_index], expect_shape, op_name_); + } else if (in_tensors_[weight_index].Shape().size() == + static_cast(tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims)) { + weight = ConvertConstantTensor(ctx, in_tensors_[weight_index], op_name_); + } else { + MS_LOG(ERROR) << "input tensor shape is invalid for " << op_name_; + return nullptr; + } + return weight; +} + +nvinfer1::ITensor *MatMulTensorRT::AddAsMatmul(TensorRTContext *ctx) { + ITensorHelper matmul_a; + ITensorHelper matmul_b; + + int ret = PreprocessMatMulInputs(ctx, &matmul_a, &matmul_b); + if (ret != RET_OK || matmul_a.trt_tensor_ == nullptr || matmul_b.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessMatMulInputs matmul failed for " << op_name_; + return nullptr; + } + + MS_LOG(DEBUG) << "matmul input a " << GetTensorFormat(matmul_a); + MS_LOG(DEBUG) << "matmul input b " << GetTensorFormat(matmul_b); + + auto matmul_layer = ctx->network()->addMatrixMultiply( + *matmul_a.trt_tensor_, transpose_a_ ? nvinfer1::MatrixOperation::kTRANSPOSE : nvinfer1::MatrixOperation::kNONE, + *matmul_b.trt_tensor_, transpose_b_ ? 
nvinfer1::MatrixOperation::kTRANSPOSE : nvinfer1::MatrixOperation::kNONE); + if (matmul_layer == nullptr) { + MS_LOG(ERROR) << "addMatrixMultiply failed for " << op_name_; + return nullptr; + } + this->layer_ = matmul_layer; + matmul_layer->setName(op_name_.c_str()); + return AddBias(ctx, matmul_layer->getOutput(0)); +} + +nvinfer1::ITensor *MatMulTensorRT::AddAsFullConnect(TensorRTContext *ctx) { + nvinfer1::Weights weight; + nvinfer1::Weights bias = ConvertWeight(in_tensors_[kBiasIndex]); + nvinfer1::ITensor *input_a = tensorrt_in_tensors_[0].trt_tensor_; + out_format_ = tensorrt_in_tensors_[0].format_; + if (input_a->getDimensions().nbDims != DIMENSION_4D) { + nvinfer1::Dims in_dims(input_a->getDimensions()); + in_dims.nbDims = DIMENSION_4D; + for (int i = input_a->getDimensions().nbDims; i < DIMENSION_4D; i++) { + in_dims.d[i] = 1; + } + input_a = Reshape(ctx, input_a, in_dims); + if (input_a == nullptr) { + MS_LOG(ERROR) << "reshape input failed for " << op_name_; + return nullptr; + } + MS_LOG(DEBUG) << "full connect expand input a to " << GetTensorFormat(input_a); + } else { + ITensorHelper tmp_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &tmp_input); + if (ret != RET_OK || tmp_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "rPreprocessInputs2SameDim failed for " << op_name_; + return nullptr; + } + input_a = tmp_input.trt_tensor_; + out_format_ = tmp_input.format_; + MS_LOG(DEBUG) << "full connect preprocess input a to " << GetTensorFormat(tmp_input); + } + if (!transpose_b_) { + // transpose weight + weight = TransposeWeight2D(in_tensors_[1], &weight_ptr_); + if (weight.values == nullptr || weight_ptr_ == nullptr) { + MS_LOG(ERROR) << "TransposeWeight2D input weight failed for " << op_name_; + return nullptr; + } + } else { + weight = ConvertWeight(in_tensors_[1]); + } + + int output_cnt = in_tensors_[kBiasIndex].Shape()[0]; + + auto fc_layer = ctx->network()->addFullyConnected(*input_a, output_cnt, weight, bias); + if (fc_layer == nullptr) { + MS_LOG(ERROR) << "add fully connected layer failed for " << op_name_; + return nullptr; + } + this->layer_ = fc_layer; + fc_layer->setName((op_name_ + "_fullyconnected").c_str()); + nvinfer1::ITensor *out_tensor = fc_layer->getOutput(0); + if (out_tensor->getDimensions().nbDims != out_tensors_[0].Shape().size()) { + std::vector out_dims(out_tensors_[0].Shape()); + out_dims[0] = out_tensor->getDimensions().d[0]; + out_tensor = Reshape(ctx, out_tensor, out_dims); + } + return out_tensor; +} +nvinfer1::ITensor *MatMulTensorRT::AddAsOptPlugin(TensorRTContext *ctx) { + nvinfer1::ITensor *weight_tensor = nullptr; + if (tensorrt_in_tensors_.size() >= INPUT_SIZE2) { + weight_tensor = tensorrt_in_tensors_[1].trt_tensor_; + } else { + weight_tensor = ConvertConstantTensor(ctx, in_tensors_[1], op_name_); + } + + auto plugin = std::make_shared(op_name_, transpose_a_, transpose_b_, device_id_); + if (plugin == nullptr) { + MS_LOG(ERROR) << "create MatmulOptPlugin failed for " << op_name_; + return nullptr; + } + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, weight_tensor}; + nvinfer1::IPluginV2Layer *matmul_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin); + if (matmul_layer == nullptr) { + MS_LOG(ERROR) << "add matmul opt plugin layer failed for " << op_name_; + return nullptr; + } + layer_ = matmul_layer; + return AddBias(ctx, matmul_layer->getOutput(0)); +} +nvinfer1::ITensor *MatMulTensorRT::AddBias(TensorRTContext *ctx, nvinfer1::ITensor *input_tensor) { + 
nvinfer1::ITensor *out_tensor = input_tensor; + if (in_tensors_.size() == kBiasIndex + 1) { + nvinfer1::ITensor *bias = nullptr; + if (in_tensors_[kBiasIndex].Shape().size() < static_cast(out_tensor->getDimensions().nbDims)) { + std::vector expect_dims(out_tensors_[0].Shape()); + expect_dims[0] = out_tensor->getDimensions().d[0]; + bias = ConvertTensorWithExpandDims(ctx, in_tensors_[kBiasIndex], expect_dims, op_name_); + } else if (in_tensors_[kBiasIndex].Shape().size() == static_cast(out_tensor->getDimensions().nbDims)) { + bias = ConvertConstantTensor(ctx, in_tensors_[kBiasIndex], op_name_); + } else { + MS_LOG(ERROR) << "input tensor shape is invalid for " << op_name_; + return nullptr; + } + if (bias == nullptr) { + MS_LOG(ERROR) << "create constant bias tensor failed for " << op_name_; + return nullptr; + } + auto bias_layer = ctx->network()->addElementWise(*out_tensor, *bias, nvinfer1::ElementWiseOperation::kSUM); + if (bias_layer == nullptr) { + MS_LOG(ERROR) << "add bias add layer failed for " << op_name_; + return nullptr; + } + auto bias_layer_name = op_name_ + "_bias"; + bias_layer->setName(bias_layer_name.c_str()); + out_tensor = bias_layer->getOutput(0); + } + return out_tensor; +} + +bool MatMulTensorRT::RunOptPlugin() { + if (quant_type_ == schema::QuantType_QUANT_NONE && + runtime_->GetRuntimePrecisionMode() == RuntimePrecisionMode::RuntimePrecisionMode_FP32) { + if (in_tensors_[0].Shape().size() == DIMENSION_2D && in_tensors_[1].Shape().size() == DIMENSION_2D && + in_tensors_[0].Shape()[0] > 1 && tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0] == -1) { + MS_LOG(INFO) << op_name_ << " uses optimize matmul plugin for 2D dynamic batchsize"; + return true; + } else if (in_tensors_[0].Shape().size() == DIMENSION_3D && in_tensors_[1].Shape().size() == DIMENSION_3D) { + // batched matmul using opt + MS_LOG(INFO) << op_name_ << " uses optimize matmul plugin for 3D batchsized"; + return true; + } + } + return false; +} +bool MatMulTensorRT::RunFullConnect() { + if (in_tensors_.size() == INPUT_SIZE3 && in_tensors_[1].Data() != nullptr && + in_tensors_[kBiasIndex].Data() != nullptr && !transpose_a_ && in_tensors_[1].Shape().size() == DIMENSION_2D && + (in_tensors_[0].Shape().size() == DIMENSION_2D || in_tensors_[0].Shape().size() == DIMENSION_4D)) { + return true; + } + return false; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MatMulFusion, MatMulTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.h new file mode 100644 index 00000000000..db3175c8cc4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.h @@ -0,0 +1,62 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
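The fully-connected path above transposes a row-major 2-D weight on the host when transpose_b_ is false (TransposeWeight2D, which lives in tensorrt_utils and is not part of this hunk). A minimal sketch of that kind of transpose, with a hypothetical helper name:

#include <cstdlib>

// Row-major [rows, cols] -> [cols, rows]; the caller frees the returned buffer,
// mirroring how weight_ptr_ is freed in ~MatMulTensorRT().
float *Transpose2DSketch(const float *src, int rows, int cols) {
  float *dst = static_cast<float *>(malloc(sizeof(float) * rows * cols));
  if (dst == nullptr) {
    return nullptr;
  }
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      dst[c * rows + r] = src[r * cols + c];
    }
  }
  return dst;
}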
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class MatMulTensorRT : public TensorRTOp { + public: + MatMulTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~MatMulTensorRT() override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + int AddInnerOp(TensorRTContext *ctx) override; + + private: + int PreprocessMatMulInputs(TensorRTContext *ctx, ITensorHelper *matmul_a, ITensorHelper *matmul_b); + + nvinfer1::ITensor *ProcessWeightTensor(TensorRTContext *ctx); + + nvinfer1::ITensor *AddAsMatmul(TensorRTContext *ctx); + + nvinfer1::ITensor *AddAsFullConnect(TensorRTContext *ctx); + + nvinfer1::ITensor *AddAsOptPlugin(TensorRTContext *ctx); + + nvinfer1::ITensor *AddBias(TensorRTContext *ctx, nvinfer1::ITensor *input_tensor); + + bool RunOptPlugin(); + bool RunFullConnect(); + + bool transpose_a_{false}; + bool transpose_b_{false}; + Format out_format_{Format::NHWC}; + schema::ActivationType activation_{schema::ActivationType::ActivationType_NO_ACTIVATION}; + void *weight_ptr_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.cc new file mode 100644 index 00000000000..4100a39bf1f --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.cc @@ -0,0 +1,59 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
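The plugins in this patch persist their attributes through SerializeValue/DeserializeValue, which are declared alongside TensorRTPlugin and not shown here. Assuming they follow the usual pointer-bumping pattern suggested by the call sites (serialize(void *buffer) and the (data, length) deserializing constructors), a self-contained sketch of that pattern looks like this; the real helpers may differ:

#include <cstddef>
#include <cstring>

// Append a value to the buffer and advance the write cursor.
static void SerializeValueSketch(void **buffer, const void *value, size_t size) {
  std::memcpy(*buffer, value, size);
  *buffer = static_cast<char *>(*buffer) + size;
}

// Read a value back, advance the read cursor and shrink the remaining length.
static void DeserializeValueSketch(const void **data, size_t *remaining, void *value, size_t size) {
  if (*remaining < size) {
    return;  // truncated stream; real code should report an error
  }
  std::memcpy(value, *data, size);
  *data = static_cast<const char *>(*data) + size;
  *remaining -= size;
}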
+ */ + +#include "src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h" +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "NvInferRuntimeCommon.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(NormalizeOptPluginCreater); +template class TensorRTPluginCreater<NormalizeOptPlugin>; +template <class T> +nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{}; +template <class T> +std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_; + +int NormalizeOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) noexcept { + auto input = static_cast<const float *>(inputs[0]); + auto gamma = static_cast<const float *>(inputs[1]); + auto beta = static_cast<const float *>(inputs[2]); + auto output = static_cast<float *>(outputs[0]); + auto input_dims = inputDesc[0].dims; + size_t dim_at_axis = input_dims.d[axis_]; + int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies<int>()); + Normalize(input, gamma, beta, output, dim_at_axis, epsilion_, element_cnt, stream); + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *NormalizeOptPlugin::clone() const noexcept { + auto *plugin = new NormalizeOptPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +size_t NormalizeOptPlugin::getSerializationSize() const noexcept { return sizeof(size_t) + sizeof(float); } + +void NormalizeOptPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &axis_, sizeof(size_t)); + SerializeValue(&buffer, &epsilion_, sizeof(float)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h new file mode 100644 index 00000000000..981628e6da5 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h @@ -0,0 +1,61 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_OPT_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_OPT_PLUGIN_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" + +namespace mindspore::lite { +constexpr char *NORMALIZE_OPT_PLUGIN_NAME{"NormalizeOptPlugin"}; +class NormalizeOptPlugin : public TensorRTPlugin { + public: + NormalizeOptPlugin(const std::string name, size_t axis, float epsilion, uint32_t device_id) + : TensorRTPlugin(name, std::string(NORMALIZE_OPT_PLUGIN_NAME), device_id), axis_(axis), epsilion_(epsilion) {} + + NormalizeOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(NORMALIZE_OPT_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + axis_ = static_cast(fields[0].data)[0]; + epsilion_ = static_cast(fields[1].data)[0]; + } + + NormalizeOptPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(NORMALIZE_OPT_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &axis_, sizeof(size_t)); + DeserializeValue(&serialData, &serialLength, &epsilion_, sizeof(float)); + } + + NormalizeOptPlugin() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + size_t axis_{0}; + float epsilion_{0.0f}; +}; +class NormalizeOptPluginCreater : public TensorRTPluginCreater { + public: + NormalizeOptPluginCreater() : TensorRTPluginCreater(std::string(NORMALIZE_OPT_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.cc new file mode 100644 index 00000000000..ec5a5ab4007 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.cc @@ -0,0 +1,178 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
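The Normalize kernel launched by NormalizeOptPlugin::enqueue above lives in cuda_impl/normalize.cuh and is not part of this hunk. As a reference for what it is expected to compute, here is a plain C++ layer norm over the last axis, convenient for checking plugin outputs on small tensors:

#include <cmath>
#include <cstddef>

// y[i] = (x[i] - mean) / sqrt(var + eps) * gamma[i] + beta[i], applied per row of an
// [outer, dim] view where dim is the size of the normalized (last) axis.
void LayerNormReference(const float *x, const float *gamma, const float *beta, float *y,
                        size_t outer, size_t dim, float eps) {
  for (size_t o = 0; o < outer; ++o) {
    const float *row = x + o * dim;
    float *out = y + o * dim;
    float mean = 0.0f;
    for (size_t i = 0; i < dim; ++i) mean += row[i];
    mean /= dim;
    float var = 0.0f;
    for (size_t i = 0; i < dim; ++i) var += (row[i] - mean) * (row[i] - mean);
    var /= dim;
    float inv_std = 1.0f / std::sqrt(var + eps);
    for (size_t i = 0; i < dim; ++i) {
      out[i] = (row[i] - mean) * inv_std * gamma[i] + beta[i];
    }
  }
}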
+ */ + +#include "src/runtime/delegate/tensorrt/op/normalize_tensorrt.h" +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h" + +namespace mindspore::lite { +int NormalizeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors, + const std::vector<mindspore::MSTensor> &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE3 && in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != INPUT_SIZE3 && out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + auto norm_op = primitive->value_as_LayerNormFusion(); + CHECK_NULL_RETURN(norm_op); + int begin_norm_axis = norm_op->begin_norm_axis(); + begin_norm_axis = begin_norm_axis >= 0 ? begin_norm_axis : in_tensors[0].Shape().size() + begin_norm_axis; + int begin_params_axis = norm_op->begin_params_axis(); + begin_params_axis = begin_params_axis >= 0 ? begin_params_axis : in_tensors[0].Shape().size() + begin_params_axis; + if (begin_params_axis != begin_norm_axis || begin_params_axis != in_tensors[0].Shape().size() - 1) { + MS_LOG(ERROR) << "only supports normalization on the last dimension, begin_norm_axis is " << begin_norm_axis << " for " + << op_name_; + return RET_ERROR; + } + axis_ = begin_params_axis; + epsilon_ = norm_op->epsilon(); + return RET_OK; +} + +int NormalizeTensorRT::AddInnerOp(TensorRTContext *ctx) { + CHECK_NULL_RETURN(ctx->network()); + int ret = PreprocessInputs(ctx); + if (ret != RET_OK) { + MS_LOG(ERROR) << "preprocess input failed for " << op_name_; + return ret; + } + return RunOptPlugin() ?
RunAsOptPlugin(ctx) : RunAsTrtOps(ctx); +} + +int NormalizeTensorRT::PreprocessInputs(TensorRTContext *ctx) { + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &norm_input_); + if (ret != RET_OK || norm_input_.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim norm_input failed for " << op_name_; + return RET_ERROR; + } + if (in_tensors_.size() == BETA_INDEX + 1) { + gamma_ = ConvertTensorWithExpandDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_ + in_tensors_[1].Name()); + CHECK_NULL_RETURN(gamma_); + beta_ = ConvertTensorWithExpandDims(ctx, in_tensors_[BETA_INDEX], in_tensors_[0].Shape(), + op_name_ + in_tensors_[BETA_INDEX].Name()); + CHECK_NULL_RETURN(beta_); + } + return RET_OK; +} + +int NormalizeTensorRT::RunAsOptPlugin(TensorRTContext *ctx) { + auto plugin = std::make_shared(op_name_, axis_, epsilon_, device_id_); + if (plugin == nullptr) { + MS_LOG(ERROR) << "create NormalizeOptPlugin failed for " << op_name_; + return RET_ERROR; + } + nvinfer1::ITensor *inputTensors[] = {norm_input_.trt_tensor_, gamma_, beta_}; + nvinfer1::IPluginV2Layer *norm_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE3, *plugin); + if (norm_layer == nullptr) { + MS_LOG(ERROR) << "add norm opt plugin layer failed for " << op_name_; + return RET_ERROR; + } + layer_ = norm_layer; + layer_->setName(op_name_.c_str()); + AddInnerOutTensors(ITensorHelper{norm_layer->getOutput(0), norm_input_.format_, norm_input_.same_format_}); + return RET_OK; +} + +int NormalizeTensorRT::RunAsTrtOps(TensorRTContext *ctx) { + size_t axis = 1u << axis_; + // first output, add later + AddInnerOutTensors(ITensorHelper{nullptr, norm_input_.format_, norm_input_.same_format_}); + + // mean + auto mean = + ctx->network()->addReduce(*(norm_input_.trt_tensor_), nvinfer1::ReduceOperation::kAVG, axis, true)->getOutput(0); + CHECK_NULL_RETURN(mean); + if (out_tensors_.size() == INPUT_SIZE3) { + AddInnerOutTensors(ITensorHelper{mean, norm_input_.format_, norm_input_.same_format_}); + } + // x - mean + auto sub_mean = ctx->network() + ->addElementWise(*(norm_input_.trt_tensor_), *mean, nvinfer1::ElementWiseOperation::kSUB) + ->getOutput(0); + CHECK_NULL_RETURN(sub_mean); + // (x - mean)^2 + auto const_two = + ConvertScalarToITensor(ctx, in_tensors_[0].Shape().size(), &two_, DataType::kNumberTypeFloat32, op_name_ + "_two"); + CHECK_NULL_RETURN(const_two); + auto pow = ctx->network()->addElementWise(*sub_mean, *const_two, nvinfer1::ElementWiseOperation::kPOW)->getOutput(0); + CHECK_NULL_RETURN(pow); + // mean of (x - mean)^2 + auto var = ctx->network()->addReduce(*pow, nvinfer1::ReduceOperation::kAVG, axis, true)->getOutput(0); + CHECK_NULL_RETURN(var); + if (out_tensors_.size() == INPUT_SIZE3) { + AddInnerOutTensors(ITensorHelper{var, norm_input_.format_, norm_input_.same_format_}); + } + + // var + min epsilon + auto const_epsilon = ConvertScalarToITensor(ctx, in_tensors_[0].Shape().size(), &epsilon_, + DataType::kNumberTypeFloat32, op_name_ + "_epsilion"); + CHECK_NULL_RETURN(const_epsilon); + auto var_epsilon = + ctx->network()->addElementWise(*var, *const_epsilon, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + CHECK_NULL_RETURN(var_epsilon); + + // standard deviation + auto std_dev = ctx->network()->addUnary(*var_epsilon, nvinfer1::UnaryOperation::kSQRT)->getOutput(0); + CHECK_NULL_RETURN(std_dev); + + // sub_mean / std_dev + auto norm_layer = ctx->network()->addElementWise(*sub_mean, *std_dev, nvinfer1::ElementWiseOperation::kDIV); + CHECK_NULL_RETURN(norm_layer); + 
this->layer_ = norm_layer; + auto norm = norm_layer->getOutput(0); + CHECK_NULL_RETURN(norm); + + // scale with gamma and beta + if (gamma_ != nullptr && beta_ != nullptr) { + auto gamma_out = + ctx->network()->addElementWise(*norm, *gamma_, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0); + CHECK_NULL_RETURN(gamma_out); + auto beta_out = + ctx->network()->addElementWise(*gamma_out, *beta_, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + CHECK_NULL_RETURN(beta_out); + tensorrt_out_tensors_[0].trt_tensor_ = beta_out; + } else { + tensorrt_out_tensors_[0].trt_tensor_ = norm; + } + return RET_OK; +} + +bool NormalizeTensorRT::RunOptPlugin() { + if (out_tensors_.size() == 1 && in_tensors_.size() == INPUT_SIZE3 && axis_ == in_tensors_[0].Shape().size() - 1 && + in_tensors_[0].Shape()[axis_] < GET_THREADS) { + // insufficient shared memory + int dim_sum = std::accumulate(in_tensors_[0].Shape().begin(), in_tensors_[0].Shape().begin() + axis_, 1, + std::multiplies()); + const int kSharedMemoryThreshold = 2048; + if (dim_sum > kSharedMemoryThreshold) { + return false; + } + MS_LOG(INFO) << op_name_ << " use opt plugin"; + return true; + } + return false; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LayerNormFusion, NormalizeTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.h new file mode 100644 index 00000000000..5b7e67882fd --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.h @@ -0,0 +1,56 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
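Both the `1u << axis_` in the layer-norm decomposition above and the reduce op later in this patch rely on TensorRT's convention that IReduceLayer takes its axes as a bitmask, one bit per dimension. A small illustrative sketch of building such a mask (GetAxis in reduce_tensorrt.cc presumably does something similar; this helper is an assumption, not the patch's code):

#include <cstdint>
#include <vector>

// TensorRT reduce axes are a bitmask: bit i set means "reduce dimension i".
uint32_t MakeReduceAxes(const std::vector<int> &axes, int nb_dims) {
  uint32_t mask = 0;
  for (int a : axes) {
    int axis = a < 0 ? a + nb_dims : a;  // accept negative axes, numpy style
    mask |= 1u << axis;
  }
  return mask;
}
// e.g. reducing H and W of an NCHW tensor: MakeReduceAxes({2, 3}, 4) == 0b1100.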
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +constexpr int BETA_INDEX = 2; + +class NormalizeTensorRT : public TensorRTOp { + public: + NormalizeTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~NormalizeTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int PreprocessInputs(TensorRTContext *ctx); + + int RunAsOptPlugin(TensorRTContext *ctx); + + int RunAsTrtOps(TensorRTContext *ctx); + + bool RunOptPlugin(); + + ITensorHelper norm_input_; + nvinfer1::ITensor *gamma_{nullptr}; + nvinfer1::ITensor *beta_{nullptr}; + size_t axis_{0}; + const float two_{2.0f}; + float epsilon_{0.0f}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.cc new file mode 100644 index 00000000000..534f35b0875 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.cc @@ -0,0 +1,140 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/pad_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int PadTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors_[1].Data() == nullptr) { + MS_LOG(ERROR) << "invalid pad tensor for: " << op_name_; + return RET_ERROR; + } + auto pad_primitive = this->GetPrimitive()->value_as_PadFusion(); + if (pad_primitive == nullptr) { + MS_LOG(ERROR) << "convert PadFusion failed: " << op_name_; + return RET_ERROR; + } + schema::PaddingMode padding_mode = pad_primitive->padding_mode(); + if (padding_mode != schema::PaddingMode::PaddingMode_CONSTANT) { + MS_LOG(ERROR) << "Unsupported padding mode: " << schema::PaddingMode(padding_mode) << ", for op: " << op_name_; + return RET_ERROR; + } + if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) { + MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format(); + return RET_ERROR; + } + constant_value_ = pad_primitive->constant_value(); + return RET_OK; +} + +int PadTensorRT::AddInnerOp(TensorRTContext *ctx) { + mindspore::MSTensor &pad_tensor = in_tensors_[1]; + int element_cnt = std::accumulate(pad_tensor.Shape().begin(), pad_tensor.Shape().end(), 1, std::multiplies()); + if (element_cnt != tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims * INPUT_SIZE2) { + MS_LOG(ERROR) << "pad tensor cnt is invalid. 
cnt: " << element_cnt + << ", input tensor dims cnt: " << tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims; + return RET_ERROR; + } + + nvinfer1::ITensor *pad_input = tensorrt_in_tensors_[0].trt_tensor_; + MS_LOG(DEBUG) << "before transpose " + << GetTensorFormat(pad_input, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_); + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // transpose: NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NHWC->NCHW failed"; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + this->transpose_layer_ = transpose_layer_in; + pad_input = transpose_layer_in->getOutput(0); + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(pad_input, Format::NCHW, false); + } + + // trt 6 only support 2D padding + const int *padding_data = reinterpret_cast(in_tensors_[1].Data().get()); + MS_ASSERT(padding_data); + nvinfer1::IPaddingLayer *padding_layer = nullptr; + if (element_cnt == index_NHWC_ * INPUT_SIZE2) { + // only support pad at HW index + int h_pre; + int h_post; + int w_pre; + int w_post; + if (SameDims(pad_input->getDimensions(), in_tensors_[0].Shape())) { + // NCHW: 0: N_pre, 1: N_post, 2: C_pre, 3: C_post, 4: H_pre, 5: H_post, 6: W_pre, 7: W_post + if (*padding_data != 0 || *(padding_data + 1) != 0 || *(padding_data + 2) != 0 || *(padding_data + 3) != 0) { + MS_LOG(WARNING) << "tensorrt padding only support pad at HW index, unsupported padding value of: " << op_name_; + } + h_pre = 4; + h_post = 5; + w_pre = 6; + w_post = 7; + } else { + // NHWC: 0: N_pre, 1: N_post, 2: H_pre, 3: H_post, 4: W_pre, 5: W_post, 6: C_pre, 7: C_post + if (*padding_data != 0 || *(padding_data + 1) != 0 || *(padding_data + 6) != 0 || *(padding_data + 7) != 0) { + MS_LOG(WARNING) << "tensorrt padding only support pad at HW index, unsupported padding value of: " << op_name_; + } + h_pre = 2; + h_post = 3; + w_pre = 4; + w_post = 5; + } + nvinfer1::DimsHW prePadding{*(padding_data + h_pre), *(padding_data + w_pre)}; + nvinfer1::DimsHW postPadding{*(padding_data + h_post), *(padding_data + w_post)}; + MS_LOG(DEBUG) << op_name_ << " prePadding: " << prePadding.d[0] << ", " << prePadding.d[1] + << "; postPadding: " << postPadding.d[0] << ", " << postPadding.d[1]; + + padding_layer = ctx->network()->addPadding(*pad_input, prePadding, postPadding); + } else { + MS_LOG(ERROR) << "need check for pad_tensor dims: " << op_name_ + << ", pad_tensor ElementNum: " << pad_tensor.ElementNum(); + return RET_ERROR; + } + if (padding_layer == nullptr) { + MS_LOG(ERROR) << "add padding layer failed for " << op_name_; + return RET_ERROR; + } + this->layer_ = padding_layer; + padding_layer->setName(op_name_.c_str()); + padding_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + bool same_format = SameDims(padding_layer->getOutput(0)->getDimensions(), out_tensors_[0].Shape()) && + SameDims(tensorrt_in_tensors_[0].trt_tensor_->getDimensions(), in_tensors_[0].Shape()); + this->AddInnerOutTensors(ITensorHelper{padding_layer->getOutput(0), Format::NCHW, same_format}); + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PadFusion, PadTensorRT) +} // namespace mindspore::lite diff --git 
a/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.h new file mode 100644 index 00000000000..def44c32bc8 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.h @@ -0,0 +1,42 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class PadTensorRT : public TensorRTOp { + public: + PadTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~PadTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + const int index_NHWC_ = 4; + float constant_value_ = 0.0f; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.cc new file mode 100644 index 00000000000..7d83d9c54c0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.cc @@ -0,0 +1,220 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
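The PadTensorRT code above picks the H/W entries out of an 8-element paddings tensor, using different offsets depending on whether the paddings were written in NCHW or NHWC order. The mapping into IPaddingLayer's pre/post HW pads can be summarized by this sketch (hypothetical helper, same index layout as the code; N and C entries are expected to be zero and only trigger a warning above):

#include <array>

struct HwPadding {
  int h_pre, w_pre, h_post, w_post;
};

// paddings holds {pre, post} pairs per dimension of a 4-D tensor.
// NCHW order: N_pre, N_post, C_pre, C_post, H_pre, H_post, W_pre, W_post
// NHWC order: N_pre, N_post, H_pre, H_post, W_pre, W_post, C_pre, C_post
HwPadding ExtractHwPadding(const std::array<int, 8> &paddings, bool is_nchw) {
  if (is_nchw) {
    return {paddings[4], paddings[6], paddings[5], paddings[7]};
  }
  return {paddings[2], paddings[4], paddings[3], paddings[5]};
}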
+ */ + +#include "src/runtime/delegate/tensorrt/op/pool_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int PoolTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) { + MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format(); + return RET_ERROR; + } + return RET_OK; +} + +int PoolTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (tensorrt_in_tensors_.size() != 1) { + MS_LOG(ERROR) << "invalid input tensor size: " << tensorrt_in_tensors_.size(); + return RET_ERROR; + } + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(tensorrt_in_tensors_[0]); + int ret = ParseParams(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ParseParams failed for : " << op_name_; + return RET_ERROR; + } + + nvinfer1::ITensor *pool_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // transpose: NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NHWC->NCHW failed"; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + this->transpose_layer_ = transpose_layer_in; + pool_input = transpose_layer_in->getOutput(0); + } + + // pooling layer + nvinfer1::Dims windowSize = lite::ConvertCudaDims(kernel_size_); + if (windowSize.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + nvinfer1::IPoolingLayer *pooling_layer = ctx->network()->addPoolingNd(*pool_input, pooling_type_, windowSize); + if (pooling_layer == nullptr) { + MS_LOG(ERROR) << "addPoolingNd failed for TensorRT."; + return RET_ERROR; + } + AddParams(pooling_layer); + pooling_layer->setName(op_name_.c_str()); + this->layer_ = pooling_layer; + + // add activation + nvinfer1::ILayer *activation_layer = nullptr; + if (activation_type_ == schema::ActivationType::ActivationType_NO_ACTIVATION) { + activation_layer = pooling_layer; + } else { + activation_layer = + ActivationTensorRT::AddActivation(ctx, activation_type_, 0, 0, 0, pooling_layer->getOutput(0), device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for pool failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + } + nvinfer1::ITensor *out_trt_tensor = activation_layer->getOutput(0); + out_trt_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{out_trt_tensor, Format::NCHW, false}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +int PoolTensorRT::ParseParams() { + int in_h = in_tensors_[0].Shape()[kNHWC_H]; + int in_w = in_tensors_[0].Shape()[kNHWC_W]; + int out_h = 
out_tensors_[0].Shape()[kNHWC_H]; + int out_w = out_tensors_[0].Shape()[kNHWC_W]; + int kernel_h; + int kernel_w; + switch (type_) { + case (schema::PrimitiveType_AvgPoolFusion): { + const schema::AvgPoolFusion *pool_primitive = this->GetPrimitive()->value_as_AvgPoolFusion(); + if (pool_primitive == nullptr) { + MS_LOG(ERROR) << "convert PoolFusion failed: " << op_name_; + return RET_ERROR; + } + pooling_type_ = nvinfer1::PoolingType::kAVERAGE; + + auto stride = pool_primitive->strides(); + if (stride == nullptr) { + MS_LOG(ERROR) << "get stride failed: " << op_name_; + return RET_ERROR; + } + stride_ = std::vector(stride->begin(), stride->end()); + kernel_h = in_h - (out_h - 1) * stride_[0]; + kernel_w = in_w - (out_w - 1) * stride_[1]; + auto kernel_size = pool_primitive->kernel_size(); + if (kernel_size == nullptr) { + kernel_size_.push_back(kernel_h); + kernel_size_.push_back(kernel_w); + MS_LOG(WARNING) << op_name_ << "don't has kernel size, calculate kernel size on ms tensor, kernel_h is " + << kernel_h << ", kernel_w is " << kernel_w; + } else { + kernel_size_ = std::vector(kernel_size->begin(), kernel_size->end()); + } + auto padding = pool_primitive->pad(); + if (padding != nullptr && padding->size() != DIMENSION_4D) { + MS_LOG(ERROR) << op_name_ << "has invalid pad dims: " << padding->size(); + return RET_ERROR; + } else if (padding == nullptr || padding->size() == 0) { + padding_ = std::vector(DIMENSION_4D, 0); + } else { + padding_ = std::vector(padding->begin(), padding->end()); + } + + pad_mode_ = pool_primitive->pad_mode(); + activation_type_ = pool_primitive->activation_type(); + break; + } + case (schema::PrimitiveType_MaxPoolFusion): { + const schema::MaxPoolFusion *pool_primitive = this->GetPrimitive()->value_as_MaxPoolFusion(); + if (pool_primitive == nullptr) { + MS_LOG(ERROR) << "convert PoolFusion failed: " << op_name_; + return RET_ERROR; + } + pooling_type_ = nvinfer1::PoolingType::kMAX; + + auto kernel_size = pool_primitive->kernel_size(); + if (kernel_size == nullptr) { + MS_LOG(ERROR) << "get kernel size failed: " << op_name_; + return RET_ERROR; + } + kernel_size_ = std::vector(kernel_size->begin(), kernel_size->end()); + + auto stride = pool_primitive->strides(); + if (stride == nullptr) { + MS_LOG(ERROR) << "get stride failed: " << op_name_; + return RET_ERROR; + } + stride_ = std::vector(stride->begin(), stride->end()); + kernel_h = in_h - (out_h - 1) * stride_[0]; + kernel_w = in_w - (out_w - 1) * stride_[1]; + auto padding = pool_primitive->pad(); + if (padding == nullptr) { + MS_LOG(INFO) << "get padding is null, set to default 0: " << op_name_; + padding_ = {0, 0, 0, 0}; + } else { + padding_ = std::vector(padding->begin(), padding->end()); + } + + pad_mode_ = pool_primitive->pad_mode(); + activation_type_ = pool_primitive->activation_type(); + break; + } + default: { + MS_LOG(ERROR) << "unsupported primitive type of " << type_ << " for node: " << op_name_; + return RET_ERROR; + } + } + // some model kernel size is large than hw, correct it + if (kernel_size_[0] > in_h || kernel_size_[1] > in_w) { + MS_LOG(WARNING) << op_name_ << " kernel size is larger than input size"; + kernel_size_[0] = kernel_size_[0] > kernel_h ? kernel_h : kernel_size_[0]; + kernel_size_[1] = kernel_size_[1] > kernel_w ? 
kernel_w : kernel_size_[1]; + } + return RET_OK; +} + +void PoolTensorRT::AddParams(nvinfer1::IPoolingLayer *pooling_layer) { + nvinfer1::Dims stride_dims = ConvertCudaDims(stride_); + if (stride_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + pooling_layer->setStrideNd(stride_dims); + if (pad_mode_ == schema::PadMode::PadMode_SAME) { + pooling_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } else { + nvinfer1::Dims dims{}; + dims.nbDims = DIMENSION_2D; + dims.d[0] = padding_[0]; + dims.d[1] = padding_[DIMENSION_2D]; + pooling_layer->setPaddingNd(dims); + } +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AvgPoolFusion, PoolTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MaxPoolFusion, PoolTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.h new file mode 100644 index 00000000000..de8003ca08c --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.h @@ -0,0 +1,55 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class PoolTensorRT : public TensorRTOp { + public: + PoolTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~PoolTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int ParseParams(); + + void AddParams(nvinfer1::IPoolingLayer *pooling_layer); + + std::vector kernel_size_; + + std::vector stride_; + + std::vector padding_; + + nvinfer1::PoolingType pooling_type_; + + schema::PadMode pad_mode_; + + schema::ActivationType activation_type_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.cc new file mode 100644 index 00000000000..e3968264654 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
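When the AvgPool primitive carries no kernel_size, ParseParams above derives it from the tensor shapes as kernel = in - (out - 1) * stride, the inverse of out = (in - kernel) / stride + 1 for a window with no padding and exact division. A one-function sketch with a worked example (the clamp to 1 is an assumption for robustness, not taken from the patch):

#include <algorithm>

// E.g. in_dim = 32, out_dim = 16, stride = 2  ->  kernel = 32 - 15 * 2 = 2.
int InferPoolKernel(int in_dim, int out_dim, int stride) {
  return std::max(1, in_dim - (out_dim - 1) * stride);
}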
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "src/runtime/delegate/tensorrt/op/prelu_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int PReluTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size() << " : " << op_name_; + return RET_ERROR; + } + + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size() << " : " << op_name_; + return RET_ERROR; + } + return RET_OK; +} + +int PReluTensorRT::AddInnerOp(TensorRTContext *ctx) { + ITensorHelper prelu_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &prelu_input); + if (ret != RET_OK || prelu_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_; + return ret; + } + int input_nbdims = prelu_input.trt_tensor_->getDimensions().nbDims; + int slope_nbdims = in_tensors_[1].Shape().size(); + auto slope = tensorrt_in_tensors_[1].trt_tensor_; + if (input_nbdims != slope_nbdims) { + slope = ConvertTensorWithExpandDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_ + "_slope"); + tensorrt_in_tensors_[1].trt_tensor_ = slope; + } + if (slope == nullptr) { + MS_LOG(ERROR) << "add const input tensor failed for " << op_name_; + return RET_ERROR; + } + ITensorHelper slope_helper; + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1], &slope_helper); + if (ret != RET_OK || slope_helper.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim slope tensor failed for " << op_name_; + return ret; + } + + auto *prelu_layer = ctx->network()->addParametricReLU(*prelu_input.trt_tensor_, *slope_helper.trt_tensor_); + if (prelu_layer == nullptr) { + MS_LOG(ERROR) << "addParameticReLU failed for TensorRT : " << op_name_; + return RET_ERROR; + } + + nvinfer1::ITensor *out_tensor = prelu_layer->getOutput(0); + out_tensor->setName((op_name_ + "_0").c_str()); + this->AddInnerOutTensors(ITensorHelper{out_tensor, prelu_input.format_, prelu_input.same_format_}); + this->layer_ = prelu_layer; + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PReLUFusion, PReluTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.h new file mode 100644 index 00000000000..3d6505b8afd --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.h @@ -0,0 +1,39 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
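addParametricReLU requires the slope tensor to have the same rank as the input, which is why the PReLU op above expands a low-rank slope with ConvertTensorWithExpandDims (defined in tensorrt_utils, not in this hunk). The usual right-aligned, broadcast-style rank expansion, also used to build the expected shape in ProcessWeightTensor earlier, looks like the sketch below; the helper name and exact shape policy are assumptions:

#include <cstdint>
#include <vector>

// Right-align a low-rank shape against a target rank by padding with leading 1s,
// e.g. a per-channel slope {C} against a 4-D NHWC input becomes {1, 1, 1, C}.
std::vector<int64_t> ExpandDimsShape(const std::vector<int64_t> &shape, size_t target_rank) {
  std::vector<int64_t> expanded(target_rank, 1);
  for (size_t i = 0; i < shape.size(); ++i) {
    expanded[target_rank - shape.size() + i] = shape[i];
  }
  return expanded;
}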
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class PReluTensorRT : public TensorRTOp { + public: + PReluTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~PReluTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.cc new file mode 100644 index 00000000000..e8cdeb23281 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.cc @@ -0,0 +1,139 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
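// Illustrative preface (not part of the patch): IReduceLayer has no L2 mode, so the
// ReduceFusion implementation below lowers ReduceL2 as x*x -> reduce -> sqrt
// (the reduce mode is assumed to map to a sum). A scalar reference of that
// composition, with a hypothetical name:
#include <cmath>
#include <vector>

inline float ReduceL2Reference(const std::vector<float> &values) {
  float sum_of_squares = 0.0f;
  for (float v : values) {
    sum_of_squares += v * v;  // addElementWise(kPROD) of the tensor with itself
  }
  return std::sqrt(sum_of_squares);  // addUnary(kSQRT) applied to the reduced value
}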
+ */ + +#include +#include "src/runtime/delegate/tensorrt/op/reduce_tensorrt.h" + +namespace mindspore::lite { +int ReduceTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + } + return RET_OK; +} + +int ReduceTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + auto reduce_op = op_primitive_->value_as_ReduceFusion(); + if (reduce_op == nullptr) { + MS_LOG(ERROR) << "convert failed"; + return RET_ERROR; + } + bool keep_dims = reduce_op->keep_dims(); + out_format_ = tensorrt_in_tensors_[0].format_; + nvinfer1::ITensor *reduce_input = tensorrt_in_tensors_[0].trt_tensor_; + MS_LOG(DEBUG) << "origin input " << GetTensorFormat(tensorrt_in_tensors_[0]); + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + !SameDims(tensorrt_in_tensors_[0].trt_tensor_->getDimensions(), in_tensors_[0].Shape())) { + if (tensorrt_in_tensors_[0].format_ == Format::NCHW) { + // NCHW->NHWC + nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + reduce_input = transpose_layer->getOutput(0); + out_format_ = Format::NHWC; + this->transpose_layer_ = transpose_layer; + } else if (tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + reduce_input = transpose_layer->getOutput(0); + out_format_ = Format::NCHW; + this->transpose_layer_ = transpose_layer; + } else { + MS_LOG(WARNING) << "input tensor format needs check: " << op_name_; + } + } + MS_LOG(DEBUG) << "after transpose input " << GetTensorFormat(reduce_input, out_format_, true); + if (reduce_op->mode() == schema::ReduceMode::ReduceMode_ReduceL2) { + // x^2 + auto *pow2_layer = + ctx->network()->addElementWise(*reduce_input, *reduce_input, nvinfer1::ElementWiseOperation::kPROD); + CHECK_NULL_RETURN(pow2_layer); + pow2_layer->setName((op_name_ + "_pow2").c_str()); + + reduce_input = pow2_layer->getOutput(0); + CHECK_NULL_RETURN(reduce_input); + } + + uint32_t reduceAxis = GetAxis(); + auto reduce_operation_opt = TryConvertTRTReduceMode(reduce_op->mode()); + if (!reduce_operation_opt) { + MS_LOG(WARNING) << "invalid reduce for TensorRT, need check: " << static_cast(reduce_op->mode()); + return RET_ERROR; + } + nvinfer1::IReduceLayer *layer = + ctx->network()->addReduce(*reduce_input, reduce_operation_opt.value(), reduceAxis, keep_dims); + CHECK_NULL_RETURN(layer); + layer->setName(op_name_.c_str()); + this->layer_ = layer; + + nvinfer1::ITensor *out_tensor = layer->getOutput(0); + CHECK_NULL_RETURN(out_tensor); + + if (reduce_op->mode() == 
schema::ReduceMode::ReduceMode_ReduceL2) { + auto sqrt_layer = ctx->network()->addUnary(*out_tensor, nvinfer1::UnaryOperation::kSQRT); + CHECK_NULL_RETURN(sqrt_layer); + sqrt_layer->setName((op_name_ + "_sqrt").c_str()); + out_tensor = sqrt_layer->getOutput(0); + } + out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format_, true}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +uint32_t ReduceTensorRT::GetAxis() { + // axis + uint32_t reduceAxis = 0; + mindspore::MSTensor axis_tensor = this->in_tensors_[1]; + if (axis_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "invalid axis_tensor"; + return reduceAxis; + } + if (axis_tensor.DataType() != DataType::kNumberTypeInt32) { + MS_LOG(WARNING) << "not int data type"; + } + int *axis_data = reinterpret_cast(axis_tensor.MutableData()); + CHECK_NULL_RETURN(axis_data); + for (int i = 0; i < axis_tensor.ElementNum(); i++) { + int format_axis_data = (*axis_data == -1) ? in_tensors_[0].Shape().size() - 1 : *axis_data; + MS_LOG(DEBUG) << op_name_ << " reduceAxis at index : " << *axis_data; + reduceAxis |= 1u << format_axis_data; + axis_data++; + } + return reduceAxis; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ReduceFusion, ReduceTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.h new file mode 100644 index 00000000000..d01927f704b --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.h @@ -0,0 +1,44 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
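// Illustrative sketch (not part of the patch): addReduce() encodes the reduction axes
// as a bit mask with one bit per dimension, which is what GetAxis() above builds;
// axis -1 is first normalized to the last dimension. Helper name is hypothetical.
#include <cstdint>
#include <vector>

inline uint32_t AxesToReduceMask(const std::vector<int> &axes, int rank) {
  uint32_t mask = 0;
  for (int axis : axes) {
    int normalized = (axis == -1) ? rank - 1 : axis;
    mask |= 1u << normalized;
  }
  return mask;
}
// AxesToReduceMask({1, 2}, 4) == 0b0110: reduce over dimensions 1 and 2 of a 4-D tensor.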
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_ + +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ReduceTensorRT : public TensorRTOp { + public: + ReduceTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ReduceTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + uint32_t GetAxis(); + Format out_format_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.cc new file mode 100644 index 00000000000..7c9256992cb --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.cc @@ -0,0 +1,126 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
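// Illustrative preface (not part of the patch): with a group of N ranks, NCCL
// ReduceScatter leaves each rank holding 1/N of the reduced buffer, so the plugin
// below reports an output whose first dimension is the input's first dimension
// divided by the group size. Helper name is hypothetical.
#include <cstdint>

inline int64_t ReduceScatterOutputDim0(int64_t input_dim0, int rank_size) {
  return (input_dim0 + rank_size - 1) / rank_size;  // ceiling division, as kCEIL_DIV does
}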
+ */ + +#include "src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h" +#include +#include +#include "NvInferRuntimeCommon.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(ReduceScatterPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int ReduceScatterTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { +#ifndef LITE_CUDA_DISTRIBUTION + MS_LOG(ERROR) + << "Unsupported package for gpu distribution feature, please recompile with MS_ENABLE_CUDA_DISTRIBUTION set to on."; + return RET_ERROR; +#else + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +#endif +} + +int ReduceScatterTensorRT::AddInnerOp(TensorRTContext *ctx) { + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_}; + auto reduce_op = op_primitive_->value_as_ReduceScatter(); + if (reduce_op == nullptr) { + MS_LOG(ERROR) << "convert failed for " << op_name_; + return RET_ERROR; + } + auto reduce_mode = reduce_op->mode(); + auto rank = GetGPUGroupSize(); + auto plugin = std::make_shared(op_name_, reduce_mode, rank, device_id_); + MS_LOG(INFO) << op_name_ << " group size: " << rank << ", rank id: " << GetRankID(); + nvinfer1::IPluginV2Layer *reduce_scatter_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); + if (reduce_scatter_layer == nullptr) { + MS_LOG(ERROR) << "create ReduceScatter layer failed for: " << op_name_; + return RET_ERROR; + } + nvinfer1::ITensor *reduce_scatter_out = reduce_scatter_layer->getOutput(0); + reduce_scatter_layer->setName(op_name_.c_str()); + reduce_scatter_out->setName((op_name_ + "_output").c_str()); + this->layer_ = reduce_scatter_layer; + this->AddInnerOutTensors( + ITensorHelper{reduce_scatter_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} + +// ReduceScatterPlugin +int ReduceScatterPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) noexcept { + MS_LOG(INFO) << "ReduceScatter run at rank id: " << GetRankID() << " stream: " << stream; + nvinfer1::Dims output_dims = outputDesc[0].dims; + int recieve_element_cnt = + std::accumulate(output_dims.d, output_dims.d + output_dims.nbDims, 1, std::multiplies()); + const void *input = inputs[0]; + void *output = outputs[0]; + auto data_type = inputDesc->type; + auto ret = DistributionCollective::instance().ReduceScatterWrapper(input, output, recieve_element_cnt, data_type, + red_mode_, stream, NCCL_WORLD_GROUP); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ReduceScatter nccl run failed for " << layer_name_; + return ret; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *ReduceScatterPlugin::clone() const noexcept { + auto *plugin = new ReduceScatterPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs ReduceScatterPlugin::getOutputDimensions(int 
outputIndex, const nvinfer1::DimsExprs *inputs, + int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs out_dims{}; + out_dims.nbDims = inputs->nbDims; + auto rank_dim = exprBuilder.constant(rank_); + out_dims.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kCEIL_DIV, *inputs->d[0], *rank_dim); + for (int i = 1; i < inputs->nbDims; i++) { + out_dims.d[i] = inputs->d[i]; + } + return out_dims; +} + +size_t ReduceScatterPlugin::getSerializationSize() const noexcept { return sizeof(schema::ReduceMode); } + +void ReduceScatterPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &red_mode_, sizeof(schema::ReduceMode)); +} + +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ReduceScatter, ReduceScatterTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h new file mode 100644 index 00000000000..297397922a0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h @@ -0,0 +1,83 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_ +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" + +namespace mindspore::lite { +constexpr char *REDUCESCATTER_PLUGIN_NAME{"ReduceScatterPlugin"}; +class ReduceScatterTensorRT : public TensorRTOp { + public: + ReduceScatterTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ReduceScatterTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +class ReduceScatterPlugin : public TensorRTPlugin { + public: + ReduceScatterPlugin(const std::string name, schema::ReduceMode red_mode, int rank, uint32_t device_id) + : TensorRTPlugin(name, std::string(REDUCESCATTER_PLUGIN_NAME), device_id), red_mode_(red_mode), rank_(rank) {} + + ReduceScatterPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(REDUCESCATTER_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + red_mode_ = static_cast(fields[0].data)[0]; + rank_ = static_cast(fields[1].data)[0]; + } + + ReduceScatterPlugin(const char *name, const void *serialData, size_t 
serialLength) + : TensorRTPlugin(std::string(name), std::string(REDUCESCATTER_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &red_mode_, sizeof(schema::ReduceMode)); + DeserializeValue(&serialData, &serialLength, &rank_, sizeof(int)); + } + + ReduceScatterPlugin() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + int rank_{0}; + schema::ReduceMode red_mode_; +}; +class ReduceScatterPluginCreater : public TensorRTPluginCreater { + public: + ReduceScatterPluginCreater() : TensorRTPluginCreater(std::string(REDUCESCATTER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.cc new file mode 100644 index 00000000000..65776da3fcf --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.cc @@ -0,0 +1,230 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "src/runtime/delegate/tensorrt/op/resize_tensorrt.h" +#include "nnacl/nnacl_common.h" + +namespace mindspore::lite { +int ResizeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + } + resize_op_ = op_primitive_->value_as_Resize(); + if (resize_op_ == nullptr) { + MS_LOG(ERROR) << "convert failed " << op_name_; + return RET_ERROR; + } + if (resize_op_->method() == schema::ResizeMethod_LINEAR) { + MS_LOG(WARNING) << "TensorRT linear resize has precision issue, using cpu instead for " << op_name_; + return RET_ERROR; + } + dynamic_shape_params_.support_hw_dynamic_ = + (resize_op_->new_height() > 0 && resize_op_->new_width() > 0) ? 
false : true; + // constant new hw op don't support hw resize + return RET_OK; +} + +int ResizeTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + + nvinfer1::ITensor *resize_in_tensor = tensorrt_in_tensors_[0].trt_tensor_; + MS_LOG(DEBUG) << "origin input " << GetTensorFormat(tensorrt_in_tensors_[0]); + + if (resize_in_tensor->getDimensions().nbDims == DIMENSION_4D && tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + resize_in_tensor = transpose_layer->getOutput(0); + this->transpose_layer_ = transpose_layer; + } + MS_LOG(DEBUG) << "after transpose input " << GetTensorFormat(resize_in_tensor, Format::NCHW, false); + + nvinfer1::IResizeLayer *resize_layer = ctx->network()->addResize(*resize_in_tensor); + if (resize_layer == nullptr) { + MS_LOG(ERROR) << "create resize layer failed for " << op_name_; + return RET_ERROR; + } + int ret = SetOutputDims(resize_in_tensor, resize_layer); + if (ret != RET_OK) { + MS_LOG(ERROR) << "SetOutputDims failed for " << op_name_; + return RET_ERROR; + } + + ret = SetParams(resize_layer); + if (ret != RET_OK) { + MS_LOG(ERROR) << "SetParams failed for " << op_name_; + return RET_ERROR; + } + + resize_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{resize_layer->getOutput(0), Format::NCHW, false}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + this->layer_ = resize_layer; + return RET_OK; +} + +int ResizeTensorRT::SetOutputDims(nvinfer1::ITensor *resize_in_tensor, nvinfer1::IResizeLayer *resize_layer) { + nvinfer1::Dims in_dims = resize_in_tensor->getDimensions(); + if (in_tensors_.size() == 1 && !dynamic_shape_params_.support_dynamic_ && in_dims.nbDims == DIMENSION_4D) { + nvinfer1::Dims4 new_dims(in_dims.d[0], in_dims.d[1], resize_op_->new_height(), resize_op_->new_width()); // nchw + resize_layer->setOutputDimensions(new_dims); // static shape + } else if (in_tensors_.size() == 1 && !dynamic_shape_params_.support_hw_dynamic_ && + dynamic_shape_params_.support_dynamic_ && in_dims.nbDims == DIMENSION_4D) { + // hw is static, but has dynamic batch size + float scales[DIMENSION_4D]{1, 1, 1, 1}; + scales[kNCHW_H] = static_cast(resize_op_->new_height()) / static_cast(in_dims.d[kNCHW_H]); + scales[kNCHW_W] = static_cast(resize_op_->new_width()) / static_cast(in_dims.d[kNCHW_W]); + resize_layer->setScales(scales, DIMENSION_4D); + } else { + auto shape_value_tensor = in_tensors_[1]; + if (shape_value_tensor.Data() == nullptr && tensorrt_in_tensors_.size() >= INPUT_SIZE2) { + // dynamic output shape + resize_layer->setInput(1, *tensorrt_in_tensors_[1].trt_tensor_); + } else { + std::vector out_shape; + ParseValueFromShapeTensor(shape_value_tensor, &out_shape); + if (SameDims(out_shape, out_tensors_[0].Shape())) { + // static dims + if (out_shape.size() == DIMENSION_4D) { + // convert nhwc to nchw + auto channel = out_shape[out_shape.size() - 1]; + out_shape.insert(out_shape.begin() + 1, channel); + out_shape.erase(out_shape.begin() + out_shape.size() - 1); + } + resize_layer->setOutputDimensions(ConvertCudaDims(out_shape)); + } else if 
(IsScaleOutputDim(in_tensors_[0].Shape(), out_tensors_[0].Shape(), out_shape)) { + // scale dims + float scales[DIMENSION_4D]{1, 1, 1, 1}; + scales[kNCHW_H] = + static_cast(out_tensors_[0].Shape()[kNHWC_H]) / static_cast(in_tensors_[0].Shape()[kNHWC_H]); + scales[kNCHW_W] = + static_cast(out_tensors_[0].Shape()[kNHWC_W]) / static_cast(in_tensors_[0].Shape()[kNHWC_W]); + resize_layer->setScales(scales, DIMENSION_4D); + } else if (out_tensors_[0].Shape().size() == DIMENSION_4D) { + MS_LOG(DEBUG) << op_name_ << " output shape tensor value is const, but set to scales for dynamic input shape."; + float scales[out_tensors_[0].Shape().size()]; + for (size_t i = 0; i < out_tensors_[0].Shape().size(); i++) { + scales[i] = static_cast(out_tensors_[0].Shape()[i]) / static_cast(in_tensors_[0].Shape()[i]); + } + // change to nchw + scales[kNCHW_W] = scales[kNHWC_W]; + scales[kNCHW_H] = scales[kNHWC_H]; + scales[kNCHW_C] = 1; + MS_LOG(DEBUG) << op_name_ << "scale at H " << kNCHW_H << ": " << scales[kNCHW_H] << ", W " << kNCHW_W << ": " + << scales[kNCHW_W]; + resize_layer->setScales(scales, out_tensors_[0].Shape().size()); + } else { + MS_LOG(ERROR) << "resize dims needs check for " << op_name_; + return RET_ERROR; + } + } + } + return RET_OK; +} + +void ResizeTensorRT::ParseValueFromShapeTensor(const mindspore::MSTensor &shape_value_tensor, + std::vector *out_shape) { + switch (shape_value_tensor.DataType()) { + case DataType::kNumberTypeFloat32: { + const float *shape_data_fp32 = static_cast(shape_value_tensor.Data().get()); + for (int i = 0; i < shape_value_tensor.ElementNum(); i++) { + out_shape->push_back(*(shape_data_fp32 + i)); + } + break; + } + case DataType::kNumberTypeFloat16: { + const uint16_t *shape_data_fp16 = static_cast(shape_value_tensor.Data().get()); + for (int i = 0; i < shape_value_tensor.ElementNum(); i++) { + out_shape->push_back(ShortToFloat32(*(shape_data_fp16 + i))); + } + break; + } + case DataType::kNumberTypeInt32: { + const int *shape_data_fp16 = static_cast(shape_value_tensor.Data().get()); + for (int i = 0; i < shape_value_tensor.ElementNum(); i++) { + out_shape->push_back(*(shape_data_fp16 + i)); + } + break; + } + default: + MS_LOG(WARNING) << op_name_ + << " more datatype need to check: " << static_cast(shape_value_tensor.DataType()); + break; + } + if (out_shape->size() == DIMENSION_2D && + tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D) { + // out_shape: origin_n, out_shape[0], out_shape[1], origin_c + out_shape->insert(out_shape->begin(), + tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0]); // batch size is dynamic + out_shape->push_back(in_tensors_[0].Shape()[kNHWC_C]); // channel is const + } +} + +bool ResizeTensorRT::IsScaleOutputDim(const std::vector &in_shape, const std::vector &out_shape, + const std::vector &shape_tensor_val) { + if (out_shape.size() != DIMENSION_4D) { + MS_LOG(WARNING) << "dims count needs check for " << op_name_; + return false; + } + if (in_shape.size() != out_shape.size() || shape_tensor_val.size() != in_shape.size()) { + MS_LOG(WARNING) << "tensor shape is not same for " << op_name_; + return false; + } + for (size_t i = 0; i < in_shape.size(); i++) { + if (std::abs(in_shape[i] * shape_tensor_val[i] - out_shape[i]) > 1e-6) { + return false; + } + } + return true; +} + +int ResizeTensorRT::SetParams(nvinfer1::IResizeLayer *resize_layer) { + auto method = resize_op_->method(); + std::map method_map = { + {schema::ResizeMethod_LINEAR, nvinfer1::ResizeMode::kLINEAR}, + {schema::ResizeMethod_NEAREST, 
nvinfer1::ResizeMode::kNEAREST}}; + if (method_map.find(method) == method_map.end()) { + MS_LOG(ERROR) << op_name_ << " unsupported resize mode " << EnumNameResizeMethod(method); + return RET_ERROR; + } + resize_layer->setResizeMode(method_map.at(method)); + + // unsupported for trt6, but support setCoordinateTransformation() in version8 + auto coordinate_transform_mode = resize_op_->coordinate_transform_mode(); + if (coordinate_transform_mode != schema::CoordinateTransformMode_ASYMMETRIC) { + MS_LOG(WARNING) << op_name_ << " has coordinate_transform_mode may not supported: " + << EnumNameCoordinateTransformMode(coordinate_transform_mode); + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Resize, ResizeTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.h new file mode 100644 index 00000000000..645436caff3 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.h @@ -0,0 +1,52 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_ + +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ResizeTensorRT : public TensorRTOp { + public: + ResizeTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ResizeTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int SetOutputDims(nvinfer1::ITensor *resize_in_tensor, nvinfer1::IResizeLayer *resize_layer); + + void ParseValueFromShapeTensor(const mindspore::MSTensor &shape_value_tensor, std::vector *out_shape); + + bool IsScaleOutputDim(const std::vector &in_shape, const std::vector &out_shape, + const std::vector &shape_tensor_val); + + int SetParams(nvinfer1::IResizeLayer *resize_layer); + + const schema::Resize *resize_op_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.cc new file mode 100644 index 00000000000..02f2c0de383 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.cc @@ -0,0 +1,227 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/scale_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +constexpr int SCALE_INDEX = 1; +constexpr int SHIFT_INDEX = 2; +constexpr int POWER_INDEX = 3; + +int ScaleTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3 && in_tensors.size() != INPUT_SIZE4) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int ScaleTensorRT::AddInnerOp(TensorRTContext *ctx) { + CHECK_NULL_RETURN(ctx); + auto scale_op = op_primitive_->value_as_ScaleFusion(); + CHECK_NULL_RETURN(scale_op); + + schema::ActivationType activation_type = scale_op->activation_type(); + // mode of scale + axis_ = scale_op->axis(); + axis_ = axis_ < 0 ? static_cast(in_tensors_[0].Shape().size() + axis_) : axis_; + out_format_ = tensorrt_in_tensors_[0].format_; + out_same_format_ = tensorrt_in_tensors_[0].same_format_; + mode_ = GetScaleMode(axis_); + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(tensorrt_in_tensors_[0]); + + nvinfer1::ITensor *scale_in_tensor = PreProcessInputTensor(ctx); + if (scale_in_tensor == nullptr) { + MS_LOG(ERROR) << "PreProcessInputTensor failed: " << op_name_; + return RET_ERROR; + } + + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(scale_in_tensor, out_format_, out_same_format_); + + nvinfer1::ITensor *op_out_tensor{nullptr}; + if (scale_in_tensor->getDimensions().nbDims == DIMENSION_4D) { + op_out_tensor = RunAs4DimsScale(ctx, scale_in_tensor); + } else { + op_out_tensor = RunAsMutiDimsScale(ctx, scale_in_tensor); + } + CHECK_NULL_RETURN(op_out_tensor); + + // add activation + if (activation_type != schema::ActivationType::ActivationType_NO_ACTIVATION) { + auto activation_layer = ActivationTensorRT::AddActivation(ctx, activation_type, 0, 0, 0, op_out_tensor, device_id_); + CHECK_NULL_RETURN(activation_layer); + activation_layer->setName((op_name_ + "_activation").c_str()); + op_out_tensor = activation_layer->getOutput(0); + } + + op_out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{op_out_tensor, out_format_, out_same_format_}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +nvinfer1::ITensor *ScaleTensorRT::PreProcessInputTensor(TensorRTContext *ctx) { + nvinfer1::ITensor *scale_in_tensor = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + mode_ == nvinfer1::ScaleMode::kCHANNEL) { + // per channel input format should be 
nchw, otherwise should be same with scale nhwc + // transpose: NHWC->NCHW + if ((tensorrt_in_tensors_[0].format_ == Format::NHWC && axis_ == kNHWC_C) || + (tensorrt_in_tensors_[0].same_format_ == true && axis_ == kNHWC_C)) { + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return nullptr; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + scale_in_tensor = transpose_layer_in->getOutput(0); + out_format_ = Format::NCHW; + out_same_format_ = !out_same_format_; + } else if (out_format_ != Format::NCHW && axis_ != kNCHW_C) { + MS_LOG(WARNING) << op_name_ << " out format (NHWC:1, NCHW:0) infer as " << out_format_ << ", and axis is " + << axis_; + } + } else if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NCHW && mode_ == nvinfer1::ScaleMode::kELEMENTWISE) { + // transpose: NCHW->NHWC + nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return nullptr; + } + transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str()); + scale_in_tensor = transpose_layer_in->getOutput(0); + out_format_ = Format::NHWC; + out_same_format_ = true; + } + return scale_in_tensor; +} + +nvinfer1::ScaleMode ScaleTensorRT::GetScaleMode(int64_t axis) { + nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kUNIFORM; + auto input_data_shape = in_tensors_[0].Shape(); + auto input_weight_shape = in_tensors_[1].Shape(); + int total = std::accumulate(input_data_shape.begin(), input_data_shape.end(), 1, std::multiplies()); + if (input_weight_shape.size() == 0 || (input_weight_shape.size() == 1 && input_weight_shape[0] == 1)) { + mode = nvinfer1::ScaleMode::kUNIFORM; + } else if ((axis < static_cast(input_data_shape.size()) && input_weight_shape.size() == 1 && + input_data_shape[axis] == input_weight_shape[0]) || + (input_data_shape.size() == DIMENSION_4D && axis == DIMENSION_3D)) { + mode = nvinfer1::ScaleMode::kCHANNEL; + } else if (input_weight_shape.size() == 1 && input_weight_shape[0] == total) { + mode = nvinfer1::ScaleMode::kELEMENTWISE; + } else { + MS_LOG(ERROR) << "ScaleMode create failed: " << op_name_; + return mode; + } + MS_LOG(DEBUG) << op_name_ << " ScaleMode(UNIFORM 0, CHANNEL 1, ELEMENTWISE 2): " << static_cast(mode); + return mode; +} + +nvinfer1::ITensor *ScaleTensorRT::RunAs4DimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor) { + bool nd = false; + // (input * scale + shift) ^ power + nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, nullptr, 0}; + if (in_tensors_.size() > SCALE_INDEX) { + scale.values = in_tensors_[SCALE_INDEX].MutableData(); + MS_ASSERT(scale.values); + scale.count = in_tensors_[SCALE_INDEX].ElementNum(); + scale.type = ConvertDataType(in_tensors_[SCALE_INDEX].DataType()); + shift.type = scale.type; + power.type = scale.type; + nd = in_tensors_[1].Shape().size() == 1 ? 
false : true; + } + if (in_tensors_.size() > SHIFT_INDEX) { + shift.values = in_tensors_[SHIFT_INDEX].MutableData(); + MS_ASSERT(shift.values); + shift.count = in_tensors_[SHIFT_INDEX].ElementNum(); + } + if (in_tensors_.size() > POWER_INDEX) { + power.values = in_tensors_[POWER_INDEX].MutableData(); + MS_ASSERT(power.values); + power.count = in_tensors_[POWER_INDEX].ElementNum(); + } + nvinfer1::IScaleLayer *cal_layer = nullptr; + + if (nd) { + MS_LOG(WARNING) << "multi dims ScaleMode enter"; + cal_layer = ctx->network()->addScaleNd(*scale_in_tensor, mode_, shift, scale, power, axis_); + } else { + cal_layer = ctx->network()->addScale(*scale_in_tensor, mode_, shift, scale, power); + } + + if (cal_layer == nullptr) { + MS_LOG(ERROR) << "addScaleNd failed for: " << op_name_; + return nullptr; + } + cal_layer->setName(op_name_.c_str()); + this->layer_ = cal_layer; + return cal_layer->getOutput(0); +} + +nvinfer1::ITensor *ScaleTensorRT::RunAsMutiDimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor) { + auto scale_tensor = ConvertConstantTensorWithDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_); + if (scale_tensor == nullptr) { + MS_LOG(ERROR) << "ConvertConstantTensorWithDims failed for " << op_name_; + return nullptr; + } + auto mul_layer = + ctx->network()->addElementWise(*scale_in_tensor, *scale_tensor, nvinfer1::ElementWiseOperation::kPROD); + if (mul_layer == nullptr) { + MS_LOG(ERROR) << "add mul failed for " << op_name_; + return nullptr; + } + mul_layer->setName((op_name_ + "_scale").c_str()); + layer_ = mul_layer; + nvinfer1::ITensor *out_tensor = mul_layer->getOutput(0); + // add shift + if (in_tensors_.size() >= INPUT_SIZE3) { + auto shift_tensor = ConvertConstantTensorWithDims(ctx, in_tensors_[SHIFT_INDEX], in_tensors_[0].Shape(), op_name_); + if (shift_tensor == nullptr) { + MS_LOG(ERROR) << "ConvertConstantTensorWithDims failed for " << op_name_; + return nullptr; + } + auto shift_layer = ctx->network()->addElementWise(*out_tensor, *shift_tensor, nvinfer1::ElementWiseOperation::kSUM); + if (shift_layer == nullptr) { + MS_LOG(ERROR) << "add bias failed for " << op_name_; + return nullptr; + } + shift_layer->setName((op_name_ + "_shift").c_str()); + out_tensor = shift_layer->getOutput(0); + } + if (in_tensors_.size() == INPUT_SIZE4) { + MS_LOG(WARNING) << op_name_ << " has power"; + return nullptr; + } + return out_tensor; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ScaleFusion, ScaleTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.h new file mode 100644 index 00000000000..463b7813549 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.h @@ -0,0 +1,57 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
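// Illustrative sketch (not part of the patch): IScaleLayer, used by the Scale op
// above, evaluates (input * scale + shift) ^ power element-wise; the ScaleMode only
// decides how many coefficients are supplied (kUNIFORM: one scalar, kCHANNEL: one per
// channel, kELEMENTWISE: one per element). Scalar reference, hypothetical name:
#include <cmath>

inline float ScaleFusionReference(float x, float scale, float shift, float power) {
  return std::pow(x * scale + shift, power);
}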
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +namespace mindspore::lite { +class ScaleTensorRT : public TensorRTOp { + public: + ScaleTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ScaleTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ScaleMode GetScaleMode(int64_t axis); + + nvinfer1::ITensor *PreProcessInputTensor(TensorRTContext *ctx); + + nvinfer1::ITensor *RunAs4DimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor); + + nvinfer1::ITensor *RunAsMutiDimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor); + + Format out_format_; + + bool out_same_format_{false}; + + nvinfer1::ScaleMode mode_; + + int64_t axis_{0}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.cc new file mode 100644 index 00000000000..58d948ab6d2 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.cc @@ -0,0 +1,99 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
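// Illustrative preface (not part of the patch): nvinfer1::ScatterMode::kND, used
// below, starts from a copy of the data tensor and overwrites the elements addressed
// by `indices` with `updates`. A minimal 1-D reference, with hypothetical names:
#include <cstddef>
#include <vector>

inline std::vector<float> ScatterNdUpdate1D(std::vector<float> data,
                                            const std::vector<int> &indices,
                                            const std::vector<float> &updates) {
  for (size_t i = 0; i < indices.size() && i < updates.size(); ++i) {
    data[indices[i]] = updates[i];  // last write wins for duplicate indices
  }
  return data;
}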
+ */ + +#include +#include "src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int ScatterNdTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { +#if TRT_VERSION_GE(8, 2) + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size() << " : " << op_name_; + return RET_ERROR; + } + + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size() << " : " << op_name_; + return RET_ERROR; + } + return RET_OK; +#else + MS_LOG(WARNING) << "low TensorRT version don't support Scatter op, please upgrade TensorRT version to 8.2 or higher"; + return RET_ERROR; +#endif +} + +int ScatterNdTensorRT::AddInnerOp(TensorRTContext *ctx) { +#if TRT_VERSION_GE(8, 2) + ITensorHelper scatter_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &scatter_input); + if (ret != RET_OK || scatter_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_; + return ret; + } + if (tensorrt_in_tensors_.size() < INPUT_SIZE3) { + auto indices = ConvertConstantTensor(ctx, in_tensors_[1], op_name_ + "_indice"); + if (indices == nullptr) { + MS_LOG(ERROR) << "add const input tensor failed for " << op_name_; + return RET_ERROR; + } + tensorrt_in_tensors_.push_back(ITensorHelper{indices}); + auto updates = ConvertConstantTensor(ctx, in_tensors_[INPUT_SIZE2], op_name_ + "_update"); + if (updates == nullptr) { + MS_LOG(ERROR) << "add const input tensor failed for " << op_name_; + return RET_ERROR; + } + tensorrt_in_tensors_.push_back(ITensorHelper{updates}); + } + ITensorHelper indices_helper; + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1], &indices_helper); + if (ret != RET_OK || indices_helper.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim indices tensor failed for " << op_name_; + return ret; + } + ITensorHelper updates_helper; + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[INPUT_SIZE2], &updates_helper); + if (ret != RET_OK || updates_helper.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim update tensor failed for " << op_name_; + return ret; + } + + nvinfer1::IScatterLayer *scatter_layer = ctx->network()->addScatter( + *scatter_input.trt_tensor_, *indices_helper.trt_tensor_, *updates_helper.trt_tensor_, nvinfer1::ScatterMode::kND); + if (scatter_layer == nullptr) { + MS_LOG(ERROR) << "addScatter failed for TensorRT."; + return RET_ERROR; + } + + nvinfer1::ITensor *out_tensor = scatter_layer->getOutput(0); + out_tensor->setName((op_name_ + "_0").c_str()); + this->AddInnerOutTensors(ITensorHelper{out_tensor, scatter_input.format_, scatter_input.same_format_}); + this->layer_ = scatter_layer; + return RET_OK; +#else + MS_LOG(WARNING) << "low TensorRT version don't support Scatter op, please upgrade TensorRT version to 8.2 or higher"; + return RET_ERROR; +#endif +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ScatterNdUpdate, ScatterNdTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h new file mode 100644 index 
00000000000..c8954d206aa --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h @@ -0,0 +1,39 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ScatterNdTensorRT : public TensorRTOp { + public: + ScatterNdTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ScatterNdTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.cc new file mode 100644 index 00000000000..99c016a3665 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.cc @@ -0,0 +1,69 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
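// Illustrative preface (not part of the patch): the Shape implementation below
// transposes an NCHW input back to NHWC before addShape(), so the emitted 1-D Int32
// shape tensor is reported in the framework's NHWC order. For a static shape the same
// reordering is simply:
#include <cstdint>
#include <vector>

inline std::vector<int64_t> NchwShapeToNhwc(const std::vector<int64_t> &nchw) {
  return {nchw[0], nchw[2], nchw[3], nchw[1]};  // N, H, W, C
}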
+ */ + +#include "src/runtime/delegate/tensorrt/op/shape_tensorrt.h" + +namespace mindspore::lite { +int ShapeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +} +int ShapeTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + nvinfer1::ITensor *shape_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NCHW) { + // transpose: NCHW->NHWC + nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NCHW->NHWC failed for " << op_name_; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str()); + shape_input = transpose_layer_in->getOutput(0); + this->transpose_layer_ = transpose_layer_in; + } + nvinfer1::IShapeLayer *shape_layer = ctx->network()->addShape(*shape_input); + + if (shape_layer == nullptr) { + MS_LOG(ERROR) << "add shape op failed for TensorRT."; + return RET_ERROR; + } + shape_layer->setName(op_name_.c_str()); + shape_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{shape_layer->getOutput(0), Format::NHWC, true}); + this->layer_ = shape_layer; + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Shape, ShapeTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.h new file mode 100644 index 00000000000..f7cce06daa4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.h @@ -0,0 +1,38 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHAPE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHAPE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ShapeTensorRT : public TensorRTOp { + public: + ShapeTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ShapeTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHAPE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.cc new file mode 100644 index 00000000000..53886a2d0cb --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.cc @@ -0,0 +1,437 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
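// Illustrative preface (not part of the patch): Squeeze, Unsqueeze, Reshape,
// Transpose, Flatten, ExpandDims and BroadcastTo are all lowered below onto a single
// IShuffleLayer. For Squeeze the target shape is the input shape with the listed
// size-1 axes erased; a sketch assuming the axes are stored in ascending order:
#include <cstdint>
#include <vector>

inline std::vector<int64_t> SqueezeShape(std::vector<int64_t> shape, const std::vector<int> &axes) {
  for (auto it = axes.rbegin(); it != axes.rend(); ++it) {
    shape.erase(shape.begin() + *it);  // erase from the back so earlier indices stay valid
  }
  return shape;
}
// SqueezeShape({1, 32, 1, 8}, {0, 2}) -> {32, 8}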
+ */ + +#include "src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h" +#include +#include +#include + +namespace mindspore::lite { +int ShuffleTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + switch (type_) { + case schema::PrimitiveType_Flatten: + case schema::PrimitiveType_Unsqueeze: { + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported in_tensors size " << in_tensors.size() << " of " + << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + break; + } + case schema::PrimitiveType_Squeeze: { + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported in_tensors size " << in_tensors.size() << " of " + << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + auto squeeze_op = this->op_primitive_->value_as_Squeeze(); + if (squeeze_op == nullptr) { + MS_LOG(ERROR) << "SqueezeOp convert failed"; + return RET_ERROR; + } + param_axis_ = squeeze_op->axis(); + if (param_axis_ == nullptr) { + MS_LOG(WARNING) << op_name_ << " is a full dim squeeze, don't support dynamic input shape."; + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + } + break; + } + case schema::PrimitiveType_Reshape: { + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "PrimitiveType_Transpose Unsupported in_tensors size: " << in_tensors.size(); + return RET_ERROR; + } + dynamic_shape_params_.support_hw_dynamic_ = false; + if (in_tensors[0].Shape()[0] != out_tensors[0].Shape()[0]) { + dynamic_shape_params_.support_dynamic_ = false; + } + break; + } + case schema::PrimitiveType_Transpose: + case schema::PrimitiveType_ExpandDims: + case schema::PrimitiveType_BroadcastTo: { + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "PrimitiveType_Transpose Unsupported in_tensors size: " << in_tensors.size(); + return RET_ERROR; + } + if (in_tensors[1].Data() == nullptr) { + MS_LOG(ERROR) << "Unsupported shape tensor of " << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + break; + } + default: { + MS_LOG(ERROR) << "Unsupported op type:" << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensort size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int ShuffleTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + ctx_ = ctx; + + int ret = InputTensorPreprocess(); + if (ret != RET_OK || shuffler_input_ == nullptr) { + MS_LOG(ERROR) << "InputTensorPreprocess failed for " << op_name_; + return RET_ERROR; + } + + nvinfer1::IShuffleLayer *shuffle_layer = ctx->network()->addShuffle(*shuffler_input_); + if (shuffle_layer == nullptr) { + MS_LOG(ERROR) << "add Shuffle op failed for TensorRT."; + return RET_ERROR; + } + shuffle_layer->setName(op_name_.c_str()); + this->layer_ = shuffle_layer; + + ret = RET_OK; + switch (type_) { + case schema::PrimitiveType_Unsqueeze: { + ret = AddUnsqueezeOp(shuffle_layer); + break; + } + case schema::PrimitiveType_Squeeze: { + ret = AddSqueezeOp(shuffle_layer); + break; + } + case schema::PrimitiveType_Transpose: { + ret = AddTransposeOp(shuffle_layer); + break; + } + case schema::PrimitiveType_Reshape: { + ret = AddReshapeOp(shuffle_layer); + break; + } + case 
schema::PrimitiveType_Flatten: { + ret = AddFlattenOp(shuffle_layer); + break; + } + case schema::PrimitiveType_ExpandDims: { + ret = AddExpandDimsOp(shuffle_layer); + break; + } + case schema::PrimitiveType_BroadcastTo: { + ret = AddBroadcastToOp(shuffle_layer); + break; + } + default: + MS_LOG(ERROR) << "Unsupported op type for " << op_name_; + return RET_ERROR; + } + if (ret != RET_OK) { + MS_LOG(ERROR) << "AddOp failed for " << op_name_; + return ret; + } + + if (shuffler_output_ == nullptr) { + MS_LOG(ERROR) << "output tensor create failed for " << op_name_; + return RET_ERROR; + } + shuffler_output_->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{shuffler_output_, out_format_, true}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +int ShuffleTensorRT::InputTensorPreprocess() { + shuffler_input_ = tensorrt_in_tensors_[0].trt_tensor_; + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(tensorrt_in_tensors_[0]); + out_format_ = tensorrt_in_tensors_[0].format_; + if (shuffler_input_->getDimensions().nbDims == DIMENSION_4D && !tensorrt_in_tensors_[0].same_format_) { + // input tensor support NCHW format input + if (tensorrt_in_tensors_[0].format_ == Format::NCHW) { + // for transpose op, if tensor has same dim with ms tensor, keep origin dims + nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx_, *shuffler_input_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + shuffler_input_ = transpose_layer->getOutput(0); + out_format_ = Format::NHWC; + } else if (tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // infer format may error, correct here + nvinfer1::IShuffleLayer *transpose_layer = NHWC2NCHW(ctx_, *shuffler_input_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + shuffler_input_ = transpose_layer->getOutput(0); + out_format_ = Format::NCHW; + } + } + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(shuffler_input_, out_format_, true); + return RET_OK; +} + +int ShuffleTensorRT::AddSqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer) { + // axis + auto squeeze_shape = shuffler_input_->getDimensions(); + std::vector new_shape(squeeze_shape.d, squeeze_shape.d + squeeze_shape.nbDims); + if (param_axis_ == nullptr) { + MS_LOG(WARNING) << op_name_ << " has null axis, output shape is totally depends on ms tensor."; + new_shape = out_tensors_[0].Shape(); + } else { + for (int i = param_axis_->size() - 1; i >= 0; i--) { + if (new_shape[param_axis_->Get(i)] != 1) { + MS_LOG(WARNING) << "squeeze_shape value at " << i << " is " << param_axis_->Get(i) << ", need check " + << op_name_; + } + new_shape.erase(new_shape.begin() + param_axis_->Get(i)); + } + } + + nvinfer1::Dims squeeze_dims = lite::ConvertCudaDims(new_shape); + if (squeeze_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + shuffle_layer->setReshapeDimensions(squeeze_dims); + shuffler_output_ = shuffle_layer->getOutput(0); + return shuffler_output_ == nullptr ? 
RET_ERROR : RET_OK; +} + +int ShuffleTensorRT::AddUnsqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer) { + // Unsqueeze + auto unsqueeze_op = this->op_primitive_->value_as_Unsqueeze(); + if (unsqueeze_op == nullptr) { + MS_LOG(ERROR) << "AddUnsqueezeOp convert failed"; + return RET_ERROR; + } + // axis + param_axis_ = unsqueeze_op->axis(); + if (param_axis_ == nullptr) { + MS_LOG(ERROR) << "axis is invalid for " << op_name_; + return RET_ERROR; + } + if (param_axis_->size() != 1) { + MS_LOG(WARNING) << op_name_ << " has unsqueeze axis size: " << param_axis_->size(); + } + nvinfer1::ITensor *expand_input = shuffler_input_; + for (size_t i = 0; i < param_axis_->size(); i++) { + expand_input = ExpandDim(shuffle_layer, expand_input, param_axis_->Get(i)); + } + shuffler_output_ = expand_input; + return shuffler_output_ == nullptr ? RET_ERROR : RET_OK; +} + +int ShuffleTensorRT::AddTransposeOp(nvinfer1::IShuffleLayer *shuffle_layer) { + if (in_tensors_[0].Shape().size() != in_tensors_[1].ElementNum()) { + MS_LOG(WARNING) << "transpose perm is invalid for input, ignore " << op_name_; + shuffler_output_ = shuffler_input_; + return RET_OK; + } + auto transpose_op = this->op_primitive_->value_as_Transpose(); + if (transpose_op == nullptr) { + MS_LOG(ERROR) << "AddTransposeOp convert failed"; + return RET_ERROR; + } + // perm + mindspore::MSTensor perm_ternsor = in_tensors_[1]; + if (perm_ternsor.Data() == nullptr) { + MS_LOG(ERROR) << "AddTransposeOp perm_ternsor data is invalid: " << op_name_; + return RET_ERROR; + } + int *perm_data = reinterpret_cast(perm_ternsor.MutableData()); + + nvinfer1::Permutation perm{}; + for (int i = 0; i < perm_ternsor.ElementNum(); i++) { + perm.order[i] = *perm_data; + perm_data++; + } + shuffle_layer->setFirstTranspose(perm); + if (perm_ternsor.ElementNum() == DIMENSION_4D) { + if (perm.order[kNCHW_C] == kNHWC_C && perm.order[kNCHW_H] == kNHWC_H && perm.order[kNCHW_W] == kNHWC_W) { + out_format_ = Format::NCHW; + } else if (perm.order[kNHWC_H] == kNCHW_H && perm.order[kNHWC_W] == kNCHW_W && perm.order[kNHWC_C] == kNCHW_C) { + out_format_ = Format::NHWC; + } else { + MS_LOG(INFO) << "input format and perm order is not NHWC or NCHW: " << op_name_; + } + } + shuffler_output_ = shuffle_layer->getOutput(0); + return RET_OK; +} + +int ShuffleTensorRT::AddReshapeOp(nvinfer1::IShuffleLayer *shuffle_layer) { + mindspore::MSTensor &shape_tensor = in_tensors_[1]; + if (shape_tensor.Data() != nullptr) { + // static shuffle layer + shuffle_layer->setReshapeDimensions( + InferReshapeDims(shuffler_input_->getDimensions(), in_tensors_[0].Shape(), out_tensors_[0].Shape())); + } else { + if (tensorrt_in_tensors_.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid shape tensor for reshape " << op_name_; + return RET_ERROR; + } + shuffle_layer->setInput(1, *tensorrt_in_tensors_[1].trt_tensor_); + } + shuffler_output_ = shuffle_layer->getOutput(0); + return RET_OK; +} + +int ShuffleTensorRT::AddFlattenOp(nvinfer1::IShuffleLayer *shuffle_layer) { + nvinfer1::Dims flatten_dims; + const std::vector &input_shape = in_tensors_[0].Shape(); + flatten_dims.nbDims = DIMENSION_2D; + flatten_dims.d[0] = tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0] == -1 + ? 
0 + : tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0]; + flatten_dims.d[1] = std::accumulate(input_shape.begin() + 1, input_shape.end(), 1, std::multiplies()); + if (flatten_dims.d[1] <= 0) { + MS_LOG(ERROR) << op_name_ << "infer shape failed"; + } + shuffle_layer->setReshapeDimensions(flatten_dims); + shuffler_output_ = shuffle_layer->getOutput(0); + return RET_OK; +} + +int ShuffleTensorRT::AddExpandDimsOp(nvinfer1::IShuffleLayer *shuffle_layer) { + if (in_tensors_[1].DataType() != DataType::kNumberTypeInt32) { + MS_LOG(WARNING) << op_name_ << " axis tensor data type is " << static_cast(in_tensors_[1].DataType()); + } + auto axis_data = static_cast(in_tensors_[1].Data().get()); + int axis = axis_data[0]; + shuffler_output_ = ExpandDim(shuffle_layer, shuffler_input_, axis); + return shuffler_output_ == nullptr ? RET_ERROR : RET_OK; +} + +int ShuffleTensorRT::AddBroadcastToOp(nvinfer1::IShuffleLayer *shuffle_layer) { + if (out_tensors_[0].ElementNum() != in_tensors_[0].ElementNum() && + out_tensors_[0].Shape().size() == in_tensors_[0].Shape().size()) { + MS_LOG(WARNING) << "broadcast element cnt changes, ignore broadcast for " << op_name_; + shuffle_layer->setReshapeDimensions(shuffler_input_->getDimensions()); + } else if (out_tensors_[0].ElementNum() == in_tensors_[0].ElementNum()) { + nvinfer1::Dims new_dims = ConvertCudaDims(out_tensors_[0].Shape()); + if (new_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + new_dims.d[0] = shuffler_input_->getDimensions().d[0]; + shuffle_layer->setReshapeDimensions(new_dims); + } else { + MS_LOG(ERROR) << "broadcast needs check for " << op_name_; + } + shuffler_output_ = shuffle_layer->getOutput(0); + return shuffler_output_ == nullptr ? RET_ERROR : RET_OK; +} + +nvinfer1::ITensor *ShuffleTensorRT::ExpandDim(nvinfer1::IShuffleLayer *shuffle_layer, nvinfer1::ITensor *input_tensor, + int axis) { + auto input_dims = input_tensor->getDimensions(); + // if expand dim not at last dim and shape is dynamic, change to expanddim at last dim and transpose + bool special_expand = false; + for (int i = 0; i < input_dims.nbDims; i++) { + special_expand = special_expand || input_dims.d[i] == -1; + } + special_expand = special_expand && (axis != -1 && axis != input_dims.nbDims - 1); + + if (special_expand) { + std::vector new_shape; + for (int i = 0; i < input_dims.nbDims; i++) { + new_shape.push_back(input_dims.d[i] == -1 ? 0 : input_dims.d[i]); + } + new_shape.push_back(1); + nvinfer1::Dims new_dims = ConvertCudaDims(new_shape); + if (new_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return nullptr; + } + shuffle_layer->setReshapeDimensions(new_dims); + // transpose + nvinfer1::Permutation perm{}; + for (int i = 0; i < new_dims.nbDims; i++) { + if (i < axis) { + perm.order[i] = i; + } else if (i == axis) { + perm.order[i] = new_dims.nbDims - 1; + } else { + perm.order[i] = i - 1; + } + } + nvinfer1::IShuffleLayer *trans_layer = ctx_->network()->addShuffle(*shuffle_layer->getOutput(0)); + if (trans_layer == nullptr) { + MS_LOG(ERROR) << "add transpose layer failed for special expand dims op " << op_name_; + return nullptr; + } + trans_layer->setFirstTranspose(perm); + return trans_layer->getOutput(0); + } else { + std::vector new_shape; + for (int i = 0; i < input_dims.nbDims; i++) { + if (axis == i) { + new_shape.push_back(1); + } + new_shape.push_back(input_dims.d[i] == -1 ? 
0 : input_dims.d[i]); + } + if (axis == -1 || axis == input_dims.nbDims) { + new_shape.push_back(1); + } + nvinfer1::Dims new_dims = ConvertCudaDims(new_shape); + if (new_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return nullptr; + } + shuffle_layer->setReshapeDimensions(new_dims); + return shuffle_layer->getOutput(0); + } +} + +nvinfer1::Dims ShuffleTensorRT::InferReshapeDims(const nvinfer1::Dims &input_dims, + const std::vector &ms_input_shape, + const std::vector &ms_output_shape) { + // tensorrt support infer shape of 0 and -1 + nvinfer1::Dims reshape_dims = ConvertCudaDims(ms_output_shape); + if (reshape_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return reshape_dims; + } + for (int i = 0; i < reshape_dims.nbDims; i++) { + if (input_dims.d[i] == -1) { + if (ms_input_shape[i] == ms_output_shape[i]) { + reshape_dims.d[i] = 0; + } else { + reshape_dims.d[i] = -1; + } + } + MS_LOG(DEBUG) << "reshape infer_index " << i << " value: " << reshape_dims.d[i]; + } + return reshape_dims; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Unsqueeze, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Squeeze, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Reshape, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Transpose, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Flatten, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ExpandDims, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_BroadcastTo, ShuffleTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h new file mode 100644 index 00000000000..d326c37588a --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h @@ -0,0 +1,58 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHUFFLE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHUFFLE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +class ShuffleTensorRT : public TensorRTOp { + public: + ShuffleTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ShuffleTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int InputTensorPreprocess(); + int AddSqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddUnsqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddTransposeOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddReshapeOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddFlattenOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddExpandDimsOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddBroadcastToOp(nvinfer1::IShuffleLayer *shuffle_layer); + nvinfer1::ITensor *ExpandDim(nvinfer1::IShuffleLayer *shuffle_layer, nvinfer1::ITensor *input_tensor, int axis); + nvinfer1::Dims InferReshapeDims(const nvinfer1::Dims &input_dims, const std::vector &ms_input_shape, + const std::vector &ms_output_shape); + + Format out_format_ = Format::NHWC; + nvinfer1::ITensor *shuffler_input_{nullptr}; + nvinfer1::ITensor *shuffler_output_{nullptr}; + TensorRTContext *ctx_{nullptr}; + const flatbuffers::Vector *param_axis_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHUFFLE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.cc new file mode 100644 index 00000000000..1908acceea3 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.cc @@ -0,0 +1,281 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/slice_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +namespace { +class StrideSliceTensorRTUtil final : public SliceTensorRTUtil { + public: + StrideSliceTensorRTUtil() = default; + ~StrideSliceTensorRTUtil() = default; + bool IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + if (in_tensors.size() < HAS_AXIS - 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return false; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return false; + } + if (in_tensors.at(BEGINS_INDEX).Data() == nullptr || in_tensors.at(ENDS_INDEX).Data() == nullptr) { + MS_LOG(ERROR) << "invalid input tensor for: " << op_name_; + return false; + } + return true; + } + std::tuple GetSliceParams( + const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + const mindspore::MSTensor &begin = in_tensors.at(BEGINS_INDEX); + const mindspore::MSTensor &stride = in_tensors.back(); + const mindspore::MSTensor &end = in_tensors.at(ENDS_INDEX); + + nvinfer1::Dims start_dims; + nvinfer1::Dims size_dims; + nvinfer1::Dims stride_dims; + + size_t axis_index = in_tensors.size() == HAS_AXIS ? AXIS_INDEX : -1; + auto out_shape = out_tensors.front().Shape(); + if (static_cast(begin.ElementNum()) == in_tensors.at(0).Shape().size()) { + start_dims = lite::ConvertCudaDims(begin.Data().get(), begin.ElementNum()); + if (shrink_axis_ == 0) { + size_dims = lite::ConvertCudaDims(out_shape); + } else { + size_dims.nbDims = start_dims.nbDims; + auto end_dims = lite::ConvertCudaDims(end.Data().get(), end.ElementNum()); + for (int i = 0; i < size_dims.nbDims; i++) { + size_dims.d[i] = end_dims.d[i] - start_dims.d[i]; + } + } + stride_dims = lite::ConvertCudaDims(stride.Data().get(), stride.ElementNum()); + } else { + if (axis_index == -1 || in_tensors.at(axis_index).ElementNum() != 1) { + MS_LOG(ERROR) << "invalid input params for " << op_name_; + return {}; + } + int axis_value = *(static_cast(in_tensors.at(axis_index).Data().get())); + int start_value = *(static_cast(begin.Data().get())); + start_dims.nbDims = in_tensors.at(0).Shape().size(); + for (int i = 0; i < start_dims.nbDims; i++) { + start_dims.d[i] = (i == axis_value) ? 
start_value : 0; + } + + size_dims = lite::ConvertCudaDims(out_shape); + int stride_value = *(static_cast(stride.Data().get())); + stride_dims = nvinfer1::Dims{size_dims.nbDims, {}}; + std::fill(stride_dims.d, stride_dims.d + stride_dims.nbDims, stride_value); + } + return std::make_tuple(start_dims, size_dims, stride_dims); + } + nvinfer1::ITensor *PostProcess(TensorRTContext *ctx, nvinfer1::ITensor *input, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (shrink_axis_ != 0) { + return Reshape(ctx, input, out_tensors.at(0).Shape()); + } + return input; + } + void SetShrinkAxis(int shrink_axis) { shrink_axis_ = shrink_axis; } + + private: + int shrink_axis_; +}; + +class SliceFusionTensorRTUtil final : public SliceTensorRTUtil { + public: + SliceFusionTensorRTUtil() = default; + ~SliceFusionTensorRTUtil() = default; + bool IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + if (in_tensors.size() != SLICE_INPUT_SIZE) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return false; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return false; + } + return true; + } + std::tuple GetSliceParams( + const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + const auto &input = in_tensors.at(0); + const auto &begin = in_tensors.at(1); + const auto &size = in_tensors.at(SIZE_INDEX); + + auto start_dims = lite::ConvertCudaDims(begin.Data().get(), begin.ElementNum()); + auto size_dims = lite::ConvertCudaDims(size.Data().get(), size.ElementNum()); + auto stride_dims = lite::ConvertCudaDims(1, begin.ElementNum()); + + return std::make_tuple(start_dims, size_dims, stride_dims); + } +}; + +class CropTensorRTUtil final : public SliceTensorRTUtil { + public: + CropTensorRTUtil() = default; + ~CropTensorRTUtil() = default; + bool IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + if (in_tensors.size() != CROP_INPUT_SIZE) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return false; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return false; + } + auto crop_primitive = primitive->value_as_Crop(); + if (crop_primitive == nullptr) { + MS_LOG(ERROR) << "Cast primitive to crop fail"; + return false; + } + axis_ = static_cast(crop_primitive->axis()); + auto offsets_ptr = crop_primitive->offsets(); + if (offsets_ptr == nullptr) { + MS_LOG(ERROR) << "Crop Op do not have offset attr"; + return false; + } + if (axis_ < 0) { + axis_ += in_tensors.at(0).Shape().size(); + } + if (axis_ < 0 || axis_ + offsets_ptr->size() != in_tensors.at(0).Shape().size()) { + MS_LOG(ERROR) << "axis and offsets not match input tensor shape, axis is " << crop_primitive->axis() + << " , offsets size is " << offsets_ptr->size() << " , input size is " + << in_tensors.at(0).Shape().size(); + return false; + } + if (in_tensors.at(0).Shape().size() != in_tensors.at(1).Shape().size()) { + MS_LOG(ERROR) << "input tensor 0 and 1 size not equal," + << " input 0 size is " << in_tensors.at(0).Shape().size() << " , input tensor 1 size is " + << in_tensors.at(1).Shape().size(); + return false; + } + return true; + } + std::tuple GetSliceParams( + const schema::Primitive *primitive, const std::vector 
&in_tensors, + const std::vector &out_tensors) override { + auto crop_primitive = primitive->value_as_Crop(); + auto offsets_ptr = crop_primitive->offsets(); + + std::vector begin(in_tensors.at(0).Shape().size(), 0); + for (size_t i = 0; i != offsets_ptr->size(); ++i) { + begin[axis_ + i] = offsets_ptr->Get(i); + } + + std::vector size(in_tensors.at(0).Shape().size()); + for (size_t i = 0; i != size.size(); ++i) { + size[i] = in_tensors.at(1).Shape().at(i); + } + + auto start_dims = lite::ConvertCudaDims(&begin[0], begin.size()); + auto size_dims = lite::ConvertCudaDims(&size[0], size.size()); + auto stride_dims = lite::ConvertCudaDims(1, begin.size()); + + return std::make_tuple(start_dims, size_dims, stride_dims); + } + + private: + int axis_; +}; +} // namespace + +SliceTensorRT::SliceTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) { + if (primitive->value_type() == schema::PrimitiveType_StridedSlice) { + auto slice_fusion_util = std::make_unique(); + slice_fusion_util->SetShrinkAxis(primitive->value_as_StridedSlice()->shrink_axis_mask()); + util_ = std::move(slice_fusion_util); + } else if (primitive->value_type() == schema::PrimitiveType_SliceFusion) { + util_ = std::make_unique(); + } else if (primitive->value_type() == schema::PrimitiveType_Crop) { + util_ = std::make_unique(); + } else { + util_ = nullptr; + } + if (util_ != nullptr) { + util_->op_name_ = op_name_; + } +} + +int SliceTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (util_ == nullptr) { + MS_LOG(ERROR) << "Unsupported op_type: " << op_name_; + return RET_ERROR; + } + if (!util_->IsSupport(primitive, in_tensors, out_tensors)) { + return RET_ERROR; + } + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +} + +int SliceTensorRT::AddInnerOp(TensorRTContext *ctx) { + ITensorHelper slice_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &slice_input); + if (ret != RET_OK || slice_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_; + return RET_ERROR; + } + + nvinfer1::Dims start_dims; + nvinfer1::Dims size_dims; + nvinfer1::Dims stride_dims; + std::tie(start_dims, size_dims, stride_dims) = util_->GetSliceParams(op_primitive_, in_tensors_, out_tensors_); + if (start_dims.nbDims == -1 || size_dims.nbDims == -1 || stride_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + + nvinfer1::ISliceLayer *slice_layer = + ctx->network()->addSlice(*slice_input.trt_tensor_, start_dims, size_dims, stride_dims); + if (slice_layer == nullptr) { + MS_LOG(ERROR) << "add Slice op failed for TensorRT: " << op_name_; + return RET_ERROR; + } + this->layer_ = slice_layer; + slice_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *out_tensor = slice_layer->getOutput(0); + out_tensor = util_->PostProcess(ctx, out_tensor, in_tensors_, out_tensors_); + if (out_tensor == nullptr) { + MS_LOG(ERROR) << "output tensor create failed"; + return RET_ERROR; + } + out_tensor->setName((op_name_ + "_output").c_str()); + + 
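+ // Note: util_->PostProcess is a pass-through for SliceFusion and Crop; only the StridedSlice util
+ // overrides it, reshaping the sliced tensor to the ms output shape when shrink_axis_mask is set so
+ // that the shrunk axes are dropped. The helper registered below keeps the input's format flags, so
+ // downstream ops can tell whether an NHWC/NCHW transpose is still needed.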
this->AddInnerOutTensors(ITensorHelper{out_tensor, slice_input.format_, slice_input.same_format_}); + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_StridedSlice, SliceTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_SliceFusion, SliceTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Crop, SliceTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.h new file mode 100644 index 00000000000..e1f82cbe183 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.h @@ -0,0 +1,66 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SLICE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SLICE_TENSORRT_H_ +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class SliceTensorRTUtil { + public: + SliceTensorRTUtil() = default; + virtual ~SliceTensorRTUtil() = default; + virtual bool IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) = 0; + virtual std::tuple GetSliceParams( + const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) = 0; + virtual nvinfer1::ITensor *PostProcess(TensorRTContext *ctx, nvinfer1::ITensor *input, + const std::vector &in_tensors, + const std::vector &out_tensors) { + return input; + } + std::string op_name_; +}; + +constexpr int BEGINS_INDEX = 1; +constexpr int ENDS_INDEX = 2; +constexpr int SIZE_INDEX = 2; +constexpr int HAS_AXIS = 5; +constexpr int AXIS_INDEX = 3; +constexpr int CROP_INPUT_SIZE = 2; +constexpr int SLICE_INPUT_SIZE = 3; +class SliceTensorRT : public TensorRTOp { + public: + SliceTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type); + + ~SliceTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + std::unique_ptr util_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SLICE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.cc new file mode 100644 index 00000000000..29c68ddd97a --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.cc @@ -0,0 +1,95 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/softmax_tensorrt.h" + +namespace mindspore::lite { +int SoftMaxTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + softmax_op_ = primitive->value_as_Softmax(); + if (softmax_op_ == nullptr) { + MS_LOG(ERROR) << "convert failed"; + return RET_ERROR; + } + + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} +int SoftMaxTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "network is invalid"; + return RET_ERROR; + } + nvinfer1::ISoftMaxLayer *softmax_layer_ = AddSoftMaxOp(ctx); + if (softmax_layer_ == nullptr) { + MS_LOG(ERROR) << "add softmax op failed for TensorRT."; + return RET_ERROR; + } + softmax_layer_->setName((op_name_ + "_softmax").c_str()); + this->layer_ = softmax_layer_; + + nvinfer1::ITensor *out_tensor = softmax_layer_->getOutput(0); + if (out_tensor == nullptr) { + MS_LOG(ERROR) << "softmax output tensor create failed for TensorRT."; + return RET_ERROR; + } + out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} + +nvinfer1::ISoftMaxLayer *SoftMaxTensorRT::AddSoftMaxOp(TensorRTContext *ctx) { + nvinfer1::ISoftMaxLayer *current_layer_ = ctx->network()->addSoftMax(*tensorrt_in_tensors_[0].trt_tensor_); + if (current_layer_ == nullptr) { + MS_LOG(ERROR) << "add softmax op failed for TensorRT."; + return nullptr; + } + auto axis = softmax_op_->axis(); + if (axis == nullptr || axis->size() != 1) { + MS_LOG(ERROR) << "axis needs check"; + return nullptr; + } + auto axis_val = std::vector(axis->begin(), axis->end()); + if (axis_val[0] >= tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims) { + MS_LOG(ERROR) << "axis is larger than input tensor dims."; + return nullptr; + } + int64_t axis_format_value = + (axis_val[0] == -1) ? 
tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims - 1 : axis_val[0]; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NCHW) { + // transpose axis to NCHW + axis_format_value = ConvertAxisFromNHWC2NCHW(axis_format_value); + } + uint32_t axis_bit = 1 << axis_format_value; + MS_LOG(DEBUG) << op_name_ << " axis_value is " << axis_format_value << ", set axis to " << axis_bit; + current_layer_->setAxes(axis_bit); + return current_layer_; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Softmax, SoftMaxTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.h new file mode 100644 index 00000000000..a31d0f8b5a6 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SOFTMAX_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SOFTMAX_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class SoftMaxTensorRT : public TensorRTOp { + public: + SoftMaxTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~SoftMaxTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ISoftMaxLayer *AddSoftMaxOp(TensorRTContext *ctx); + + const schema::Softmax *softmax_op_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SOFTMAX_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.cc new file mode 100644 index 00000000000..c4638bb2e49 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.cc @@ -0,0 +1,160 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/split_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int SplitTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + int ret = ParseParams(); + if (ret != RET_OK) { + MS_LOG(ERROR) << op_name_ << " parse params failed."; + return ret; + } + + axis_ = axis_ < 0 ? axis_ + in_tensors_[0].Shape().size() : axis_; + + if (out_tensors.size() < 1 || out_tensors.size() != output_num_) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (axis_ < 0 || axis_ >= in_tensors_[0].Shape().size()) { + MS_LOG(ERROR) << "invalid axis : " << axis_; + return RET_ERROR; + } + int split_sum = std::accumulate(size_splits_.begin(), size_splits_.end(), 0); + int split_sum_expect = in_tensors_[0].Shape()[axis_]; + + if (size_splits_[size_splits_.size() - 1] == -1) { + size_splits_[size_splits_.size() - 1] = split_sum_expect - split_sum - 1; + split_sum = split_sum_expect; + } + + if (split_sum != split_sum_expect) { + MS_LOG(ERROR) << "Sum of size splits not equal input tensor dim. "; + return RET_ERROR; + } + + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +} + +int SplitTensorRT::AddInnerOp(TensorRTContext *ctx) { + ITensorHelper split_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &split_input); + if (ret != RET_OK || split_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_; + return ret; + } + + int axis_dim_index = 0; + nvinfer1::Dims one_dims = lite::ConvertCudaDims(1, in_tensors_[0].Shape().size()); + nvinfer1::ISliceLayer *slice_layer = nullptr; + + for (int i = 0; i != output_num_; ++i) { + nvinfer1::Dims start_dims = lite::ConvertCudaDims(0, in_tensors_[0].Shape().size()); + start_dims.d[axis_] = axis_dim_index; + axis_dim_index += size_splits_[i]; + + nvinfer1::Dims size_dims = lite::ConvertCudaDims(in_tensors_[0].Shape()); + size_dims.d[axis_] = size_splits_[i]; + + slice_layer = ctx->network()->addSlice(*split_input.trt_tensor_, start_dims, size_dims, one_dims); + if (slice_layer == nullptr) { + MS_LOG(ERROR) << "add Slice op failed for TensorRT: " << op_name_; + return RET_ERROR; + } + + nvinfer1::ITensor *out_tensor = slice_layer->getOutput(0); + if (type_ == schema::PrimitiveType_Unstack) { + auto shuffer_layer = ctx->network()->addShuffle(*out_tensor); + auto shuffer_dims_opt = SqueezeDims(out_tensor->getDimensions(), axis_); + if (!shuffer_dims_opt) { + MS_LOG(ERROR) << "SqueezeDims failed."; + return RET_ERROR; + } + shuffer_layer->setReshapeDimensions(shuffer_dims_opt.value()); + out_tensor = shuffer_layer->getOutput(0); + } + out_tensor->setName((op_name_ + "_" + std::to_string(i)).c_str()); + this->AddInnerOutTensors(ITensorHelper{out_tensor, split_input.format_, split_input.same_format_}); + } + this->layer_ = slice_layer; + return RET_OK; +} +int SplitTensorRT::ParseParams() { + switch (type_) { + case schema::PrimitiveType_Split: { + auto split_op = 
op_primitive_->value_as_Split(); + CHECK_NULL_RETURN(split_op); + axis_ = split_op->axis(); + output_num_ = split_op->output_num(); + auto size_splits_ptr = split_op->size_splits(); + if (size_splits_ptr != nullptr) { + size_splits_.resize(size_splits_ptr->size()); + std::copy(size_splits_ptr->begin(), size_splits_ptr->end(), size_splits_.begin()); + } else if (in_tensors_.size() == INPUT_SIZE2 && in_tensors_[1].Data() != nullptr && + in_tensors_[1].DataType() == DataType::kNumberTypeInt32) { + size_splits_.resize(in_tensors_[1].ElementNum()); + auto split_out_ptr = static_cast(in_tensors_[1].Data().get()); + for (int i = 0; i < in_tensors_[1].ElementNum(); i++) { + size_splits_[i] = split_out_ptr[i]; + } + } else { + MS_LOG(ERROR) << op_name_ << " has invalid input size and size_splits: " << in_tensors_.size(); + return RET_ERROR; + } + break; + } + case schema::PrimitiveType_Unstack: { + auto unstack_op = op_primitive_->value_as_Unstack(); + CHECK_NULL_RETURN(unstack_op); + axis_ = unstack_op->axis(); + output_num_ = out_tensors_.size(); + break; + } + default: { + MS_LOG(ERROR) << op_name_ << " has invalid type for split"; + return RET_ERROR; + } + } + if (size_splits_.empty()) { + if (output_num_ == 0 || in_tensors_[0].Shape().at(axis_) % output_num_ != 0) { + MS_LOG(ERROR) << "axis dim can not be split into same subdim"; + return RET_ERROR; + } + int split_width = in_tensors_[0].Shape().at(axis_) / output_num_; + size_splits_.resize(output_num_); + std::fill(size_splits_.begin(), size_splits_.end(), split_width); + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Split, SplitTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Unstack, SplitTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.h new file mode 100644 index 00000000000..df5b1c21533 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.h @@ -0,0 +1,45 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SPLIT_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SPLIT_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class SplitTensorRT : public TensorRTOp { + public: + SplitTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~SplitTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int ParseParams(); + int64_t axis_; + int64_t output_num_; + std::vector size_splits_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SPLIT_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.cc new file mode 100644 index 00000000000..d35712924ea --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.cc @@ -0,0 +1,132 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" +#include + +namespace mindspore::lite { +const schema::Primitive *TensorRTOp::GetPrimitive() { return this->op_primitive_; } + +void TensorRTOp::AddInnerInTensors(ITensorHelper tensor) { this->tensorrt_in_tensors_.push_back(tensor); } + +void TensorRTOp::AddInnerOutTensors(ITensorHelper tensor) { this->tensorrt_out_tensors_.push_back(tensor); } + +std::vector &TensorRTOp::GetInnerOutTensor() { return this->tensorrt_out_tensors_; } + +std::vector &TensorRTOp::GetInnerInTensors() { return this->tensorrt_in_tensors_; } + +std::string TensorRTOp::GetOpName() { return this->op_name_; } + +std::vector &TensorRTOp::inputs() { return this->in_tensors_; } + +std::vector &TensorRTOp::outputs() { return this->out_tensors_; } + +schema::PrimitiveType TensorRTOp::type() const { return this->type_; } + +schema::QuantType TensorRTOp::GetQuantType() const { return this->quant_type_; } + +void TensorRTOp::set_in_ops(const std::vector &in_ops) { this->in_ops_ = in_ops; } + +void TensorRTOp::set_out_ops(const std::vector &out_ops) { this->out_ops_ = out_ops; } + +const std::vector &TensorRTOp::in_ops() const { return this->in_ops_; } + +const std::vector &TensorRTOp::out_ops() const { return this->out_ops_; } + +void TensorRTOp::SetRuntime(TensorRTRuntime *runtime) { + this->runtime_ = runtime; + device_id_ = runtime_->GetDeviceID(); +} + +bool TensorRTOp::IsShapeKnown() { + if (this->in_tensors_.size() == 1 && this->in_tensors_[0].Shape().size() == 0) { + return false; + } + return true; +} + +int TensorRTOp::Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine) { + if (op_binding_tensor_.size() != 0) { + MS_LOG(ERROR) << "need special op Prepare for " << op_name_; + return RET_ERROR; + } + return RET_OK; +} + +DynamicShapeParams TensorRTOp::GetDynamicShapeParams() const { return this->dynamic_shape_params_; } + +int TensorRTOp::SetInt8DynamicRange() { + // setting param layer_ forcely + if (this->layer_ == nullptr) { + MS_LOG(ERROR) << op_name_ << " layer is nullptr."; + return RET_ERROR; + } + if (in_tensors_.empty() || out_tensors_.empty()) { + MS_LOG(ERROR) << "input or output tensor empty."; + return RET_ERROR; + } + if (quant_type_ != schema::QuantType_QUANT_ALL) { + MS_LOG(DEBUG) << "op " << op_name_ << " not quantized."; + return RET_OK; + } + + if (in_tensors_[0].QuantParams().empty() || out_tensors_[0].QuantParams().empty()) { + MS_LOG(WARNING) << op_name_ << " quant param is empty."; + MS_LOG(WARNING) << "in_tensor quant param size: " << in_tensors_[0].QuantParams().size() + << " ,out_tensor quant param size: " << out_tensors_[0].QuantParams().size(); + } + for (size_t i = 0; i < in_tensors_.size(); i++) { + auto tensor = in_tensors_.at(i); + if (!tensor.IsConst()) { + tensorrt_in_tensors_.at(i).trt_tensor_->setDynamicRange(tensor.QuantParams().at(0).min, + tensor.QuantParams().at(0).max); + // Don't set the presion on non-computation layers as they don't support int8. 
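+ // Since no calibration cache is used under QuantType_QUANT_ALL, TensorRT derives the int8 scales
+ // from the min/max ranges handed to setDynamicRange() above. Layers of type Constant, Concatenation
+ // and Shape are skipped below because forcing kINT8 precision on them is not supported.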
+ if (this->layer_->getType() != nvinfer1::LayerType::kCONSTANT && + this->layer_->getType() != nvinfer1::LayerType::kCONCATENATION && + this->layer_->getType() != nvinfer1::LayerType::kSHAPE) { + this->layer_->setPrecision(nvinfer1::DataType::kINT8); + } + } + } + for (size_t i = 0; i < out_tensors_.size(); i++) { + auto tensor = out_tensors_.at(0); + tensorrt_out_tensors_.at(i).trt_tensor_->setDynamicRange(tensor.QuantParams().at(0).min, + tensor.QuantParams().at(0).max); + // set output type of execution tensors. + if (this->layer_->getOutput(i)->isExecutionTensor()) { + this->layer_->setOutputType(i, nvinfer1::DataType::kINT8); + } + } + return SetTransposeDynamicRange(); +} + +int TensorRTOp::SetTransposeDynamicRange() { + if (this->transpose_layer_ == nullptr) { + MS_LOG(INFO) << op_name_ << " transpose_layer is nullptr."; + return RET_OK; + } + if (!in_tensors_[0].QuantParams().empty() && !out_tensors_[0].QuantParams().empty()) { + this->transpose_layer_->getInput(0)->setDynamicRange(in_tensors_.front().QuantParams().at(0).min, + in_tensors_.front().QuantParams().at(0).max); + this->transpose_layer_->getOutput(0)->setDynamicRange(in_tensors_.front().QuantParams().at(0).min, + in_tensors_.front().QuantParams().at(0).max); + this->transpose_layer_->setOutputType(0, nvinfer1::DataType::kINT8); + this->transpose_layer_->setPrecision(nvinfer1::DataType::kINT8); + } + return RET_OK; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.h new file mode 100644 index 00000000000..e37b77a051e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.h @@ -0,0 +1,175 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_OP_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_OP_H_ + +#include +#include +#include +#include +#include "include/api/kernel.h" +#include "src/common/log_adapter.h" +#include "include/errorcode.h" +#include "src/runtime/delegate/tensorrt/tensorrt_context.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/auto_registration_factory.h" +#include "src/common/log_util.h" + +namespace mindspore::lite { +constexpr int INPUT_SIZE2 = 2; +constexpr int INPUT_SIZE3 = 3; +constexpr int INPUT_SIZE4 = 4; + +struct BindingHelper { + std::string name_; + void *data_{nullptr}; + nvinfer1::DataType data_type_; + size_t size_; + bool is_input_binding_{false}; +}; + +struct DynamicShapeParams { + bool support_dynamic_{true}; + bool support_hw_dynamic_{true}; +}; + +class TensorRTRuntime; + +class TensorRTOp { + public: + explicit TensorRTOp(const schema::Primitive *primitive, std::vector in_tensors, + std::vector out_tensors, std::string name, schema::QuantType quant_type) + : op_primitive_(primitive), + in_tensors_(std::move(in_tensors)), + out_tensors_(std::move(out_tensors)), + op_name_(std::move(name)), + quant_type_(quant_type) { + if (primitive != nullptr) { + this->type_ = primitive->value_type(); + } + } + + virtual ~TensorRTOp() = default; + + virtual int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) = 0; + + virtual int AddInnerOp(TensorRTContext *ctx) = 0; + + virtual int SetInt8DynamicRange(); + + virtual int Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine); + + const schema::Primitive *GetPrimitive(); + + void AddInnerInTensors(ITensorHelper tensor); + + void AddInnerOutTensors(ITensorHelper tensor); + + std::vector &GetInnerOutTensor(); + + std::vector &GetInnerInTensors(); + + std::string GetOpName(); + + std::vector &inputs(); + + std::vector &outputs(); + + schema::PrimitiveType type() const; + + schema::QuantType GetQuantType() const; + + void set_in_ops(const std::vector &in_ops); + + void set_out_ops(const std::vector &out_ops); + + const std::vector &in_ops() const; + + const std::vector &out_ops() const; + + void SetRuntime(TensorRTRuntime *runtime); + + DynamicShapeParams GetDynamicShapeParams() const; + + nvinfer1::ILayer *layer() { return layer_; } + + private: + int SetTransposeDynamicRange(); + + protected: + bool IsShapeKnown(); + + nvinfer1::ILayer *layer_ = nullptr; + + nvinfer1::IShuffleLayer *transpose_layer_ = nullptr; + + const schema::Primitive *op_primitive_{nullptr}; + + std::vector in_tensors_; + + std::vector out_tensors_; + + std::vector tensorrt_in_tensors_; + + std::vector tensorrt_out_tensors_; + + std::vector in_ops_; + + std::vector out_ops_; + + std::string op_name_; + + schema::PrimitiveType type_ = schema::PrimitiveType_NONE; + + schema::QuantType quant_type_ = schema::QuantType_QUANT_NONE; + + std::vector op_binding_tensor_; + + TensorRTRuntime *runtime_{nullptr}; + + DynamicShapeParams dynamic_shape_params_; + + uint32_t device_id_{0}; +}; + +template +TensorRTOp *GetTensorRTOp(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) { + auto *op = new (std::nothrow) T(primitive, in_tensors, out_tensors, name, quant_type); + if (op == nullptr) { + MS_LOG(WARNING) << "TensorRT is nullptr."; + return nullptr; + } + + auto ret = 
op->IsSupport(primitive, in_tensors, out_tensors); + if (ret != RET_OK) { + MS_LOG(WARNING) << "TensorRT op is not supported: " << name; + delete op; + return nullptr; + } + return op; +} +typedef TensorRTOp *(*TensorRTGetOp)(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type); + +#define REGISTER_TENSORRT_CREATOR(KEY, TENSORRT_OP) \ + REGISTER_CLASS_CREATOR(schema::PrimitiveType, KEY, TensorRTGetOp, GetTensorRTOp); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_OP_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.cc new file mode 100644 index 00000000000..1ecaa90167e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.cc @@ -0,0 +1,81 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" + +namespace mindspore::lite { +void SerializeValue(void **buffer, const void *value, size_t cpy_size) { + memcpy(*buffer, value, cpy_size); + *buffer = static_cast(*buffer) + cpy_size; +} + +void DeserializeValue(void const **buffer, size_t *buffer_size, void *value, size_t cpy_size) { + if (cpy_size > *buffer_size) { + MS_LOG(ERROR) << "invalid desirialize size, buffer size: " << *buffer_size << ", value size: " << cpy_size; + return; + } + memcpy(value, *buffer, cpy_size); + *buffer = static_cast(*buffer) + cpy_size; + *buffer_size -= cpy_size; +} + +nvinfer1::DimsExprs TensorRTPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, + int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept { + return inputs[0]; +} + +bool TensorRTPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept { + return true; +} + +void TensorRTPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept {} + +size_t TensorRTPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { + return 0; +} + +nvinfer1::DataType TensorRTPlugin::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, + int nbInputs) const noexcept { + return inputTypes[0]; +} + +const char *TensorRTPlugin::getPluginType() const noexcept { return plugin_name_.c_str(); } + +const char *TensorRTPlugin::getPluginVersion() const noexcept { return plugin_version_.c_str(); } + +int TensorRTPlugin::getNbOutputs() const noexcept { return 1; } + +int TensorRTPlugin::initialize() noexcept { return 0; } + +void TensorRTPlugin::terminate() noexcept {} + +size_t TensorRTPlugin::getSerializationSize() const noexcept { return 0; } + +void 
TensorRTPlugin::serialize(void *buffer) const noexcept {} + +void TensorRTPlugin::destroy() noexcept { + // This gets called when the network containing plugin is destroyed + delete this; +} + +void TensorRTPlugin::setPluginNamespace(const char *libNamespace) noexcept { name_space_ = libNamespace; } + +const char *TensorRTPlugin::getPluginNamespace() const noexcept { return name_space_.c_str(); } +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.h new file mode 100644 index 00000000000..d2fadb85828 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.h @@ -0,0 +1,106 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_PLUGIN_H_ +#include +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" +#include "NvInferRuntimeCommon.h" +#include + +namespace mindspore::lite { +void SerializeValue(void **buffer, const void *value, size_t cpy_size); +void DeserializeValue(void const **buffer, size_t *buffer_size, void *value, size_t cpy_size); +class TensorRTPlugin : public nvinfer1::IPluginV2DynamicExt { + public: + TensorRTPlugin(const std::string &layer_name, const std::string &plugin_name, uint32_t device_id = 0) + : layer_name_(layer_name), plugin_name_(plugin_name), device_id_(device_id) {} + + // It doesn't make sense to make GeluPluginDynamic without arguments, so we delete + // default constructor. 
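+ // A concrete plugin is therefore always built with a layer name, a plugin name and (optionally) a
+ // CUDA device id; the plugin name is what getPluginType() reports and is the key TensorRT uses to
+ // look up the registered creator when deserializing an engine that contains this plugin.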
+ TensorRTPlugin() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const + noexcept override; + + // IPluginV2 Methods + const char *getPluginType() const noexcept override; + const char *getPluginVersion() const noexcept override; + int getNbOutputs() const noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + void destroy() noexcept override; + void setPluginNamespace(const char *pluginNamespace) noexcept override; + const char *getPluginNamespace() const noexcept override; + + protected: + std::string layer_name_; + std::string name_space_; + std::string plugin_version_{"1"}; + std::string plugin_name_; + uint32_t device_id_{0}; +}; + +template +class TensorRTPluginCreater : public nvinfer1::IPluginCreator { + public: + explicit TensorRTPluginCreater(const std::string &plugin_name) : plugin_name_(plugin_name) { + // Fill PluginFieldCollection with PluginField arguments metadata + field_collection_.nbFields = fields_.size(); + field_collection_.fields = fields_.data(); + } + + const char *getPluginName() const noexcept override { return plugin_name_.c_str(); } + + const char *getPluginVersion() const noexcept override { return plugin_version_.c_str(); } + + const nvinfer1::PluginFieldCollection *getFieldNames() noexcept override { return &field_collection_; } + + void setPluginNamespace(const char *pluginNamespace) noexcept override { name_space_ = std::string(pluginNamespace); } + + const char *getPluginNamespace() const noexcept override { return name_space_.c_str(); } + + nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) noexcept { + return new (std::nothrow) T(name, fc); + } + + nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) noexcept { + return new (std::nothrow) T(name, serialData, serialLength); + } + + protected: + static nvinfer1::PluginFieldCollection field_collection_; + static std::vector fields_; + std::string name_space_; + std::string plugin_version_{"1"}; + std::string plugin_name_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.cc new file mode 100644 index 00000000000..37225ef49b0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.cc @@ -0,0 +1,183 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/tile_tensorrt.h" +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(TilePluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int TileTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int TileTensorRT::AddInnerOp(TensorRTContext *ctx) { + auto repeats_tensor = in_tensors_[1]; + CHECK_NULL_RETURN(repeats_tensor.Data()); + if (repeats_tensor.ElementNum() != tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims) { + MS_LOG(ERROR) << op_name_ << " has input dims: " << tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims + << ", and invalid repeats cnt: " << repeats_tensor.ElementNum(); + return RET_ERROR; + } + int ret = ParseData2Vector(in_tensors_[1], &repeats_); + if (ret != RET_OK || repeats_.size() == 0) { + MS_LOG(ERROR) << op_name_ << " has invalid repeats tensor"; + return ret; + } + ITensorHelper tile_input; + + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &tile_input); + if (ret != RET_OK || tile_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << op_name_ << " preprocess tensor failed."; + return RET_ERROR; + } + + return RunAsConcat(ctx, tile_input); +} +int TileTensorRT::RunAsConcat(TensorRTContext *ctx, const ITensorHelper &tile_input) { + int axis = -1; + float tile_times = 0.0f; + for (int i = 0; i < repeats_.size(); i++) { + if (repeats_[i] > 1) { + if (axis != -1) { + MS_LOG(ERROR) << op_name_ << " has more than one axis to tile"; + return RET_ERROR; + } + axis = i; + tile_times = repeats_[i]; + } + } + // concat + nvinfer1::ITensor *concat_inputs[1024]; + for (int i = 0; i < tile_times; i++) { + concat_inputs[i] = tile_input.trt_tensor_; + } + nvinfer1::IConcatenationLayer *concat_layer = ctx->network()->addConcatenation(concat_inputs, tile_times); + CHECK_NULL_RETURN(concat_layer); + concat_layer->setAxis(axis); + concat_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *tile_out = concat_layer->getOutput(0); + layer_ = concat_layer; + tile_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{tile_out, tile_input.format_, true}); + return RET_OK; +} +int TileTensorRT::RunAsPlugin(TensorRTContext *ctx, const ITensorHelper &tile_input) { + // Floating point Exception + nvinfer1::ITensor *inputTensors[] = {tile_input.trt_tensor_}; + auto plugin = std::make_shared(op_name_, repeats_, device_id_); + 
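+  // addPluginV2 wraps TilePlugin as a single-input custom layer; the repeat factors stay inside the
+  // plugin object and are written out again by TilePlugin::serialize() if the engine is serialized.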
nvinfer1::IPluginV2Layer *tile_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); + CHECK_NULL_RETURN(tile_layer); + nvinfer1::ITensor *tile_out = tile_layer->getOutput(0); + tile_layer->setName(op_name_.c_str()); + tile_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{tile_out, tile_input.format_, true}); + this->layer_ = tile_layer; + return RET_OK; +} +// plugin + +int TilePlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + nvinfer1::Dims output_dims = outputDesc[0].dims; + nvinfer1::Dims input_dims = inputDesc[0].dims; + if (device_input_shape_ == nullptr) { + CUDA_CHECK(cudaMalloc(&device_input_shape_, input_dims.nbDims * sizeof(size_t))); + CHECK_NULL_RETURN(device_input_shape_); + } + if (device_output_shape_ == nullptr) { + CUDA_CHECK(cudaMalloc(&device_output_shape_, output_dims.nbDims * sizeof(size_t))); + CHECK_NULL_RETURN(device_output_shape_); + } + size_t input_shape[nvinfer1::Dims::MAX_DIMS]; + size_t output_shape[nvinfer1::Dims::MAX_DIMS]; + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape[i] = static_cast(input_dims.d[i]); + output_shape[i] = static_cast(output_dims.d[i]); + } + CUDA_CHECK(cudaMemcpy(device_input_shape_, input_shape, input_dims.nbDims * sizeof(size_t), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(device_output_shape_, output_shape, output_dims.nbDims * sizeof(size_t), cudaMemcpyHostToDevice)); + MS_LOG(ERROR) << layer_name_ << " has more axis to concat: " << repeats_.size(); + return RET_ERROR; +} + +nvinfer1::DimsExprs TilePlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs out_dims{}; + out_dims.nbDims = inputs[0].nbDims; + for (int i = 0; i < out_dims.nbDims; i++) { + auto repeat = exprBuilder.constant(repeats_[i]); + out_dims.d[i] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[0].d[i], *repeat); + } + return out_dims; +} + +nvinfer1::IPluginV2DynamicExt *TilePlugin::clone() const noexcept { + auto *plugin = new TilePlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +void TilePlugin::terminate() noexcept { + if (device_input_shape_ != nullptr) { + auto cuda_ret = cudaFree(device_input_shape_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "free cuda memory failed for " << layer_name_; + } + } + if (device_output_shape_ != nullptr) { + auto cuda_ret = cudaFree(device_output_shape_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "free cuda memory failed for " << layer_name_; + } + } +} + +size_t TilePlugin::getSerializationSize() const noexcept { return sizeof(float) * repeats_.size() + sizeof(size_t); } + +void TilePlugin::serialize(void *buffer) const noexcept { + size_t dims = repeats_.size(); + SerializeValue(&buffer, &dims, sizeof(size_t)); + for (float one_repeat : repeats_) { + SerializeValue(&buffer, &one_repeat, sizeof(float)); + } +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_TileFusion, TileTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.h new file mode 100644 index 00000000000..750d09e8bd9 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.h @@ -0,0 +1,94 @@ +/** + * 
Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TILE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TILE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh" + +namespace mindspore::lite { +constexpr char *TILE_PLUGIN_NAME{"TilePluginCreater"}; +class TileTensorRT : public TensorRTOp { + public: + TileTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~TileTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int RunAsConcat(TensorRTContext *ctx, const ITensorHelper &tile_input); + int RunAsPlugin(TensorRTContext *ctx, const ITensorHelper &tile_input); + std::vector repeats_; +}; + +class TilePlugin : public TensorRTPlugin { + public: + explicit TilePlugin(const std::string name, const std::vector &repeats, uint32_t device_id) + : TensorRTPlugin(name, std::string(TILE_PLUGIN_NAME), device_id), repeats_(repeats) {} + + TilePlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(TILE_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + size_t dims = static_cast(fields[0].data)[0]; + for (size_t i = 0; i < dims; i++) { + float one_repeat = static_cast(fields[0].data)[i + 1]; + repeats_.push_back(one_repeat); + } + } + + TilePlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(TILE_PLUGIN_NAME)) { + size_t dims; + DeserializeValue(&serialData, &serialLength, &dims, sizeof(size_t)); + for (size_t i = 0; i < dims; i++) { + float one_repeat; + DeserializeValue(&serialData, &serialLength, &one_repeat, sizeof(float)); + repeats_.push_back(one_repeat); + } + } + + TilePlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + void terminate() noexcept override; + + private: + std::vector repeats_; + size_t *device_input_shape_{nullptr}; + size_t *device_output_shape_{nullptr}; 
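+  // Device-side copies of the input/output shapes, allocated on demand in enqueue() and released in terminate().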
+}; +class TilePluginCreater : public TensorRTPluginCreater { + public: + TilePluginCreater() : TensorRTPluginCreater(std::string(TILE_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TILE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.cc new file mode 100644 index 00000000000..71da8be9555 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.cc @@ -0,0 +1,160 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/topk_tensorrt.h" + +namespace mindspore::lite { +int TopKTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int TopKTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx->network() == nullptr || this->tensorrt_in_tensors_.size() != 1) { + MS_LOG(ERROR) << "network or input tensor is invalid"; + return RET_ERROR; + } + int ret = ParseParams(ctx); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ParseParams failed for " << op_name_; + return ret; + } + + ITensorHelper topk_input; + ret = PreprocessInputs(ctx, &topk_input); + if (ret != RET_OK || topk_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "preprocess input failed for " << op_name_; + return ret; + } + axis_ = 1 << axis_value_; + MS_LOG(DEBUG) << "addTopK input " << GetTensorFormat(topk_input); + MS_LOG(DEBUG) << op_name_ << " has k: " << top_k_ << ", axis: " << axis_value_; + + nvinfer1::ITopKLayer *topk_layer = ctx->network()->addTopK(*topk_input.trt_tensor_, topk_op_, top_k_, axis_); + CHECK_NULL_RETURN(topk_layer); + this->layer_ = topk_layer; + topk_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *value_out_tensor = topk_layer->getOutput(0); + nvinfer1::ITensor *index_out_tensor = topk_layer->getOutput(1); + // output 0 is data value, output 1 is index + + if (value_out_tensor->getDimensions().nbDims != out_tensors_[0].Shape().size()) { + nvinfer1::Dims out_dims = ConvertCudaDims(out_tensors_[0].Shape()); + out_dims.d[0] = value_out_tensor->getDimensions().d[0]; + value_out_tensor = Reshape(ctx, value_out_tensor, out_dims); + CHECK_NULL_RETURN(value_out_tensor); + value_out_tensor->setName((op_name_ + "_value_output").c_str()); + index_out_tensor = Reshape(ctx, index_out_tensor, out_dims); + CHECK_NULL_RETURN(index_out_tensor); + 
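+    // The indices output is reshaped with the same dims as the values output so both match the expected rank.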
index_out_tensor->setName((op_name_ + "_index_output").c_str()); + } + if (out_tensors_.size() == INPUT_SIZE2) { + AddInnerOutTensors(ITensorHelper{value_out_tensor, topk_input.format_, true}); + } + AddInnerOutTensors(ITensorHelper{index_out_tensor, topk_input.format_, true}); + return RET_OK; +} + +int TopKTensorRT::ParseParams(TensorRTContext *ctx) { + switch (type_) { + case schema::PrimitiveType_ArgMaxFusion: { + topk_op_ = nvinfer1::TopKOperation::kMAX; + auto max_prim = op_primitive_->value_as_ArgMaxFusion(); + CHECK_NULL_RETURN(max_prim); + axis_value_ = max_prim->axis(); + axis_value_ = axis_value_ > 0 ? axis_value_ : in_tensors_[0].Shape().size() + axis_value_; + top_k_ = max_prim->top_k(); + break; + } + case schema::PrimitiveType_ArgMinFusion: { + topk_op_ = nvinfer1::TopKOperation::kMIN; + auto mim_prim = op_primitive_->value_as_ArgMinFusion(); + CHECK_NULL_RETURN(mim_prim); + axis_value_ = mim_prim->axis(); + axis_value_ = axis_value_ > 0 ? axis_value_ : in_tensors_[0].Shape().size() + axis_value_; + top_k_ = mim_prim->top_k(); + break; + } + case schema::PrimitiveType_TopKFusion: { + auto topk_prim = op_primitive_->value_as_TopKFusion(); + CHECK_NULL_RETURN(topk_prim); + topk_op_ = topk_prim->largest() == 1 ? nvinfer1::TopKOperation::kMAX : nvinfer1::TopKOperation::kMIN; + axis_value_ = topk_prim->axis(); + axis_value_ = axis_value_ > 0 ? axis_value_ : in_tensors_[0].Shape().size() + axis_value_; + if (in_tensors_.size() < INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input size " << in_tensors_.size() << "for " << op_name_; + return RET_ERROR; + } + std::vector tmp(1); + int ret_k = ParseData2Vector(in_tensors_[1], &tmp); + if (ret_k != RET_OK) { + return ret_k; + } + top_k_ = tmp[0]; + break; + } + default: { + MS_LOG(ERROR) << op_name_ << " has more primitive type: " << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + } + // Currently reduceAxes must specify exactly one dimension, and it must be one of the last four dimensions. 
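+  // AddInnerOp later converts this value into the bit mask ITopKLayer expects (axis_ = 1 << axis_value_);
+  // only reduction along the innermost axis is accepted here.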
+ if (axis_value_ != in_tensors_[0].Shape().size() - 1) { + MS_LOG(ERROR) << op_name_ << " has unsupported axis : " << axis_value_; + return RET_ERROR; + } + return RET_OK; +} +int TopKTensorRT::PreprocessInputs(TensorRTContext *ctx, ITensorHelper *topk_input) { + auto input_dim = tensorrt_in_tensors_[0].trt_tensor_->getDimensions(); + int ret = RET_ERROR; + if (input_dim.nbDims == DIMENSION_4D) { + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], topk_input); + } else if (input_dim.nbDims < DIMENSION_4D) { + // only support 4d + nvinfer1::Dims4 expect_dim; + for (int i = 0; i < DIMENSION_4D; i++) { + if (i < input_dim.nbDims) { + expect_dim.d[DIMENSION_4D - 1 - i] = input_dim.d[input_dim.nbDims - 1 - i]; + } else { + expect_dim.d[DIMENSION_4D - 1 - i] = 1; + } + } + topk_input->trt_tensor_ = Reshape(ctx, tensorrt_in_tensors_[0].trt_tensor_, expect_dim); + CHECK_NULL_RETURN(topk_input->trt_tensor_); + axis_value_ += (DIMENSION_4D - input_dim.nbDims); + return RET_OK; + } else { + MS_LOG(ERROR) << op_name_ << " has invalid input dims: " << input_dim.nbDims; + } + return ret; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ArgMaxFusion, TopKTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ArgMinFusion, TopKTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_TopKFusion, TopKTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.h new file mode 100644 index 00000000000..5344d2fda93 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.h @@ -0,0 +1,49 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TOPK_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TOPK_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class TopKTensorRT : public TensorRTOp { + public: + TopKTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~TopKTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int ParseParams(TensorRTContext *ctx); + + int PreprocessInputs(TensorRTContext *ctx, ITensorHelper *topk_input); + + nvinfer1::TopKOperation topk_op_{nvinfer1::TopKOperation::kMAX}; + uint32_t axis_{0}; + int axis_value_{0}; + int32_t top_k_{0}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TOPK_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.cc new file mode 100644 index 00000000000..14300dfe687 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.cc @@ -0,0 +1,84 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/unary_tensorrt.h" + +namespace mindspore::lite { +int UnaryTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + } + auto it = unary_ops_.find(primitive->value_type()); + if (it != unary_ops_.end()) { + unary_op_ = it->second; + } else { + MS_LOG(ERROR) << "unsupported unary ops type: " << schema::EnumNamePrimitiveType(primitive->value_type()); + return RET_ERROR; + } + return RET_OK; +} + +int UnaryTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr || this->tensorrt_in_tensors_.size() != 1) { + MS_LOG(ERROR) << "network or input tensor is invalid"; + return RET_ERROR; + } + nvinfer1::IUnaryLayer *cal_layer = ctx->network()->addUnary(*tensorrt_in_tensors_[0].trt_tensor_, unary_op_); + if (cal_layer == nullptr) { + MS_LOG(ERROR) << "addUnary failed for: " << op_name_; + return RET_ERROR; + } + cal_layer->setName(op_name_.c_str()); + this->layer_ = cal_layer; + if (type_ == schema::PrimitiveType_ExpFusion) { + auto exp_op = op_primitive_->value_as_ExpFusion(); + CHECK_NULL_RETURN(exp_op); + float scale = exp_op->scale(); + float shift = exp_op->shift(); + float base = exp_op->base(); + if (scale != 1.0f || shift != 0.0f || base != -1.0f) { + MS_LOG(ERROR) << op_name_ << " has fusion to calculate."; + return RET_ERROR; + } + } + + nvinfer1::ITensor *op_out_tensor = cal_layer->getOutput(0); + op_out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Sqrt, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Abs, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Neg, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Log, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Sin, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Cos, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Ceil, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Floor, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ExpFusion, UnaryTensorRT) +#if TRT_VERSION_GE(7, 2) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalNot, UnaryTensorRT) +#endif +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.h new file mode 100644 index 00000000000..5f7f18f6908 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.h @@ -0,0 +1,56 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_UNARY_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_UNARY_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class UnaryTensorRT : public TensorRTOp { + public: + UnaryTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~UnaryTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + std::map unary_ops_ = { + {schema::PrimitiveType_Sqrt, nvinfer1::UnaryOperation::kSQRT}, + {schema::PrimitiveType_Abs, nvinfer1::UnaryOperation::kABS}, + {schema::PrimitiveType_Neg, nvinfer1::UnaryOperation::kNEG}, + {schema::PrimitiveType_Log, nvinfer1::UnaryOperation::kLOG}, + {schema::PrimitiveType_Sin, nvinfer1::UnaryOperation::kSIN}, + {schema::PrimitiveType_Cos, nvinfer1::UnaryOperation::kCOS}, + {schema::PrimitiveType_Ceil, nvinfer1::UnaryOperation::kCEIL}, + {schema::PrimitiveType_Floor, nvinfer1::UnaryOperation::kFLOOR}, + {schema::PrimitiveType_ExpFusion, nvinfer1::UnaryOperation::kEXP}, +#if TRT_VERSION_GE(7, 2) + {schema::PrimitiveType_LogicalNot, nvinfer1::UnaryOperation::kNOT}, +#endif + }; + nvinfer1::UnaryOperation unary_op_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_UNARY_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.cc new file mode 100644 index 00000000000..ab40a64b4b8 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.cc @@ -0,0 +1,150 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/tensorrt_allocator.h" +#include +#include +#include "src/common/log_adapter.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh" + +namespace mindspore::lite { +void *TensorRTAllocator::MallocDeviceMem(const mindspore::MSTensor &host_tensor, size_t size) { + if (host_tensor == NULL) { + return nullptr; + } + return MallocDeviceMem(host_tensor.Name(), size, ConvertDataType(host_tensor.DataType())); +} + +void *TensorRTAllocator::MallocDeviceMem(const std::string &name, size_t size, nvinfer1::DataType data_type) { + if (cuda_tensor_map_.find(name) != cuda_tensor_map_.end() && size <= cuda_tensor_map_[name].size) { + MS_LOG(DEBUG) << "tensor :" << name << " has already in cuda Allocator pool."; + return cuda_tensor_map_[name].data; + } + void *device_ptr = nullptr; + auto cuda_ret = cudaMalloc(&device_ptr, size); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda Malloc failed for size:" << size; + return nullptr; + } + MS_LOG(INFO) << "cudaMalloc size: " << size << " for " << name; + if (cuda_tensor_map_[name].data != nullptr) { + cuda_ret = cudaFree(cuda_tensor_map_[name].data); + if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) { + MS_LOG(ERROR) << "free old cuda device_ptr failed for " << cudaGetErrorName(cuda_ret); + cuda_ret = cudaFree(device_ptr); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "free new cuda device_ptr failed for " << cudaGetErrorName(cuda_ret); + return nullptr; + } + return nullptr; + } + } + cuda_tensor_map_[name].data = device_ptr; + cuda_tensor_map_[name].is_valid_mem = false; + cuda_tensor_map_[name].size = size; + return device_ptr; +} + +void TensorRTAllocator::MarkMemValid(const std::string &name, bool isValid) { + cuda_tensor_map_[name].is_valid_mem = isValid; + return; +} + +bool TensorRTAllocator::GetMemIsValid(const std::string &name) { + if (cuda_tensor_map_.find(name) == cuda_tensor_map_.end()) { + MS_LOG(WARNING) << "tensor :" << name << " not in cuda Allocator pool."; + return false; + } + return cuda_tensor_map_[name].is_valid_mem; +} + +void *TensorRTAllocator::GetDevicePtr(const std::string &tensor_name) { + if (tensor_name.empty()) { + return nullptr; + } + if (cuda_tensor_map_.find(tensor_name) == cuda_tensor_map_.end()) { + return nullptr; + } + return this->cuda_tensor_map_.find(tensor_name)->second.data; +} + +int TensorRTAllocator::SyncMemInHostAndDevice(mindspore::MSTensor host_tensor, const std::string &device_tensor_name, + bool is_host2device, bool sync) { + if (host_tensor == NULL) { + MS_LOG(ERROR) << "host tensor is null."; + return RET_ERROR; + } +#if TRT_VERSION_GE(7, 2) + if (host_tensor.DataType() == DataType::kNumberTypeBool && !is_host2device) { + CudaTensorParam ¤t_cuda_tensor = cuda_tensor_map_.find(device_tensor_name)->second; + auto device_ptr = current_cuda_tensor.data; + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "device_ptr is null for " << device_tensor_name; + return RET_ERROR; + } + Cast(host_tensor.DataSize(), static_cast(device_ptr), static_cast(device_ptr), + stream_); + } +#endif + return SyncMemInHostAndDevice(host_tensor.MutableData(), device_tensor_name, host_tensor.DataSize(), is_host2device, + sync); +} + +int TensorRTAllocator::SyncMemInHostAndDevice(void *host_data, const std::string &device_tensor_name, size_t data_size, + bool is_host2device, bool sync) { + if (host_data == nullptr || cuda_tensor_map_.find(device_tensor_name) == cuda_tensor_map_.end()) { + 
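+    // Copying requires both a valid host pointer and a device entry already present in cuda_tensor_map_
+    // (normally created by MallocDeviceMem).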
MS_LOG(ERROR) << " host or device ptr is null."; + return RET_ERROR; + } + CudaTensorParam ¤t_cuda_tensor = cuda_tensor_map_.find(device_tensor_name)->second; + // is memcpy from device to host, the host mem is valid, change tag for mem pool. + current_cuda_tensor.is_valid_mem = is_host2device ? current_cuda_tensor.is_valid_mem : true; + if (is_host2device && current_cuda_tensor.is_valid_mem) { + MS_LOG(DEBUG) << "no need memcpy for: " << device_tensor_name; + return RET_OK; + } + auto device_ptr = current_cuda_tensor.data; + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "device_ptr is null for " << device_tensor_name; + return RET_ERROR; + } + + void *src_ptr = is_host2device ? host_data : device_ptr; + void *dst_ptr = is_host2device ? device_ptr : host_data; + cudaMemcpyKind kind = is_host2device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost; + auto cuda_ret = cudaMemcpy(dst_ptr, src_ptr, data_size, kind); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "copy mem failed,ret " << cudaGetErrorName(cuda_ret); + return RET_ERROR; + } + MS_LOG(INFO) << "cuda memcpy success for " << device_tensor_name; + return RET_OK; +} + +int TensorRTAllocator::ClearDeviceMem() { + for (auto &iter : cuda_tensor_map_) { + auto cuda_ret = cudaFree(iter.second.data); + if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) { + MS_LOG(WARNING) << "free cuda failed for " << cudaGetErrorName(cuda_ret); + } + iter.second.data = nullptr; + iter.second.is_valid_mem = false; + } + return RET_OK; +} +std::map TensorRTAllocator::GetAllDevicePtr() { return this->cuda_tensor_map_; } +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.h new file mode 100644 index 00000000000..c0c592019ab --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.h @@ -0,0 +1,64 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_ALLOCATOR_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_ALLOCATOR_H_ +#include "src/runtime/delegate/tensorrt/tensorrt_allocator.h" +#include +#include +#include +#include "include/api/types.h" + +namespace mindspore::lite { +struct CudaTensorParam { + void *data = nullptr; + bool is_valid_mem = false; + size_t size = 0; +}; +class TensorRTAllocator { + public: + TensorRTAllocator() = default; + + ~TensorRTAllocator() = default; + + void *MallocDeviceMem(const mindspore::MSTensor &host_tensor, size_t size); + + void *MallocDeviceMem(const std::string &name, size_t size, nvinfer1::DataType data_type); + + void *GetDevicePtr(const std::string &tensor_name); + + void SetCudaStream(cudaStream_t stream) { stream_ = stream; } + + std::map GetAllDevicePtr(); + + int SyncMemInHostAndDevice(mindspore::MSTensor host_tensor, const std::string &device_tensor_name, + bool is_host2device, bool sync = true); + + int SyncMemInHostAndDevice(void *host_data, const std::string &device_tensor_name, size_t data_size, + bool is_host2device, bool sync = true); + + int ClearDeviceMem(); + + void MarkMemValid(const std::string &name, bool isValid); + + bool GetMemIsValid(const std::string &name); + + private: + std::map cuda_tensor_map_; + cudaStream_t stream_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_ALLOCATOR_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.cc new file mode 100644 index 00000000000..e13b08997ba --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.cc @@ -0,0 +1,56 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/tensorrt_context.h" + +namespace mindspore::lite { +TensorRTContext::~TensorRTContext() { + if (network_ != nullptr) { + network_->destroy(); + network_ = nullptr; + } +} + +bool TensorRTContext::Init() { + network_ = runtime_->GetBuilder()->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + if (network_ == nullptr) { + MS_LOG(ERROR) << "New network init failed."; + return false; + } + return true; +} + +void TensorRTContext::SetRuntime(TensorRTRuntime *runtime) { runtime_ = runtime; } + +nvinfer1::INetworkDefinition *TensorRTContext::network() { return network_; } + +void TensorRTContext::RegisterLayer(nvinfer1::ILayer *layer, const std::string &basename) { + if (layer == nullptr) { + MS_LOG(ERROR) << "Register null layer!"; + return; + } + layer->setName((basename + "_" + std::to_string(counter_++)).c_str()); +} + +void TensorRTContext::RegisterTensor(nvinfer1::ITensor *tensor, const std::string &basename) { + if (tensor == nullptr) { + MS_LOG(ERROR) << "Register null tensor!"; + return; + } + tensor->setName((basename + "_" + std::to_string(counter_++)).c_str()); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.h new file mode 100644 index 00000000000..bbcba89b223 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.h @@ -0,0 +1,40 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_TENSORRT_CONTEXT_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_TENSORRT_CONTEXT_H_ + +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" + +namespace mindspore::lite { +class TensorRTContext { + public: + TensorRTContext() = default; + ~TensorRTContext(); + bool Init(); + void SetRuntime(TensorRTRuntime *runtime); + nvinfer1::INetworkDefinition *network(); + void RegisterLayer(nvinfer1::ILayer *layer, const std::string &basename); + void RegisterTensor(nvinfer1::ITensor *tensor, const std::string &basename); + + private: + int counter_{0}; + nvinfer1::INetworkDefinition *network_{nullptr}; + TensorRTRuntime *runtime_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_TENSORRT_CONTEXT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.cc new file mode 100644 index 00000000000..1882681a8b6 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.cc @@ -0,0 +1,243 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/tensorrt_delegate.h" +#include +#include +#include +#include +#include "src/runtime/delegate/delegate_utils.h" +#include "src/runtime/delegate/auto_registration_factory.h" + +namespace mindspore::lite { +TensorRTDelegate::~TensorRTDelegate() { + if (runtime_ != nullptr) { + delete runtime_; + } + if (stream_ != nullptr) { + cudaStreamDestroy(stream_); + } +} +bool IsHardwareSupport() { + int driver_version = 0; + int ret = cudaDriverGetVersion(&driver_version); + if (ret != cudaSuccess || driver_version == 0) { + MS_LOG(WARNING) << "No nvidia GPU driver."; + return false; + } + return true; +} + +Status TensorRTDelegate::Init() { + if (!IsHardwareSupport()) { + return mindspore::kLiteNotSupport; + } + std::vector> device_list = context_->MutableDeviceInfo(); + auto iter = std::find_if(device_list.begin(), device_list.end(), [](std::shared_ptr device) { + return device->GetDeviceType() == DeviceType::kGPU; + }); + if (iter == device_list.end()) { + MS_LOG(ERROR) << "no gpu device info found for TensorRT."; + return mindspore::kLiteError; + } + auto gpu_info = (*iter)->Cast(); + if (gpu_info == nullptr) { + MS_LOG(ERROR) << "no gpu device info found for TensorRT."; + return mindspore::kLiteError; + } + device_info_ = gpu_info; + int ret = lite::SetCudaDevice(device_info_); + if (ret != RET_OK) { + return mindspore::kLiteError; + } + if (runtime_ == nullptr) { + runtime_ = new (std::nothrow) TensorRTRuntime(); + if (runtime_ == nullptr) { + MS_LOG(ERROR) << "create TensorRTRuntime failed."; + return mindspore::kLiteError; + } + } + if (runtime_->Init() != RET_OK) { + MS_LOG(ERROR) << "TensorRTRuntime init failed."; + return mindspore::kLiteError; + } + runtime_->SetDeviceID(device_info_->GetDeviceID()); + + auto cuda_ret = cudaStreamCreate(&stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda create stream failed"; + return mindspore::kLiteError; + } + + cache_mgr_ = std::make_shared(); + if (cache_mgr_ == nullptr) { + MS_LOG(ERROR) << "malloc EmbeddingCacheManager failed."; + return kLiteMemoryFailed; + } + auto cache_ret = cache_mgr_->Init(cache_model_path_, vocab_size_, device_cache_size_); + if (cache_ret != mindspore::kSuccess) { + MS_LOG(ERROR) << "cache_mgr_ init failed."; + return cache_ret; + } + + return mindspore::kSuccess; +} + +Status TensorRTDelegate::BuildSubGraph(DelegateModel *model) { + KernelIter from, end; + std::vector tensorrt_ops; + int tensorrt_subgraph_index = 0; + for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) { + kernel::Kernel *kernel = *iter; + auto tensorrt_op = FindTensorRTOp(kernel, model->GetPrimitive(kernel)); + if (tensorrt_op != nullptr) { + if (cache_mgr_->CheckIsCacheKernel(kernel)) { + auto cache_ret = cache_mgr_->InitCacheKernel(kernel, device_info_->GetDeviceID(), &stream_); + if (cache_ret != kSuccess) { + MS_LOG(ERROR) << "InitCacheKernel failed " << kernel->name(); + return cache_ret; + } + } + + // If tensorrt_ops does not equal nullptr, this kernel can be supported by delegate + if (tensorrt_ops.size() == 0) { + from = 
iter; + } + tensorrt_op->SetRuntime(this->runtime_); + tensorrt_ops.push_back(tensorrt_op); + end = iter; + } else { + if (tensorrt_ops.size() > 0) { + auto tensorrt_subgraph = CreateTensorRTGraph(tensorrt_ops, model, from, end, tensorrt_subgraph_index); + if (tensorrt_subgraph == nullptr) { + MS_LOG(ERROR) << "Create TensorRT Graph failed."; + return mindspore::kLiteNullptr; + } + tensorrt_subgraph_index++; + iter = model->Replace(from, end + 1, tensorrt_subgraph); + tensorrt_ops.clear(); + } + } + } + if (tensorrt_ops.size() > 0) { + auto tensorrt_subgraph = CreateTensorRTGraph(tensorrt_ops, model, from, end, tensorrt_subgraph_index); + if (tensorrt_subgraph == nullptr) { + MS_LOG(ERROR) << "Create TensorRT Graph failed."; + return mindspore::kLiteNullptr; + } + model->Replace(from, end + 1, tensorrt_subgraph); + tensorrt_ops.clear(); + } + return mindspore::kSuccess; +} + +Status TensorRTDelegate::Build(DelegateModel *model) { + int ret = lite::SetCudaDevice(device_info_); + if (ret != RET_OK) { + return mindspore::kLiteError; + } + if (cache_model_path_.empty() && vocab_size_ > 0) { + auto cache_ret = cache_mgr_->Init(model, vocab_size_, device_cache_size_); + if (cache_ret != mindspore::kSuccess) { + MS_LOG(ERROR) << "cache_mgr_ init failed."; + return cache_ret; + } + } + + auto build_ret = BuildSubGraph(model); + if (build_ret != kSuccess) { + MS_LOG(INFO) << "BuildSubGraph failed"; + return build_ret; + } + + return mindspore::kSuccess; +} + +TensorRTOp *TensorRTDelegate::FindTensorRTOp(kernel::Kernel *kernel, const schema::Primitive *primitive) { + auto in_tensors = kernel->inputs(); + auto out_tensors = kernel->outputs(); + auto name = kernel->name(); + auto node_type = primitive->value_type(); + auto &plugin_factory = AutoRegistrationFactory::Get(); + if (plugin_factory.HasKey(node_type)) { + TensorRTOp *tensorrt_op = + plugin_factory.GetCreator(node_type)(primitive, in_tensors, out_tensors, name, kernel->quant_type()); + if (tensorrt_op == nullptr) { + return nullptr; + } + if (!support_resize_) { + return tensorrt_op; + } + support_resize_ = tensorrt_op->GetDynamicShapeParams().support_dynamic_ ? support_resize_ : false; + if (!tensorrt_op->GetDynamicShapeParams().support_dynamic_) { + MS_LOG(WARNING) << "TensorRT subgraph don't support dynamic shape resize, because of op " << name; + support_hw_resize_ = false; + return tensorrt_op; + } + if (!support_hw_resize_) { + return tensorrt_op; + } + support_hw_resize_ = tensorrt_op->GetDynamicShapeParams().support_hw_dynamic_ ? support_hw_resize_ : false; + if (!tensorrt_op->GetDynamicShapeParams().support_hw_dynamic_) { + MS_LOG(WARNING) << "TensorRT subgraph don't support dynamic hw dims resize, because of op " << name; + } + return tensorrt_op; + } else { + MS_LOG(WARNING) << "Unsupported op type for TensorRT. 
kernel->name:" << kernel->name() + << " type:" << schema::EnumNamePrimitiveType(primitive->value_type()); + return nullptr; + } +} + +TensorRTSubGraph *TensorRTDelegate::CreateTensorRTGraph(const std::vector &ops, + DelegateModel *model, KernelIter from, + KernelIter end, int index) { + auto in_tensors = GraphInTensors(ops, model, from, end); + auto out_tensors = GraphOutTensors(ops, model, from, end); + auto *tensorrt_graph = new (std::nothrow) TensorRTSubGraph(ops, in_tensors, out_tensors, context_, device_info_, + runtime_, support_resize_, support_hw_resize_); + if (tensorrt_graph == nullptr) { + MS_LOG(ERROR) << "new tensorrt_graph failed."; + return nullptr; + } + tensorrt_graph->SetCacheManager(cache_mgr_); + if (serialize_path_.size() > 0) { + tensorrt_graph->SetSerializePath(serialize_path_ + "_trt" + std::to_string(GetRankID()) + ".bin_" + + std::to_string(index)); + } + + // 1. For every op, find pre and next ops + FindPreNextOps(ops); + + // 2. Init TensorRT SubGraph. + auto ret = tensorrt_graph->Init(stream_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "TensorRTGraph init failed."; + delete tensorrt_graph; + return nullptr; + } + + // 3. Build TensorRT Model. + ret = tensorrt_graph->BuildTensorRTGraph(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "TensorRTGraph build failed."; + delete tensorrt_graph; + return nullptr; + } + + return tensorrt_graph; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.h new file mode 100644 index 00000000000..aa543a669ff --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.h @@ -0,0 +1,70 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_DELEGATE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_DELEGATE_H_ +#include +#include +#include +#include +#include +#include "include/api/delegate.h" +#include "src/runtime/delegate/tensorrt/tensorrt_subgraph.h" +#include "src/runtime/delegate/parameter_cache/embedding_cache_manager.h" +#include "include/api/kernel.h" +#include "include/errorcode.h" +#include "src/common/log_adapter.h" +#include "include/api/context.h" + +namespace mindspore::lite { +class TensorRTDelegate : public Delegate { + public: + explicit TensorRTDelegate(mindspore::Context *context, const std::string &cache_model_path, size_t vocab_size, + size_t device_cache_size, const std::string &serialize_path) + : context_(context), + cache_model_path_(cache_model_path), + vocab_size_(vocab_size), + device_cache_size_(device_cache_size), + serialize_path_(serialize_path) {} + + ~TensorRTDelegate() override; + + Status Init() override; + + Status Build(DelegateModel *model) override; + + private: + Status BuildSubGraph(DelegateModel *model); + + TensorRTOp *FindTensorRTOp(kernel::Kernel *kernel, const schema::Primitive *primitive); + + TensorRTSubGraph *CreateTensorRTGraph(const std::vector &ops, DelegateModel *model, + KernelIter from, KernelIter end, int index); + + std::unordered_map op_func_lists_; + mindspore::Context *context_{nullptr}; + std::shared_ptr device_info_{nullptr}; + TensorRTRuntime *runtime_{nullptr}; + bool support_hw_resize_{true}; + bool support_resize_{true}; + const std::string cache_model_path_; + size_t vocab_size_{0}; + size_t device_cache_size_{0}; + std::shared_ptr cache_mgr_{nullptr}; + const std::string serialize_path_; + cudaStream_t stream_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_DELEGATE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.cc new file mode 100644 index 00000000000..73b0dc31287 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.cc @@ -0,0 +1,52 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" +#include +#include + +namespace mindspore::lite { +int TensorRTRuntime::Init() { + if (is_init_) { + return RET_OK; + } + builder_ = nvinfer1::createInferBuilder(this->logger_); + if (builder_ == nullptr) { + MS_LOG(ERROR) << "create infer builder failed."; + return RET_ERROR; + } + builder_->setMaxBatchSize(MAX_BATCH_SIZE); + allocator_ = new (std::nothrow) TensorRTAllocator(); + if (allocator_ == nullptr) { + MS_LOG(ERROR) << "Create allocator failed."; + return RET_ERROR; + } + is_init_ = true; + return RET_OK; +} + +TensorRTRuntime::~TensorRTRuntime() { + if (builder_ != nullptr) { + builder_->destroy(); + builder_ = nullptr; + } + if (allocator_ != nullptr) { + allocator_->ClearDeviceMem(); + delete allocator_; + allocator_ = nullptr; + } +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.h new file mode 100644 index 00000000000..29ccd3f701b --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.h @@ -0,0 +1,82 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_RUNTIME_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_RUNTIME_H_ +#include +#include "include/errorcode.h" +#include "src/runtime/delegate/tensorrt/tensorrt_allocator.h" +#include "src/common/log_adapter.h" +#define MAX_BATCH_SIZE 64 + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::lite { +class TensorRTLogger : public nvinfer1::ILogger { + void log(Severity severity, const char *msg) noexcept override { + if (severity == Severity::kINTERNAL_ERROR || severity == Severity::kERROR) { + MS_LOG(ERROR) << msg; + } else if (severity == Severity::kWARNING) { + MS_LOG(WARNING) << msg; + } else if (severity == Severity::kINFO) { + MS_LOG(INFO) << msg; + } else { + MS_LOG(DEBUG) << msg; + } + } +}; + +enum RuntimePrecisionMode : int { RuntimePrecisionMode_FP32, RuntimePrecisionMode_FP16 }; + +class TensorRTRuntime { + public: + TensorRTRuntime() = default; + + ~TensorRTRuntime(); + + int Init(); + + nvinfer1::IBuilder *GetBuilder() { return this->builder_; } + + int GetBatchSize() { return batch_size_; } + + void SetBatchSize(int batch_size) { batch_size_ = batch_size; } + + void SetCudaStream(cudaStream_t stream) { allocator_->SetCudaStream(stream); } + + RuntimePrecisionMode GetRuntimePrecisionMode() { return runtime_percision_mode_; } + + void SetRuntimePrecisionMode(RuntimePrecisionMode runtime_percision_mode) { + runtime_percision_mode_ = runtime_percision_mode; + } + + TensorRTAllocator *GetAllocator() { return this->allocator_; } + + void SetDeviceID(uint32_t device_id) { device_id_ = device_id; } + + uint32_t GetDeviceID() { return device_id_; } + + private: + bool is_init_ = false; + nvinfer1::IBuilder 
*builder_{nullptr}; + TensorRTLogger logger_; + TensorRTAllocator *allocator_{nullptr}; + int batch_size_{0}; + uint32_t device_id_{0}; + RuntimePrecisionMode runtime_percision_mode_{RuntimePrecisionMode::RuntimePrecisionMode_FP32}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_RUNTIME_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.cc new file mode 100644 index 00000000000..8047cac6e9e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.cc @@ -0,0 +1,63 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/tensorrt_serializer.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" +#include "src/common/file_utils.h" + +namespace mindspore::lite { +nvinfer1::ICudaEngine *TensorRTSerializer::GetSerializedEngine() { + if (serialize_file_path_.size() == 0) { + return nullptr; + } + void *trt_model_stream{nullptr}; + size_t size{0}; + trt_model_stream = ReadFile(serialize_file_path_.c_str(), &size); + if (trt_model_stream == nullptr || size == 0) { + MS_LOG(WARNING) << "read engine file failed : " << serialize_file_path_; + return nullptr; + } + nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger_); + if (runtime == nullptr) { + delete[] trt_model_stream; + MS_LOG(ERROR) << "createInferRuntime failed."; + return nullptr; + } + nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(trt_model_stream, size, nullptr); + delete[] trt_model_stream; + runtime->destroy(); + return engine; +} +void TensorRTSerializer::SaveSerializedEngine(nvinfer1::ICudaEngine *engine) { + if (serialize_file_path_.size() == 0) { + return; + } + nvinfer1::IHostMemory *ptr = engine->serialize(); + if (ptr == nullptr) { + MS_LOG(ERROR) << "serialize engine failed"; + return; + } + + int ret = WriteToBin(serialize_file_path_, ptr->data(), ptr->size()); + if (ret != RET_OK) { + MS_LOG(ERROR) << "save engine failed " << serialize_file_path_; + } else { + MS_LOG(INFO) << "save engine to " << serialize_file_path_; + } + ptr->destroy(); + return; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.h new file mode 100644 index 00000000000..d5ae0b1baf8 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.h @@ -0,0 +1,45 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
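A short sketch of the intended TensorRTRuntime lifecycle, based on the class declared above: one runtime instance is created per delegate, Init() creates the nvinfer1::IBuilder and the allocator exactly once, and precision, batch size and device id are shared by all subgraphs. The function name and the prefer_fp16 flag below are illustrative.

// Illustrative sketch: one TensorRTRuntime shared by all subgraphs of a delegate.
#include <new>
#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h"

int PrepareRuntime(mindspore::lite::TensorRTRuntime **out_runtime, uint32_t device_id, bool prefer_fp16) {
  auto *runtime = new (std::nothrow) mindspore::lite::TensorRTRuntime();
  if (runtime == nullptr || runtime->Init() != mindspore::lite::RET_OK) {
    delete runtime;
    return mindspore::lite::RET_ERROR;
  }
  runtime->SetDeviceID(device_id);
  if (prefer_fp16) {
    // The subgraph still verifies platformHasFastFp16() before setting the kFP16 builder flag.
    runtime->SetRuntimePrecisionMode(mindspore::lite::RuntimePrecisionMode_FP16);
  }
  *out_runtime = runtime;
  return mindspore::lite::RET_OK;
}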
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SERIALIZER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SERIALIZER_H_ +#include +#include +#include +#include "include/errorcode.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::lite { +class TensorRTSerializer { + public: + explicit TensorRTSerializer(const std::string &serialize_file_path) + : serialize_file_path_(std::move(serialize_file_path)) {} + + ~TensorRTSerializer() = default; + + nvinfer1::ICudaEngine *GetSerializedEngine(); + + void SaveSerializedEngine(nvinfer1::ICudaEngine *engine); + + private: + std::string serialize_file_path_; + TensorRTLogger logger_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SERIALIZER_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.cc new file mode 100644 index 00000000000..9fbceab0d3c --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.cc @@ -0,0 +1,681 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
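The serializer declared here provides a simple file-based engine cache: GetSerializedEngine() returns nullptr when no cache file exists (or the path is empty), and SaveSerializedEngine() is best effort and only logs on failure. A sketch of the intended round trip; BuildEngineOnline is a placeholder for the network build done by TensorRTSubGraph.

// Illustrative sketch: engine-cache round trip with TensorRTSerializer.
// BuildEngineOnline() is a placeholder for builder->buildEngineWithConfig(network, config).
#include <string>
#include <NvInfer.h>
#include "src/runtime/delegate/tensorrt/tensorrt_serializer.h"

nvinfer1::ICudaEngine *BuildEngineOnline();  // placeholder, defined elsewhere

nvinfer1::ICudaEngine *LoadOrBuildEngine(const std::string &path) {
  mindspore::lite::TensorRTSerializer serializer(path);
  nvinfer1::ICudaEngine *engine = serializer.GetSerializedEngine();  // nullptr when no cache file exists
  if (engine != nullptr) {
    return engine;  // skip network parsing and builder optimization entirely
  }
  engine = BuildEngineOnline();
  if (engine != nullptr) {
    serializer.SaveSerializedEngine(engine);  // no-op when the path is empty
  }
  return engine;
}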
+ */ + +#include "src/runtime/delegate/tensorrt/tensorrt_subgraph.h" +#include +#include +#include +#include +#include +#include +#include "src/runtime/delegate/delegate_utils.h" + +namespace mindspore::lite { +TensorRTSubGraph::~TensorRTSubGraph() { + if (ctx_ != nullptr) { + delete ctx_; + } + if (config_ != nullptr) { + config_->destroy(); + config_ = nullptr; + } + if (trt_context_ != nullptr) { + trt_context_->destroy(); + trt_context_ = nullptr; + } + if (engine_ != nullptr) { + engine_->destroy(); + engine_ = nullptr; + } + if (tensor_bindings_ != nullptr) { + delete[] tensor_bindings_; + tensor_bindings_ = nullptr; + } + for (auto op : all_ops_) { + delete op; + } +} + +int TensorRTSubGraph::Init(cudaStream_t stream) { + auto ret = GetGraphInOutOps(inputs_, outputs_, &in_ops_, &out_ops_, all_ops_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Get TensorRT subgraph input and output ops failed."; + return RET_ERROR; + } + profile_ = runtime_->GetBuilder()->createOptimizationProfile(); + if (profile_ == nullptr) { + MS_LOG(ERROR) << "createOptimizationProfile failed."; + return RET_ERROR; + } + ctx_ = new (std::nothrow) TensorRTContext(); + if (ctx_ == nullptr) { + MS_LOG(ERROR) << "New TensorRTContext failed."; + return RET_OK; + } + ctx_->SetRuntime(runtime_); + if (!ctx_->Init()) { + MS_LOG(ERROR) << "New TensorRTContext failed."; + return RET_OK; + } + if (SetDeviceConfig(stream) != RET_OK) { + MS_LOG(WARNING) << "set tensorrt config failed."; + } + serializer_ = std::make_shared(serialize_file_path_); + if (serializer_ == nullptr) { + MS_LOG(ERROR) << "create Serializer failed."; + return RET_ERROR; + } + engine_ = serializer_->GetSerializedEngine(); + if (engine_ != nullptr) { + MS_LOG(INFO) << "using serialized engine " << serialize_file_path_; + return RET_OK; + } + for (size_t i = 0; i < inputs_.size(); i++) { + if (inputs_[i].Shape().size() != DIMENSION_4D) { + input_hw_index_ = -1; + } + } + return RET_OK; +} + +int TensorRTSubGraph::BuildEngine() { + // print all network ops + if (this->config_->addOptimizationProfile(profile_) == -1) { + MS_LOG(ERROR) << "addOptimizationProfile failed."; + return RET_ERROR; + } + MS_LOG(INFO) << "build engine for tensorrt network: " << ctx_->network()->getName(); + for (int i = 0; i < ctx_->network()->getNbLayers(); i++) { + MS_LOG(DEBUG) << "tensorrt op: " << ctx_->network()->getLayer(i)->getName(); + } + MS_LOG(DEBUG) << "end of tensorrt network: " << ctx_->network()->getName(); + + this->engine_ = runtime_->GetBuilder()->buildEngineWithConfig(*ctx_->network(), *this->config_); + if (this->engine_ == nullptr) { + MS_LOG(ERROR) << "Create engine failed in TensorRT network"; + return RET_ERROR; + } + if (serialize_file_path_.size() > 0) { + serializer_->SaveSerializedEngine(engine_); + } + return RET_OK; +} + +int TensorRTSubGraph::SetDeviceConfig(cudaStream_t stream) { + if (config_ == nullptr) { + this->config_ = runtime_->GetBuilder()->createBuilderConfig(); + if (this->config_ == nullptr) { + MS_LOG(ERROR) << "create builder config failed."; + return RET_ERROR; + } + } + // set fp16 + if (device_info_->GetEnableFP16() && runtime_->GetBuilder()->platformHasFastFp16()) { + MS_LOG(INFO) << "set fp16 flag successfully for tensorrt."; + config_->setFlag(nvinfer1::BuilderFlag::kFP16); + runtime_->SetRuntimePrecisionMode(RuntimePrecisionMode_FP16); + } + + // set int8 + if (IsInt8Mode() && runtime_->GetBuilder()->platformHasFastInt8()) { + MS_LOG(INFO) << "set int8 flag successfully for tensorrt."; + 
config_->setFlag(nvinfer1::BuilderFlag::kINT8); + // Mark calibrator as null + config_->setInt8Calibrator(nullptr); + input_hw_index_ = -1; + } else { + MS_LOG(INFO) << "inputs no quant params or platform not support int8."; + } + runtime_->SetCudaStream(stream); + config_->setProfileStream(stream); + stream_ = stream; + MS_LOG(INFO) << GetRankID() << " tensorrt subgraph stream: " << stream_; + + // config setMaxWorkspaceSize to 1152 MB for max limit + config_->setMaxWorkspaceSize(1152 * (1 << 20)); + return RET_OK; +} + +bool TensorRTSubGraph::IsInt8Mode() { + for (auto cur_op : all_ops_) { + if (cur_op->GetQuantType() == schema::QuantType_QUANT_ALL) { + return true; + } + } + return false; +} + +nvinfer1::ITensor *TensorRTSubGraph::SetTensorRTNetworkInput(const mindspore::MSTensor &in_tensor) { + for (int i = 0; i < ctx_->network()->getNbInputs(); i++) { + if (in_tensor.Name().compare(ctx_->network()->getInput(i)->getName()) == 0) { + MS_LOG(INFO) << "input tensor is already added in network: " << in_tensor.Name(); + return ctx_->network()->getInput(i); + } + } + + auto cuda_dtype = ConvertDataType(in_tensor.DataType()); + if (static_cast(cuda_dtype) == -1) { + MS_LOG(ERROR) << "Unsupported input data type " << static_cast(in_tensor.DataType()); + return nullptr; + } + nvinfer1::Dims input_dims = ParseInputDimsProfile(in_tensor); + MS_LOG(INFO) << "add network input: " << in_tensor.Name(); + return ctx_->network()->addInput(in_tensor.Name().c_str(), cuda_dtype, input_dims); +} + +nvinfer1::Dims TensorRTSubGraph::ParseInputDimsProfile(const mindspore::MSTensor &in_tensor) { + nvinfer1::Dims input_dims = ConvertCudaDims(in_tensor.Shape()); + if (profile_ == nullptr) { + MS_LOG(ERROR) << "profile is null."; + return input_dims; + } + if (runtime_->GetBatchSize() == 0) { + runtime_->SetBatchSize(input_dims.d[0]); + MS_LOG(INFO) << "batch size init as " << runtime_->GetBatchSize(); + if (input_batchsize_index_ != -1) { + input_dims.d[0] = -1; // dynamic batch size with wildcard N, default batchsize is first dims + input_batchsize_index_ = 0; + } + } else { + if (input_batchsize_index_ != -1) { + for (int n = 0; n < input_dims.nbDims; n++) { + if (input_dims.d[n] == runtime_->GetBatchSize()) { + runtime_->SetBatchSize(std::max(input_dims.d[0], runtime_->GetBatchSize())); + // first dims equals to batchsize + input_dims.d[n] = -1; + input_batchsize_index_ = n; + break; + } + } + } + } + // only support NHWC HW dim resize + if (input_hw_index_ != -1) { + MS_LOG(INFO) << "input tensor format is (NHWC:1, NCHW:0): " << in_tensor.format(); + input_hw_index_ = in_tensor.format() == Format::NHWC ? 
1 : 2; // NCHW is 2 + input_dims.d[input_hw_index_] = -1; + input_dims.d[input_hw_index_ + 1] = -1; + } + // We do not need to check the return of setDimension and addOptimizationProfile here as all dims are explicitly set + nvinfer1::Dims input_dims_min = ConvertCudaDims(in_tensor.Shape()); + if (input_batchsize_index_ != -1) { + input_dims_min.d[input_batchsize_index_] = 1; + if (input_hw_index_ != -1) { + input_dims_min.d[input_hw_index_] = 1; + input_dims_min.d[input_hw_index_ + 1] = 1; + } + } + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMIN, input_dims_min)) { + MS_LOG(ERROR) << "setDimensions of kMIN failed for " << in_tensor.Name(); + return input_dims; + } + nvinfer1::Dims input_dims_opt = ConvertCudaDims(in_tensor.Shape()); + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kOPT, input_dims_opt)) { + MS_LOG(ERROR) << "setDimensions of kOPT failed for " << in_tensor.Name(); + return input_dims; + } + nvinfer1::Dims input_dims_max = ConvertCudaDims(in_tensor.Shape()); + // input_dims_max should be the same with input network dims + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMAX, input_dims_max)) { + MS_LOG(ERROR) << "setDimensions of kMAX failed for " << in_tensor.Name(); + return input_dims; + } + return input_dims; +} + +int TensorRTSubGraph::ParseInputsProfile() { + MS_LOG(INFO) << "using serialied engine."; + for (auto in_tensor : inputs_) { + auto dim = ParseInputDimsProfile(in_tensor); + if (dim.nbDims <= 0) { + MS_LOG(ERROR) << "input dims is invalid."; + return RET_ERROR; + } + } + return RET_OK; +} + +int TensorRTSubGraph::BuildTensorRTGraph() { + MS_ASSERT(!all_ops_.empty()); + int ret; + if (engine_ != nullptr) { + return ParseInputsProfile(); + } + // build engine online + for (auto cur_op : all_ops_) { + cur_op->SetRuntime(runtime_); + for (auto in_tensor : cur_op->inputs()) { + // Data From CPU + if (IsSubGraphInputTensor(this->inputs(), in_tensor)) { + nvinfer1::ITensor *trt_tensor = SetTensorRTNetworkInput(in_tensor); + if (trt_tensor == nullptr) { + MS_LOG(ERROR) << "SetTensorRTNetworkInput failed for " << in_tensor.Name(); + return RET_ERROR; + } +#if TRT_VERSION_GE(7, 2) + // avoid bool input tensor + if (trt_tensor->getType() == nvinfer1::DataType::kBOOL) { + trt_tensor = TRTTensorCast(ctx_, trt_tensor, nvinfer1::DataType::kINT32, in_tensor.Name() + "_cast_int32"); + } +#endif + cur_op->AddInnerInTensors(ITensorHelper{trt_tensor, in_tensor.format(), true}); + continue; + } + + ITensorHelper trt_tensor = FindTensorRTInputs(cur_op, in_tensor); + if (trt_tensor.trt_tensor_ == nullptr) { + // weight tensor + if (IsCached(cur_op, in_tensor) && in_tensor.Data() != nullptr) { + ret = HandleCacheTensor(cur_op, in_tensor); + if (ret != RET_OK) { + MS_LOG(ERROR) << "HandleCacheTensor failed for " << in_tensor.Name(); + return RET_ERROR; + } + } else if (trt_specific_weight_nodes_.find(cur_op->type()) == trt_specific_weight_nodes_.end()) { + if (in_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "Weight Tensor data is nullptr."; + return RET_ERROR; + } + trt_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx_, in_tensor, cur_op->GetOpName()); + trt_tensor.format_ = Format::NHWC; + MS_LOG(INFO) << "auto convert constant tensor for: " << in_tensor.Name(); + cur_op->AddInnerInTensors(trt_tensor); + } + } else { + cur_op->AddInnerInTensors(trt_tensor); + } + } + MS_LOG(DEBUG) << "Parsing TensorRT op for " << cur_op->GetOpName(); + + ret = 
cur_op->AddInnerOp(ctx_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Add op failed in TensorRT network: " << cur_op->GetOpName(); + return RET_ERROR; + } + ret = cur_op->SetInt8DynamicRange(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Set Int8 dynamic range failed in TensorRT network: " << cur_op->GetOpName(); + return RET_ERROR; + } + } + ret = MarkOutputs(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "MarkOutputs failed in TensorRT network"; + return ret; + } + + std::string network_name = "network_" + std::string(ctx_->network()->getInput(0)->getName()) + "_" + + std::string(ctx_->network()->getOutput(0)->getName()); + ctx_->network()->setName(network_name.c_str()); + this->name_ = network_name; + ret = BuildEngine(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Create engine failed in TensorRT network"; + return ret; + } + return RET_OK; +} + +int TensorRTSubGraph::MarkOutputs() { + // Mark NetWork Output Tensor. + for (const auto &out_tensor : outputs_) { + for (auto out_op : this->out_ops_) { + for (size_t index = 0; index < out_op->outputs().size(); index++) { + if (out_op->outputs()[index] == out_tensor) { + MS_LOG(INFO) << "markOutput for: " << out_tensor.Name(); + nvinfer1::ITensor *out_trt_tensor = out_op->GetInnerOutTensor()[index].trt_tensor_; + if (out_op->GetInnerOutTensor()[index].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + out_op->GetInnerOutTensor()[index].format_ == Format::NCHW && + !SameDims(out_op->GetInnerOutTensor()[index].trt_tensor_->getDimensions(), out_tensor.Shape())) { + // transpose subgraph output from nchw to nhwc + nvinfer1::IShuffleLayer *transpose_layer_out = + NCHW2NHWC(ctx_, *out_op->GetInnerOutTensor()[index].trt_tensor_); + if (transpose_layer_out == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + transpose_layer_out->setName((out_tensor.Name() + "_transpose2NHWC").c_str()); + out_trt_tensor = transpose_layer_out->getOutput(0); + } + + out_trt_tensor->setName(out_tensor.Name().c_str()); + ctx_->network()->markOutput(*out_trt_tensor); + for (int n = 0; n < out_trt_tensor->getDimensions().nbDims; n++) { + if (out_trt_tensor->getDimensions().d[n] == -1) { + output_batchsize_index_ = n; + break; + } + } + } + } + } + } + return RET_OK; +} + +int TensorRTSubGraph::Prepare() { + int ret = lite::SetCudaDevice(device_info_); + if (ret != RET_OK) { + return ret; + } + if (this->engine_ == nullptr) { + MS_LOG(ERROR) << "engine_ is null in this builder_"; + return RET_ERROR; + } + this->trt_context_ = this->engine_->createExecutionContext(); + if (this->trt_context_ == nullptr) { + MS_LOG(ERROR) << "TensorRTSubGraph create context failed."; + return RET_ERROR; + } + int binding_num = this->engine_->getNbBindings(); + tensor_bindings_ = new (std::nothrow) void *[binding_num]; + if (tensor_bindings_ == nullptr) { + MS_LOG(ERROR) << "malloc tensor binding array failed."; + return RET_ERROR; + } + + for (auto tensor : inputs_) { + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor, tensor.DataSize()); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "malloc for inputs tensor device memory failed."; + return RET_ERROR; + } + int index = this->engine_->getBindingIndex(tensor.Name().c_str()); + tensor_bindings_[index] = device_ptr; + trt_in_tensor_name_.push_back(tensor.Name()); + nvinfer1::Dims input_dims = ConvertCudaDims(tensor.Shape()); + for (int od = 0; od < input_dims.nbDims; od++) { + MS_LOG(DEBUG) << "in tensor " << tensor.Name() << " dims at " << od << " is " << input_dims.d[od]; + } + + if 
(!this->trt_context_->setBindingDimensions(index, input_dims)) { + MS_LOG(ERROR) << "invalid input dims of " << tensor.Name(); + return RET_ERROR; + } + } + + // malloc for cache weight tensor + for (auto cache_tensor : cache_const_inputs_) { + size_t data_size = cache_mgr_->GetCacheDataSize(cache_tensor); + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(cache_tensor, data_size); + runtime_->GetAllocator()->MarkMemValid(cache_tensor.Name().c_str(), true); + int index = this->engine_->getBindingIndex(cache_tensor.Name().c_str()); + tensor_bindings_[index] = device_ptr; + auto cache_ret = cache_mgr_->SetDeviceCacheAddr(cache_tensor.Name(), device_ptr, data_size); + if (cache_ret != kSuccess) { + MS_LOG(ERROR) << "SetDeviceCacheAddr failed, cache tensor: " << cache_tensor.Name(); + return RET_ERROR; + } + } + + if (!this->trt_context_->allInputDimensionsSpecified()) { + MS_LOG(ERROR) << "input dims need to be specified."; + return RET_ERROR; + } + for (auto op : all_ops_) { + ret = op->Prepare(tensor_bindings_, engine_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "prepare op failed of " << op->GetOpName(); + return RET_ERROR; + } + } + for (auto tensor : outputs_) { + (void)tensor.MutableData(); + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor, tensor.DataSize()); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "malloc for outputs tensor device memory failed."; + return RET_ERROR; + } + int index = this->engine_->getBindingIndex(tensor.Name().c_str()); + tensor_bindings_[index] = device_ptr; + trt_out_tensor_name_.push_back(tensor.Name()); + } + return RET_OK; +} + +int TensorRTSubGraph::ReSize() { + if (input_batchsize_index_ == -1) { + MS_LOG(ERROR) << "current network don't support resize."; + return RET_ERROR; + } + for (size_t i = 0; i < trt_in_tensor_name_.size(); i++) { + if (ctx_->network() != nullptr) { + for (int j = 0; j < ctx_->network()->getNbInputs(); j++) { + if (trt_in_tensor_name_[i].compare(ctx_->network()->getInput(j)->getName()) != 0) { + continue; + } + nvinfer1::Dims construct_dims = ctx_->network()->getInput(j)->getDimensions(); + bool ret = ValidInputResizeDims(construct_dims, inputs_[i].Shape()); + if (!ret) { + MS_LOG(ERROR) << "input resize shape is invalid."; + return RET_ERROR; + } + } + } + + MS_LOG(INFO) << "resize at input_batch_index " << input_batchsize_index_ << ", update batch size to " + << inputs_[i].Shape()[input_batchsize_index_]; + runtime_->SetBatchSize(inputs_[i].Shape()[input_batchsize_index_]); + + // inputs_ is dupulated by mindrt, name is untustable. 
+ auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(trt_in_tensor_name_[i], inputs_[i].DataSize(), + ConvertDataType(inputs_[i].DataType())); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "realloc for input tensor device memory failed."; + return RET_ERROR; + } + int index = this->engine_->getBindingIndex(trt_in_tensor_name_[i].c_str()); + tensor_bindings_[index] = device_ptr; + // Set actual input size + nvinfer1::Dims input_dims = ConvertCudaDims(inputs_[i].Shape()); + for (int od = 0; od < input_dims.nbDims; od++) { + MS_LOG(DEBUG) << "in tensor " << trt_in_tensor_name_[i] << " dims at " << od << " is " << input_dims.d[od]; + } + + if (!this->trt_context_->setBindingDimensions(index, input_dims)) { + MS_LOG(ERROR) << "invalid input dims of " << inputs_[i].Name(); + return RET_ERROR; + } + } + if (!this->trt_context_->allInputDimensionsSpecified()) { + MS_LOG(ERROR) << "input dims need to be specified."; + return RET_ERROR; + } + + for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) { + int index = this->engine_->getBindingIndex(trt_out_tensor_name_[i].c_str()); + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(trt_out_tensor_name_[i], outputs_[i].DataSize(), + ConvertDataType(outputs_[i].DataType())); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "realloc for outputs tensor device memory failed."; + return RET_ERROR; + } + tensor_bindings_[index] = device_ptr; + } + return RET_OK; +} + +bool TensorRTSubGraph::ValidInputResizeDims(const nvinfer1::Dims &construct_dims, + const std::vector &resize_input_shape) { + if (static_cast(construct_dims.nbDims) != resize_input_shape.size()) { + MS_LOG(ERROR) << "invalid resize input."; + return false; + } + if (input_hw_index_ == -1) { + // only NHWC format support HW resize, otherwise only support batchsize resize + for (int d = 0; d < construct_dims.nbDims; d++) { + if (d != input_batchsize_index_ && construct_dims.d[d] != resize_input_shape[d]) { + MS_LOG(ERROR) << "only support dynamic batch size resize input."; + return false; + } + } + } else if ((input_hw_index_ == 1 && construct_dims.d[DIMENSION_3D] != resize_input_shape[DIMENSION_3D]) || + (input_hw_index_ == DIMENSION_2D && construct_dims.d[1] != resize_input_shape[1])) { + // input may be nhwc || nchw + MS_LOG(ERROR) << "don't support dynamic channel resize input."; + return false; + } + return true; +} + +int TensorRTSubGraph::Execute() { + int ret = lite::SetCudaDevice(device_info_); + if (ret != RET_OK) { + return ret; + } + for (size_t i = 0; i < inputs_.size(); i++) { + if (runtime_->GetAllocator()->GetMemIsValid(trt_in_tensor_name_[i])) { + MS_LOG(INFO) << "no need memcpy to cuda for input tensor: " << trt_in_tensor_name_[i]; + continue; + } + + auto iter = model_input_to_cache_tensors_.find(trt_in_tensor_name_[i]); + if (iter != model_input_to_cache_tensors_.end()) { + for (auto &cache_tensor : iter->second) { + ret = cache_mgr_->CacheHandle(cache_tensor.Name(), inputs_[i], + runtime_->GetAllocator()->GetDevicePtr(trt_in_tensor_name_[i])); + if (ret != RET_OK) { + MS_LOG(ERROR) << "handle cache failed " << trt_in_tensor_name_[i]; + return RET_ERROR; + } + runtime_->GetAllocator()->MarkMemValid(trt_in_tensor_name_[i], true); + MS_LOG(DEBUG) << cache_tensor.Name() << " CacheHandle succ " << trt_in_tensor_name_[i]; + } + continue; + } + + ret = runtime_->GetAllocator()->SyncMemInHostAndDevice(inputs_[i], trt_in_tensor_name_[i], true); + if (ret != RET_OK) { + MS_LOG(ERROR) << "sync mem from host to device failed for " << trt_in_tensor_name_[i]; + 
return ret; + } + runtime_->GetAllocator()->MarkMemValid(trt_in_tensor_name_[i], true); + } + + if (!this->trt_context_->executeV2(tensor_bindings_)) { + MS_LOG(ERROR) << "TensorRT execute failed."; + return RET_ERROR; + } + + for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) { + int index = this->engine_->getBindingIndex(trt_out_tensor_name_[i].c_str()); + // actual output tensor dims + auto out_dims = this->trt_context_->getBindingDimensions(index); + std::vector new_shape = lite::ConvertMSShape(out_dims); + // batchsize resize need set new batch size + if (input_batchsize_index_ != -1) { + if (runtime_->GetBatchSize() != new_shape[output_batchsize_index_]) { + new_shape[output_batchsize_index_] = runtime_->GetBatchSize(); + } + } + for (int od = 0; od < out_dims.nbDims; od++) { + MS_LOG(DEBUG) << "out tensor " << trt_out_tensor_name_[i] << " dims at " << od << " is " << new_shape[od]; + } + outputs_[i].SetShape(new_shape); + + if (outputs_[i].MutableData() == nullptr) { + MS_LOG(ERROR) << "realloc for outputs tensor failed."; + return RET_ERROR; + } + runtime_->GetAllocator()->MarkMemValid(trt_out_tensor_name_[i], true); + int sync_ret = runtime_->GetAllocator()->SyncMemInHostAndDevice(outputs_[i], trt_out_tensor_name_[i], false); + if (sync_ret != RET_OK) { + MS_LOG(ERROR) << "sync mem from device to host failed for " << trt_out_tensor_name_[i]; + return sync_ret; + } + runtime_->GetAllocator()->MarkMemValid(trt_out_tensor_name_[i], false); + } + // make mem invalid, prepare for next execute + for (size_t i = 0; i < inputs_.size(); i++) { + runtime_->GetAllocator()->MarkMemValid(trt_in_tensor_name_[i], false); + } + return RET_OK; +} + +ITensorHelper TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor) { + for (auto input_op : cur_op->in_ops()) { + for (size_t i = 0; i < input_op->outputs().size(); i++) { + auto out_tensor = input_op->outputs().at(i); + if (in_tensor.Name().compare(out_tensor.Name()) == 0) { + return input_op->GetInnerOutTensor().at(i); + } + } + } + return ITensorHelper{}; +} +bool TensorRTSubGraph::IsCached(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor) { + return cache_mgr_ != nullptr && cache_mgr_->IsCacheTensor(in_tensor); +} + +void TensorRTSubGraph::FindCacheTensorInfo(TensorRTOp *cur_op, mindspore::MSTensor device_cache_tensor) { + auto iter = network_cache_tensor_info_.find(cur_op->GetOpName()); + if (iter != network_cache_tensor_info_.end()) { + return; + } + std::queue front_ops; + front_ops.push(cur_op); + network_cache_tensor_info_[cur_op->GetOpName()].front_op_can_cache_ = true; + iter = network_cache_tensor_info_.find(cur_op->GetOpName()); + while (!front_ops.empty()) { + auto front_op = front_ops.front(); + iter->second.front_op_can_cache_ = CanOpCache(front_op) ? 
iter->second.front_op_can_cache_ : false; + for (auto in_tensor : front_op->inputs()) { + if (IsSubGraphInputTensor(this->inputs(), in_tensor)) { + iter->second.network_input_tensor_.push_back(in_tensor); + model_input_to_cache_tensors_[in_tensor.Name()].push_back(device_cache_tensor); + MS_LOG(DEBUG) << cur_op->GetOpName() << "'s network input tensor name is " << in_tensor.Name() + << ", can cache: " << iter->second.front_op_can_cache_; + } + } + for (auto fronts_op : front_op->in_ops()) { + front_ops.push(fronts_op); + } + front_ops.pop(); + } +} + +bool TensorRTSubGraph::CanOpCache(TensorRTOp *cur_op) { return true; } + +int TensorRTSubGraph::HandleCacheTensor(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor) { + FindCacheTensorInfo(cur_op, in_tensor); + // cache kernel weight tensor + cache_const_inputs_.push_back(in_tensor); + auto shape = cache_mgr_->GetCacheShape(in_tensor); + MS_LOG(INFO) << "auto add cache constant tensor for: " << in_tensor.Name(); + auto cuda_dtype = ConvertDataType(in_tensor.DataType()); + nvinfer1::Dims input_dims = ConvertCudaDims(shape); + nvinfer1::ITensor *cache_input = ctx_->network()->addInput(in_tensor.Name().c_str(), cuda_dtype, input_dims); + if (cache_input == nullptr) { + MS_LOG(ERROR) << "add cache Weight Tensor data is nullptr."; + return RET_ERROR; + } + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMIN, input_dims)) { + MS_LOG(ERROR) << "setDimensions of kMIN failed for " << in_tensor.Name(); + return RET_ERROR; + } + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kOPT, input_dims)) { + MS_LOG(ERROR) << "setDimensions of kOPT failed for " << in_tensor.Name(); + return RET_ERROR; + } + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMAX, input_dims)) { + MS_LOG(ERROR) << "setDimensions of kMAX failed for " << in_tensor.Name(); + return RET_ERROR; + } + ITensorHelper trt_tensor{cache_input, Format::NHWC, true}; + cur_op->AddInnerInTensors(trt_tensor); + return RET_OK; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.h new file mode 100644 index 00000000000..7134f450e37 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.h @@ -0,0 +1,159 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SUBGRAPH_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SUBGRAPH_H_ +#include +#include +#include +#include +#include +#include +#include "include/api/kernel.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/tensorrt_serializer.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/parameter_cache/embedding_cache_manager.h" +#include "include/api/context.h" + +namespace mindspore::lite { +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +struct CacheTensorInfo { + std::vector network_input_tensor_; + bool front_op_can_cache_; +}; + +class TensorRTSubGraph : public kernel::Kernel { + public: + TensorRTSubGraph(std::vector ops, const std::vector &inputs, + const std::vector &outputs, const mindspore::Context *ctx, + std::shared_ptr device_info, TensorRTRuntime *runtime, bool support_resize, + bool support_hw_resize) + : kernel::Kernel(inputs, outputs, nullptr, ctx), + all_ops_(std::move(ops)), + device_info_(device_info), + runtime_(runtime) { + trt_specific_weight_nodes_ = { + schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_ReduceFusion, schema::PrimitiveType_Transpose, + schema::PrimitiveType_Gather, schema::PrimitiveType_Reshape, schema::PrimitiveType_PowFusion, + schema::PrimitiveType_AddFusion, schema::PrimitiveType_DivFusion, schema::PrimitiveType_SubFusion, + schema::PrimitiveType_MatMulFusion, schema::PrimitiveType_PowFusion, schema::PrimitiveType_Eltwise, + schema::PrimitiveType_ScaleFusion, schema::PrimitiveType_MulFusion, schema::PrimitiveType_Minimum, + schema::PrimitiveType_StridedSlice, schema::PrimitiveType_PadFusion, schema::PrimitiveType_FullConnection, + schema::PrimitiveType_Cast, schema::PrimitiveType_ExpandDims, schema::PrimitiveType_Resize, + schema::PrimitiveType_Maximum, schema::PrimitiveType_BiasAdd, schema::PrimitiveType_LSTM, + schema::PrimitiveType_RealDiv, schema::PrimitiveType_LayerNormFusion, schema::PrimitiveType_Greater, + schema::PrimitiveType_Less, schema::PrimitiveType_TopKFusion, schema::PrimitiveType_TileFusion, + schema::PrimitiveType_Equal}; + if (!support_resize) { + input_batchsize_index_ = -1; + input_hw_index_ = -1; + } + if (!support_hw_resize) { + input_hw_index_ = -1; + } + } + + ~TensorRTSubGraph() override; + + int Prepare() override; + + int Execute() override; + + int ReSize(); + + int BuildTensorRTGraph(); + + int Init(cudaStream_t stream); + + void SetCacheManager(const std::shared_ptr &cache_mgr) { cache_mgr_ = cache_mgr; } + + void SetSerializePath(const std::string &path) { serialize_file_path_ = std::move(path); } + + private: + int BuildEngine(); + + int SetDeviceConfig(cudaStream_t stream); + + bool IsInt8Mode(); + + bool SupportFP16(); + + nvinfer1::ITensor *SetTensorRTNetworkInput(const mindspore::MSTensor &in_tensor); + + ITensorHelper FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor); + + int MarkOutputs(); + + bool IsCached(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor); + + void FindCacheTensorInfo(TensorRTOp *cur_op, mindspore::MSTensor device_cache_tensor); + + bool CanOpCache(TensorRTOp *cur_op); + + int HandleCacheTensor(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor); + + nvinfer1::Dims ParseInputDimsProfile(const mindspore::MSTensor &in_tensor); + int ParseInputsProfile(); + + bool ValidInputResizeDims(const 
nvinfer1::Dims &construct_dims, const std::vector &resize_input_shape); + + std::vector all_ops_{}; + // subgraph input nodes. + std::vector in_ops_{}; + // subgraph output nodes. + std::vector out_ops_{}; + + void **tensor_bindings_{nullptr}; + + std::shared_ptr device_info_{nullptr}; + + TensorRTRuntime *runtime_{nullptr}; // all subgraph in one delegate share a runtime_ + + std::set trt_specific_weight_nodes_; + + // save in/out tensor name for subgraph isolate. + std::vector trt_in_tensor_name_; + std::vector trt_out_tensor_name_; + + std::vector cache_const_inputs_; + std::map network_cache_tensor_info_; + + nvinfer1::INetworkDefinition *network_{nullptr}; + nvinfer1::IBuilderConfig *config_{nullptr}; + nvinfer1::ICudaEngine *engine_{nullptr}; + nvinfer1::IExecutionContext *trt_context_{nullptr}; + nvinfer1::IOptimizationProfile *profile_{nullptr}; + + TensorRTContext *ctx_; + + // -1 means don't support resize + int input_batchsize_index_{0}; + int output_batchsize_index_{0}; + int input_hw_index_{0}; + + std::map> model_input_to_cache_tensors_; + + std::shared_ptr cache_mgr_{nullptr}; + + std::shared_ptr serializer_{nullptr}; + + std::string serialize_file_path_; + cudaStream_t stream_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SUBGRAPH_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.cc new file mode 100644 index 00000000000..1e43b4b5fd0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.cc @@ -0,0 +1,721 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/op/cast_plugin.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" + +namespace mindspore::lite { +nvinfer1::Dims ConvertCudaDims(int data, size_t size) { + nvinfer1::Dims dims{}; + dims.nbDims = -1; + if (size > static_cast(dims.MAX_DIMS)) { + MS_LOG(ERROR) << "invalid shape size: " << size; + return dims; + } + dims.nbDims = size; + for (size_t i = 0; i < size; i++) { + dims.d[i] = data; + } + return dims; +} + +nvinfer1::Dims ConvertCudaDims(const void *data, int64_t size) { + nvinfer1::Dims dims{}; + dims.nbDims = -1; + if (size > static_cast(dims.MAX_DIMS)) { + MS_LOG(ERROR) << "invalid shape size: " << size; + return dims; + } + dims.nbDims = size; + const int *dims_data = static_cast(data); + for (int i = 0; i < size; i++) { + dims.d[i] = *(dims_data + i); + } + return dims; +} + +bool SameDims(nvinfer1::Dims dims, const std::vector &shape) { + if (dims.nbDims != static_cast(shape.size())) { + return false; + } + // dynamic dim, only channel dim know + for (int i = 0; i < dims.nbDims; i++) { + if (dims.d[i] == -1) { + continue; + } + if (dims.d[i] != shape[i]) { + return false; + } + } + return true; +} + +std::vector ConvertMSShape(const nvinfer1::Dims dims) { + std::vector shape; + for (int i = 0; i < dims.nbDims; i++) { + shape.push_back(dims.d[i]); + } + return shape; +} + +std::vector NHWC2NCHW(std::vector nhwc_shape) { + std::vector nchw_shape; + if (nhwc_shape.size() != DIMENSION_4D) { + return nhwc_shape; + } + nchw_shape.push_back(nhwc_shape[kNHWC_N]); + nchw_shape.push_back(nhwc_shape[kNHWC_C]); + nchw_shape.push_back(nhwc_shape[kNHWC_H]); + nchw_shape.push_back(nhwc_shape[kNHWC_W]); + return nchw_shape; +} + +nvinfer1::IShuffleLayer *SetTranspose(TensorRTContext *ctx, const nvinfer1::ITensor &input, + nvinfer1::Permutation permutation) { + nvinfer1::IShuffleLayer *layer = ctx->network()->addShuffle(const_cast(input)); + if (layer == nullptr) { + MS_LOG(ERROR) << "failed to create ShuffleLayer when create transpose op."; + return nullptr; + } + layer->setFirstTranspose(permutation); + return layer; +} + +nvinfer1::DataType ConvertDataType(DataType type_id) { + std::map data_type_map = { +#if TRT_VERSION_GE(7, 2) + {DataType::kNumberTypeBool, nvinfer1::DataType::kBOOL}, +#endif + {DataType::kNumberTypeInt8, nvinfer1::DataType::kINT8}, + {DataType::kNumberTypeInt32, nvinfer1::DataType::kINT32}, + {DataType::kNumberTypeFloat32, nvinfer1::DataType::kFLOAT}, + {DataType::kNumberTypeFloat16, nvinfer1::DataType::kHALF}, + }; + auto iter = data_type_map.find(type_id); + nvinfer1::DataType data_type; + if (iter != data_type_map.end()) { + data_type = iter->second; + } else { + data_type = nvinfer1::DataType::kFLOAT; + MS_LOG(WARNING) << "invalid data_type for TensorRT, need check: " << static_cast(type_id); + } + return data_type; +} + +cudaDataType ConvertDataType(nvinfer1::DataType type_id) { + std::map data_type_map = { + {nvinfer1::DataType::kINT8, CUDA_R_8I}, + {nvinfer1::DataType::kINT32, CUDA_R_32I}, + {nvinfer1::DataType::kFLOAT, CUDA_R_32F}, + {nvinfer1::DataType::kHALF, CUDA_R_16F}, + }; + auto iter = data_type_map.find(type_id); + cudaDataType data_type; + if (iter != data_type_map.end()) { + data_type = iter->second; + } else { + data_type = CUDA_R_32F; + MS_LOG(WARNING) << "invalid data_type for TensorRT, need check: " << static_cast(type_id); + } + return data_type; +} + 
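SetTranspose above only records a permutation on an IShuffleLayer; the shape effect is the usual gather of input dims by the permutation order, where output dim i takes input dim order[i]. A small standalone check of the two permutations used by the layout helpers here ({0, 3, 1, 2} for NHWC to NCHW and {0, 2, 3, 1} back); plain C++, illustrative only.

// Standalone check of the NHWC<->NCHW permutations used by the shuffle helpers.
#include <array>
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<int> Permute(const std::vector<int> &shape, const std::array<int, 4> &order) {
  std::vector<int> out(shape.size());
  for (size_t i = 0; i < shape.size(); ++i) {
    out[i] = shape[order[i]];  // output dim i takes input dim order[i]
  }
  return out;
}

int main() {
  const std::vector<int> nhwc{1, 224, 224, 3};
  const auto nchw = Permute(nhwc, {0, 3, 1, 2});  // NHWC -> NCHW
  assert((nchw == std::vector<int>{1, 3, 224, 224}));
  const auto back = Permute(nchw, {0, 2, 3, 1});  // NCHW -> NHWC
  assert(back == nhwc);
  return 0;
}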
+nvinfer1::IShuffleLayer *NHWC2NCHW(TensorRTContext *ctx, const nvinfer1::ITensor &input) { + // NHWC 0123 NCHW 0312 + nvinfer1::Permutation perm{{0, 3, 1, 2}}; + return SetTranspose(ctx, input, perm); +} + +nvinfer1::IShuffleLayer *NCHW2NHWC(TensorRTContext *ctx, const nvinfer1::ITensor &input) { + // NCHW 0123 NHWC 0231 + nvinfer1::Permutation perm{{0, 2, 3, 1}}; + return SetTranspose(ctx, input, perm); +} + +nvinfer1::ITensor *ConvertConstantTensor(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor, + const std::string &op_name) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; + return nullptr; + } + nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); + if (dims.nbDims == -1) { + MS_LOG(WARNING) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; + dims.nbDims = 1; + dims.d[0] = 1; + } + nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); + if (ms_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); + return nullptr; + } + nvinfer1::Weights weights{data_type, ms_tensor.Data().get(), ms_tensor.ElementNum()}; + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); + return constant_tensor->getOutput(0); +} + +nvinfer1::ITensor *ConvertScalarToITensor(TensorRTContext *ctx, size_t shape_size, const void *value, + const DataType data_type, const std::string &op_name) { + nvinfer1::Dims dims = ConvertCudaDims(1, shape_size); + if (dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name; + return nullptr; + } + nvinfer1::Weights weights{ConvertDataType(data_type), value, 1}; + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, op_name + "_constant"); + return constant_tensor->getOutput(0); +} + +std::experimental::optional TryConvertActivationType(schema::ActivationType activation_type) { + std::map action_map = { + {schema::ActivationType_RELU, ActivationParams{nvinfer1::ActivationType::kRELU, false, 0, false, 0}}, + {schema::ActivationType_SIGMOID, ActivationParams{nvinfer1::ActivationType::kSIGMOID, false, 0, false, 0}}, + {schema::ActivationType_TANH, ActivationParams{nvinfer1::ActivationType::kTANH, false, 0, false, 0}}, + {schema::ActivationType_LEAKY_RELU, ActivationParams{nvinfer1::ActivationType::kLEAKY_RELU, true, 0, false, 0}}, + {schema::ActivationType_ELU, ActivationParams{nvinfer1::ActivationType::kELU, true, 0, false, 0}}, + {schema::ActivationType_SELU, ActivationParams{nvinfer1::ActivationType::kSELU, true, 0, true, 0}}, + {schema::ActivationType_SOFTSIGN, ActivationParams{nvinfer1::ActivationType::kSOFTSIGN, false, 0, false, 0}}, + {schema::ActivationType_SOFTPLUS, ActivationParams{nvinfer1::ActivationType::kSOFTPLUS, true, 0, true, 0}}, + {schema::ActivationType_THRESHOLDRELU, + ActivationParams{nvinfer1::ActivationType::kTHRESHOLDED_RELU, true, 0, false, 0}}, + {schema::ActivationType_RELU6, ActivationParams{nvinfer1::ActivationType::kCLIP, true, 0, true, 6}}, + {schema::ActivationType_RELU1, ActivationParams{nvinfer1::ActivationType::kCLIP, true, 0, 
true, 1}}, + {schema::ActivationType_HARD_TANH, ActivationParams{nvinfer1::ActivationType::kCLIP, true, -1, true, 1}}, + // using plugin + {schema::ActivationType_GELU, ActivationParams{nvinfer1::ActivationType::kTHRESHOLDED_RELU, false, 0, false, 0}}, + {schema::ActivationType_SWISH, ActivationParams{nvinfer1::ActivationType::kTHRESHOLDED_RELU, false, 0, false, 0}}}; + return action_map.find(activation_type) != action_map.end() + ? std::experimental::optional(action_map[activation_type]) + : std::experimental::nullopt; +} + +void AlignShapeRank(std::vector *in_shape_ptr, const std::vector &out_shape) { + const size_t last_dim = in_shape_ptr->size() - 1; + const int in_rank = in_shape_ptr->size(); + int index = out_shape.size() - 1; + for (; index >= 0; index--) { + if (out_shape[index] == in_shape_ptr->at(last_dim)) { + break; + } + } + const int align_rank = index + 1; + if (index <= 0 || align_rank == in_rank) return; + for (int i = 0; i < index + 1 - in_rank; i++) { + in_shape_ptr->insert(in_shape_ptr->begin(), 1); + } +} + +nvinfer1::ITensor *ConvertTensorWithExpandDims(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor, + const std::vector &expect_shape, const std::string &op_name) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "network is null for ConvertTensorWithExpandDims"; + return nullptr; + } + auto origin_shape = ms_tensor.Shape(); + std::vector convert_shape(expect_shape); + AlignShapeRank(&origin_shape, convert_shape); + size_t origin_index = 0; + for (size_t i = 0; i < convert_shape.size(); ++i) { + if (origin_index >= origin_shape.size()) { + convert_shape[i] = 1; + continue; + } + if (origin_shape[origin_index] != convert_shape[i]) { + convert_shape[i] = origin_shape[origin_index]; + } + origin_index++; + } + if (ms_tensor.ElementNum() != + std::accumulate(convert_shape.begin(), convert_shape.end(), 1, std::multiplies())) { + MS_LOG(ERROR) << "ExpandDims failed for " << op_name; + return nullptr; + } + nvinfer1::Dims dims = ConvertCudaDims(convert_shape); + if (dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name; + return nullptr; + } + nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); + if (ms_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "ConvertTensorWithExpandDims from a MSTensor with nullptr data"; + return nullptr; + } + nvinfer1::Weights weights{data_type, ms_tensor.Data().get(), ms_tensor.ElementNum()}; + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); + return constant_tensor->getOutput(0); +} + +nvinfer1::ITensor *ConvertConstantTensorWithDims(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor, + const std::vector &expect_shape, const std::string &op_name) { + nvinfer1::ITensor *constant_input{nullptr}; + std::string tensor_name = op_name + "_" + ms_tensor.Name(); + if (ms_tensor.Shape().size() == 0 || ms_tensor.ElementNum() == 1) { + constant_input = + lite::ConvertScalarToITensor(ctx, expect_shape.size(), ms_tensor.Data().get(), ms_tensor.DataType(), tensor_name); + if (constant_input == nullptr) { + MS_LOG(ERROR) << "create Itensor from scalar tensor failed: " << tensor_name; + return nullptr; + } + } else if (ms_tensor.Shape().size() == expect_shape.size()) { + constant_input = lite::ConvertConstantTensor(ctx, ms_tensor, tensor_name); + if 
(constant_input == nullptr) { + MS_LOG(ERROR) << "create Itensor from constant tensor failed: " << tensor_name; + return nullptr; + } + } else if (ms_tensor.ElementNum() >= 1) { + constant_input = ConvertTensorWithExpandDims(ctx, ms_tensor, expect_shape, tensor_name); + if (constant_input == nullptr) { + MS_LOG(ERROR) << "create Itensor from ConvertTensorWithExpandDims failed: " << tensor_name; + return nullptr; + } + } else { + MS_LOG(ERROR) << "const tensor value needs check: " << tensor_name; + } + return constant_input; +} + +nvinfer1::Weights TransposeWeight4D(const mindspore::MSTensor &ms_tensor, void **pack_weight) { + // usage notice: malloc addr saved to pack_weight, save pack_weight ptr and free it when deconstruct + nvinfer1::Weights weights{}; + weights.count = ms_tensor.ElementNum(); + auto weight_shape = ms_tensor.Shape(); + if (weight_shape.size() != DIMENSION_4D) { + MS_LOG(ERROR) << ms_tensor.Name() << " dims is " << weight_shape.size(); + return weights; + } + if (ms_tensor.Data() == nullptr) { + MS_LOG(ERROR) << ms_tensor.Name() << " has null data"; + return weights; + } + void *pack_weight_tmp = malloc(ms_tensor.DataSize()); + if (pack_weight_tmp == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return weights; + } + *pack_weight = pack_weight_tmp; + weights.values = pack_weight_tmp; + + switch (ms_tensor.DataType()) { + case DataType::kNumberTypeFloat16: { + weights.type = nvinfer1::DataType::kHALF; + PackNHWCToNCHWFp16(ms_tensor.Data().get(), pack_weight_tmp, weight_shape[0], weight_shape[1] * weight_shape[2], + weight_shape[3], 0, 0); + break; + } + case DataType::kNumberTypeFloat32: { + weights.type = nvinfer1::DataType::kFLOAT; + PackNHWCToNCHWFp32(ms_tensor.Data().get(), pack_weight_tmp, weight_shape[0], weight_shape[1] * weight_shape[2], + weight_shape[3], 0, 0); + break; + } + default: { + MS_LOG(ERROR) << ms_tensor.Name() << " has unsupported tensor datatype for transpose data : " + << static_cast(ms_tensor.DataType()); + } + } + return weights; +} + +nvinfer1::Weights TransposeWeight2D(const mindspore::MSTensor &ms_tensor, void **pack_weight) { + // usage notice: malloc addr saved to pack_weight, save pack_weight ptr and free it when deconstruct + nvinfer1::Weights weights{}; + weights.count = ms_tensor.ElementNum(); + auto weight_shape = ms_tensor.Shape(); + if (weight_shape.size() != DIMENSION_2D) { + MS_LOG(ERROR) << ms_tensor.Name() << " dims is " << weight_shape.size(); + return weights; + } + if (ms_tensor.Data() == nullptr) { + MS_LOG(ERROR) << ms_tensor.Name() << " has null data"; + return weights; + } + void *pack_weight_tmp = malloc(ms_tensor.DataSize()); + if (pack_weight_tmp == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return weights; + } + *pack_weight = pack_weight_tmp; + weights.values = pack_weight_tmp; + + int row = weight_shape[0]; + int col = weight_shape[1]; + + switch (ms_tensor.DataType()) { + case DataType::kNumberTypeFloat16: { + weights.type = nvinfer1::DataType::kHALF; + auto src = static_cast(ms_tensor.Data().get()); + auto dst = static_cast(pack_weight_tmp); + for (int r = 0; r < row; ++r) { + for (int c = 0; c < col; ++c) { + dst[c * row + r] = src[r * col + c]; + } + } + break; + } + case DataType::kNumberTypeFloat32: { + weights.type = nvinfer1::DataType::kFLOAT; + auto dst = static_cast(pack_weight_tmp); + auto src = static_cast(ms_tensor.Data().get()); + for (int r = 0; r < row; ++r) { + for (int c = 0; c < col; ++c) { + dst[c * row + r] = src[r * col + c]; + } + } + break; + } + default: { + 
MS_LOG(ERROR) << ms_tensor.Name() << " has unsupported tensor datatype for transpose data : " + << static_cast(ms_tensor.DataType()); + } + } + return weights; +} + +nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor) { + nvinfer1::Weights weights{}; + weights.type = ConvertDataType(ms_tensor.DataType()); + weights.values = ms_tensor.Data().get(); + weights.count = ms_tensor.ElementNum(); + if (weights.values == nullptr) { + MS_LOG(ERROR) << "ConvertWeight from a MSTensor with nullptr data"; + } + return weights; +} + +nvinfer1::ITensor *TRTTensorCast(TensorRTContext *ctx, nvinfer1::ITensor *trt_tensor, nvinfer1::DataType data_type, + const std::string &name) { +#if TRT_VERSION_GE(7, 2) + data_type == nvinfer1::DataType::kBOOL ? nvinfer1::DataType::kINT32 : data_type; + auto cast_layer = ctx->network()->addIdentity(*trt_tensor); +#else + auto plugin = std::make_shared(name, trt_tensor->getType(), data_type); + nvinfer1::ITensor *inputTensors[] = {trt_tensor}; + nvinfer1::IPluginV2Layer *cast_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); +#endif + if (cast_layer == nullptr) { + MS_LOG(ERROR) << "create cast layer failed for: " << name; + return nullptr; + } +#if TRT_VERSION_GE(7, 2) + cast_layer->setOutputType(0, data_type); +#endif + cast_layer->setName(name.c_str()); + nvinfer1::ITensor *cast_out = cast_layer->getOutput(0); + cast_out->setName((name + "_output").c_str()); + return cast_out; +} + +int SetCudaDevice(std::shared_ptr device_info_) { + return SetCudaDevice(static_cast(device_info_->GetDeviceID())); +} + +int SetCudaDevice(int device_id) { + int device = 0; + auto ret = cudaGetDevice(&device); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaGetDevice failed, device is untrustable. error code: " << ret; + return RET_ERROR; + } + int set_device_id = device_id; + int deviceCnt = 0; + + ret = cudaGetDeviceCount(&deviceCnt); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaGetDeviceCount failed."; + return RET_ERROR; + } + + if (set_device_id > deviceCnt - 1) { + MS_LOG(ERROR) << "invalid input device id as " << set_device_id << " for current device count " << deviceCnt; + return RET_ERROR; + } + if (device != set_device_id) { + ret = cudaSetDevice(set_device_id); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaSetDevice failed, error code: " << ret; + return RET_ERROR; + } + } + if (cudaGetDevice(&device) != cudaSuccess) { + MS_LOG(ERROR) << "cudaGetDevice failed, device is untrustable."; + return RET_ERROR; + } + MS_LOG(DEBUG) << "cuda is running on device: " << device; + return RET_OK; +} + +Format GetOutputFormat(Format input_format, nvinfer1::Permutation perm) { + if (input_format == Format::NHWC) { + if (perm.order[kNHWC_N] == kNHWC_N && perm.order[kNHWC_H] == kNHWC_C && perm.order[kNHWC_W] == kNHWC_W && + perm.order[kNHWC_C] == kNHWC_H) { + return Format::NCHW; + } + } else if (input_format == Format::NCHW) { + if (perm.order[kNCHW_N] == kNCHW_N && perm.order[kNCHW_C] == kNCHW_H && perm.order[kNCHW_H] == kNCHW_W && + perm.order[kNCHW_W] == kNCHW_C) { + return Format::NHWC; + } + } + MS_LOG(WARNING) << "transpose out format needs to check for " << input_format; + return input_format; +} +int ConvertAxisFromNHWC2NCHW(int nhwc_axis) { + // N0H1W2C3->N0C1H2W3 + if (nhwc_axis > kNHWC_C) { + return nhwc_axis; + } + switch (nhwc_axis) { + case kNHWC_N: + return kNCHW_N; + case kNHWC_H: + return kNCHW_H; + case kNHWC_W: + return kNCHW_W; + case kNHWC_C: + return kNCHW_C; + default: + MS_LOG(ERROR) << "invalid input axis for nhwc: " << 
nhwc_axis; + } + return nhwc_axis; +} + +void PackNHWCToNCHWFp16(const void *src, void *dst, size_t batches, size_t plane, size_t channel, size_t task_id, + size_t thread_count) { + size_t hw8 = plane / C8NUM; + size_t task_start = 0; + size_t task_end = plane; + if (thread_count > 0) { + size_t offset_hw = UP_DIV(hw8, thread_count) * C8NUM; + task_start = offset_hw * task_id; + size_t count = plane - task_start; + if (count == 0) { + return; + } + task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw); + hw8 = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0); + } else { + hw8 *= C8NUM; + } + size_t c8 = channel / C8NUM * C8NUM; + size_t batch = plane * channel; + for (size_t n = 0; n < batches; n++) { + const uint16_t *src_batch = static_cast(src) + n * batch; + uint16_t *dst_batch = static_cast(dst) + n * batch; + size_t hw = task_start; + for (; hw < hw8; hw += C8NUM) { + size_t c = 0; + for (; c < c8; c += C8NUM) { + const uint16_t *src_ptr = src_batch + hw * channel + c; + uint16_t *dst_ptr = dst_batch + c * plane + hw; + for (size_t tr = 0; tr < C8NUM; tr++) { + for (size_t tc = 0; tc < C8NUM; tc++) { + dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc]; + } + } + } + for (; c < channel; c++) { + const uint16_t *src_ptr = src_batch + hw * channel + c; + uint16_t *dst_ptr = dst_batch + c * plane + hw; + for (size_t i = 0; i < C8NUM; i++) { + dst_ptr[i] = src_ptr[i * channel]; + } + } + } + for (; hw < task_end; hw++) { + const uint16_t *src_ptr = src_batch + hw * channel; + uint16_t *dst_ptr = dst_batch + hw; + for (size_t i = 0; i < channel; i++) { + dst_ptr[i * plane] = src_ptr[i]; + } + } + } +} +std::string GetTensorFormat(nvinfer1::ITensor *trt_tensor, mindspore::Format format, bool is_same) { + nvinfer1::Dims dims = trt_tensor->getDimensions(); + std::string is_same_string = is_same ? " is same with ms tensor " : " is different from ms tensor "; + std::string out_string = "tensor " + std::string(trt_tensor->getName()) + ": format (NHWC:1, NCHW:0) is " + + std::to_string(static_cast(format)) + is_same_string + ", dims is "; + std::string dim_string = "["; + for (int i = 0; i < dims.nbDims; i++) { + dim_string += std::to_string(dims.d[i]); + if (i != dims.nbDims - 1) { + dim_string += ", "; + } + } + dim_string += "]"; + out_string += dim_string; + return out_string; +} + +std::string GetTensorFormat(ITensorHelper tensor_helper) { + return GetTensorFormat(tensor_helper.trt_tensor_, tensor_helper.format_, tensor_helper.same_format_); +} + +std::string GetTensorFormat(nvinfer1::ITensor *trt_tensor) { return GetTensorFormat(trt_tensor, Format::NHWC, true); } + +std::experimental::optional TryConvertTRTReduceMode(schema::ReduceMode mode) { + std::map reduce_ops_ = { + {schema::ReduceMode::ReduceMode_ReduceMean, nvinfer1::ReduceOperation::kAVG}, + {schema::ReduceMode::ReduceMode_ReduceMax, nvinfer1::ReduceOperation::kMAX}, + {schema::ReduceMode::ReduceMode_ReduceMin, nvinfer1::ReduceOperation::kMIN}, + {schema::ReduceMode::ReduceMode_ReduceProd, nvinfer1::ReduceOperation::kPROD}, + {schema::ReduceMode::ReduceMode_ReduceL2, nvinfer1::ReduceOperation::kSUM}, + {schema::ReduceMode::ReduceMode_ReduceSum, nvinfer1::ReduceOperation::kSUM}, + }; + return reduce_ops_.find(mode) != reduce_ops_.end() + ? 
+           ? std::experimental::optional<nvinfer1::ReduceOperation>(reduce_ops_[mode])
+           : std::experimental::nullopt;
+}
+int PreprocessInputs2SameDim(TensorRTContext *ctx, const ITensorHelper &input_tensor_helper,
+                             ITensorHelper *out_tensor_helper) {
+  out_tensor_helper->trt_tensor_ = input_tensor_helper.trt_tensor_;
+  out_tensor_helper->format_ = input_tensor_helper.format_;
+  out_tensor_helper->same_format_ = true;
+  if (input_tensor_helper.trt_tensor_->getDimensions().nbDims == DIMENSION_4D && !input_tensor_helper.same_format_) {
+    if (input_tensor_helper.format_ == Format::NCHW) {
+      // transpose: NCHW->NHWC
+      nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(ctx, *input_tensor_helper.trt_tensor_);
+      if (transpose_layer_in == nullptr) {
+        MS_LOG(ERROR) << "op action convert failed";
+        return RET_ERROR;
+      }
+      transpose_layer_in->setName(
+        (std::string(input_tensor_helper.trt_tensor_->getName()) + "_input_transpose2NHWC").c_str());
+      out_tensor_helper->trt_tensor_ = transpose_layer_in->getOutput(0);
+      out_tensor_helper->format_ = Format::NHWC;
+    } else {
+      // transpose: NHWC->NCHW
+      nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *input_tensor_helper.trt_tensor_);
+      if (transpose_layer_in == nullptr) {
+        MS_LOG(ERROR) << "op action convert failed";
+        return RET_ERROR;
+      }
+      transpose_layer_in->setName(
+        (std::string(input_tensor_helper.trt_tensor_->getName()) + "_input_transpose2NCHW").c_str());
+      out_tensor_helper->trt_tensor_ = transpose_layer_in->getOutput(0);
+      out_tensor_helper->format_ = Format::NCHW;
+    }
+  }
+  return RET_OK;
+}
+
+int GetDimsVolume(const nvinfer1::Dims &dims) {
+  if (dims.nbDims <= 0) {
+    return 0;
+  }
+  return std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int64_t>());
+}
+
+int GetDimsVolume(const std::vector<int64_t> &shape) {
+  if (shape.size() == 0) {
+    return 0;
+  }
+  return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
+}
+
+std::experimental::optional<nvinfer1::Dims> SqueezeDims(const nvinfer1::Dims &in_dims, int pos) {
+  if (in_dims.nbDims <= 1) {
+    MS_LOG(ERROR) << "invalid shape size: " << in_dims.nbDims << " for squeeze.";
+    return {};
+  }
+  nvinfer1::Dims out_dims;
+  int i = 0;
+  for (int j = 0; j <= in_dims.nbDims; ++j) {
+    if (j != pos) {
+      out_dims.d[i++] = in_dims.d[j];
+    }
+  }
+  out_dims.nbDims = in_dims.nbDims - 1;
+  return std::experimental::optional<nvinfer1::Dims>(out_dims);
+}
+
+std::experimental::optional<nvinfer1::Dims> UnsqueezeDims(const nvinfer1::Dims &in_dims, int pos, int val) {
+  if (in_dims.nbDims >= static_cast<int>(in_dims.MAX_DIMS)) {
+    MS_LOG(ERROR) << "invalid shape size: " << in_dims.nbDims << " for unsqueeze.";
+    return {};
+  }
+  nvinfer1::Dims out_dims;
+  int i = 0;
+  for (int j = 0; j <= in_dims.nbDims; ++j) {
+    if (j == pos) {
+      out_dims.d[j] = val;
+    } else {
+      out_dims.d[j] = in_dims.d[i++];
+    }
+  }
+  out_dims.nbDims = in_dims.nbDims + 1;
+  return std::experimental::optional<nvinfer1::Dims>(out_dims);
+}
+
+int ParseData2Vector(const mindspore::MSTensor &ms_tensor, std::vector<float> *dst) {
+  if (ms_tensor.Data() == nullptr) {
+    MS_LOG(ERROR) << "ignore tensor: " << ms_tensor.Name();
+    return RET_ERROR;
+  }
+  dst->clear();
+  dst->resize(ms_tensor.ElementNum());
+  switch (ms_tensor.DataType()) {
+    case DataType::kNumberTypeInt64: {
+      Data2Vector<int64_t>(dst, ms_tensor.Data().get());
+      break;
+    }
+    case DataType::kNumberTypeInt32: {
+      Data2Vector<int>(dst, ms_tensor.Data().get());
+      break;
+    }
+    default: {
+      MS_LOG(ERROR) << ms_tensor.Name() << " has more datatype to parse";
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
+nvinfer1::ITensor *Reshape(TensorRTContext *ctx, nvinfer1::ITensor *input, const std::vector<int64_t> &shape) {
+  return Reshape(ctx, input, ConvertCudaDims(shape));
+}
+
+nvinfer1::ITensor *Reshape(TensorRTContext *ctx, nvinfer1::ITensor *input, const nvinfer1::Dims &shape) {
+  auto reshape_layer = ctx->network()->addShuffle(*input);
+  if (reshape_layer == nullptr) {
+    MS_LOG(ERROR) << "add reshape_layer failed";
+    return nullptr;
+  }
+  reshape_layer->setReshapeDimensions(shape);
+  return reshape_layer->getOutput(0);
+}
+} // namespace mindspore::lite
diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.h
new file mode 100644
index 00000000000..ad2bd3d3889
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.h
@@ -0,0 +1,184 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_UTILS_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_UTILS_H_
+#include
+#include
+#include
+#include
+#include
+#include
+#include "src/runtime/delegate/tensorrt/tensorrt_context.h"
+#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h"
+#include "mindspore/core/ir/dtype/type_id.h"
+#include "schema/ops_generated.h"
+#include "nnacl/pack.h"
+#include "include/api/context.h"
+
+#define kNCHW_N 0
+#define kNCHW_C 1
+#define kNCHW_H 2
+#define kNCHW_W 3
+#define kNHWC_N 0
+#define kNHWC_H 1
+#define kNHWC_W 2
+#define kNHWC_C 3
+
+namespace mindspore::lite {
+#define TRT_VERSION_GE(major, minor) \
+  (NV_TENSORRT_MAJOR > major) || ((NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR >= minor))
+#define TRT_VERSION_LS(major, minor) \
+  (NV_TENSORRT_MAJOR < major) || ((NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR < minor))
+struct ITensorHelper {
+  nvinfer1::ITensor *trt_tensor_{nullptr};
+  mindspore::Format format_{Format::NHWC};
+  bool same_format_{true};
+};
+struct ActivationParams {
+  nvinfer1::ActivationType activation_type;
+  bool has_alpha;
+  float alpha;
+  bool has_beta;
+  float beta;
+};
+
+typedef union float32_bits {
+  unsigned int u;
+  float f;
+} float32_bits;
+
+// Convert Tensor data to Cuda dims.
+nvinfer1::Dims ConvertCudaDims(const void *data, int64_t size);
+
+nvinfer1::Dims ConvertCudaDims(int data, size_t size);
+
+bool SameDims(nvinfer1::Dims dims, const std::vector<int64_t> &shape);
+
+std::vector<int64_t> ConvertMSShape(const nvinfer1::Dims dims);
+
+std::vector<int64_t> NHWC2NCHW(std::vector<int64_t> nhwc_shape);
+
+nvinfer1::DataType ConvertDataType(DataType type_id);
+
+cudaDataType ConvertDataType(nvinfer1::DataType type_id);
+
+nvinfer1::IShuffleLayer *NHWC2NCHW(TensorRTContext *ctx, const nvinfer1::ITensor &input);
+
+nvinfer1::IShuffleLayer *NCHW2NHWC(TensorRTContext *ctx, const nvinfer1::ITensor &input);
+
+std::experimental::optional<ActivationParams> TryConvertActivationType(schema::ActivationType activation_type);
+
+nvinfer1::ITensor *ConvertConstantTensor(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor,
+                                         const std::string &op_name);
+
+nvinfer1::ITensor *ConvertTensorWithExpandDims(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor,
+                                               const std::vector<int64_t> &expect_shape, const std::string &op_name);
+
+nvinfer1::ITensor *ConvertScalarToITensor(TensorRTContext *ctx, size_t shape_size, const void *value,
+                                          const DataType data_type, const std::string &op_name);
+
+nvinfer1::ITensor *ConvertConstantTensorWithDims(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor,
+                                                 const std::vector<int64_t> &expect_shape, const std::string &op_name);
+
+nvinfer1::Weights TransposeWeight4D(const mindspore::MSTensor &ms_tensor, void **pack_weight);
+
+nvinfer1::Weights TransposeWeight2D(const mindspore::MSTensor &ms_tensor, void **pack_weight);
+
+nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor);
+
+nvinfer1::ITensor *TRTTensorCast(TensorRTContext *ctx, nvinfer1::ITensor *tensor, nvinfer1::DataType data_type,
+                                 const std::string &name);
+
+int SetCudaDevice(std::shared_ptr<GPUDeviceInfo> device_info_);
+
+int SetCudaDevice(int device_id);
+
+Format GetOutputFormat(Format input_format, nvinfer1::Permutation perm);
+
+int ConvertAxisFromNHWC2NCHW(int nhwc_axis);
+
+void PackNHWCToNCHWFp16(const void *src, void *dst, size_t batch, size_t plane, size_t channel, size_t task_id,
+                        size_t thread_count);
+
+std::string GetTensorFormat(nvinfer1::ITensor *trt_tensor, mindspore::Format format, bool is_same);
+
+std::string GetTensorFormat(ITensorHelper tensor_helper);
+
+std::string GetTensorFormat(nvinfer1::ITensor *trt_tensors);
+
+std::experimental::optional<nvinfer1::ReduceOperation> TryConvertTRTReduceMode(schema::ReduceMode mode);
+
+int PreprocessInputs2SameDim(TensorRTContext *ctx, const ITensorHelper &input_tensor_helper,
+                             ITensorHelper *out_tensor_helper);
+
+int GetDimsVolume(const nvinfer1::Dims &dims);
+
+int GetDimsVolume(const std::vector<int64_t> &shape);
+
+std::experimental::optional<nvinfer1::Dims> SqueezeDims(const nvinfer1::Dims &in_dims, int pos);
+
+std::experimental::optional<nvinfer1::Dims> UnsqueezeDims(const nvinfer1::Dims &in_dims, int pos, int val);
+
+nvinfer1::ITensor *Reshape(TensorRTContext *ctx, nvinfer1::ITensor *input, const std::vector<int64_t> &shape);
+
+nvinfer1::ITensor *Reshape(TensorRTContext *ctx, nvinfer1::ITensor *input, const nvinfer1::Dims &shape);
+
+int ParseData2Vector(const mindspore::MSTensor &ms_tensor, std::vector<float> *dst);
+
+template <typename T>
+bool SameDims(const std::vector<T> &shape1, const std::vector<T> &shape2) {
+  if (shape1.size() != shape2.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < shape1.size(); i++) {
+    if (std::abs(shape1[i] - shape2[i]) > 1e-6) {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+nvinfer1::Dims ConvertCudaDims(const std::vector<T> &shape) {
+  nvinfer1::Dims dims{};
+  dims.nbDims = -1;
+  if (!shape.empty() && shape.size() <= static_cast<size_t>(dims.MAX_DIMS)) {
+    dims.nbDims = shape.size();
+    for (int i = 0; i < dims.nbDims; i++) {
+      dims.d[i] = static_cast<int>(shape[i]);
+    }
+  } else {
+    MS_LOG(WARNING) << "ms shape is invalid or empty.";
+  }
+  return dims;
+}
+
+inline size_t IntToSize(int u) {
+  if (u < 0) {
+    MS_LOG(WARNING) << "The int value(" << u << ") is less than 0.";
+    return SIZE_MAX;
+  }
+  return static_cast<size_t>(u);
+}
+template <typename T>
+void Data2Vector(std::vector<float> *dst, const void *src) {
+  auto src_ptr = static_cast<const T *>(src);
+  for (int i = 0; i < dst->size(); i++) {
+    dst->at(i) = static_cast<float>(src_ptr[i]);
+  }
+}
+} // namespace mindspore::lite
+#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_
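
For reviewers, a minimal usage sketch of the shape helpers declared in tensorrt_utils.h follows. It is illustrative only and not part of the patch; it assumes a TensorRT build environment where the header above is on the include path, and ShapeUtilsExample is a hypothetical function introduced purely for this example.

// Illustrative sketch only (not part of the patch). Function names come from
// tensorrt_utils.h above; ShapeUtilsExample itself is hypothetical.
#include <cstdint>
#include <vector>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"

namespace mindspore::lite {
void ShapeUtilsExample() {
  // Build a 4-D nvinfer1::Dims from an int64 shape vector {1, 224, 224, 3}.
  std::vector<int64_t> shape = {1, 224, 224, 3};
  nvinfer1::Dims dims = ConvertCudaDims(shape);

  // Total element count: 1 * 224 * 224 * 3 = 150528.
  int volume = GetDimsVolume(dims);
  (void)volume;

  // Drop the batch dimension at position 0, then re-insert it with size 1.
  auto squeezed = SqueezeDims(dims, 0);  // {224, 224, 3} on success
  if (squeezed) {
    auto unsqueezed = UnsqueezeDims(squeezed.value(), 0, 1);  // back to {1, 224, 224, 3}
    (void)unsqueezed;
  }
}
}  // namespace mindspore::lite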