!37251 [MS][LITE][STABLE]copy tensorRT impl to src/runtime

Merge pull request !37251 from chenjianping/master_dev1
This commit is contained in:
i-robot 2022-07-05 05:46:26 +00:00 committed by Gitee
commit e67997ff67
GPG Key ID: 173E9B9CA92EEF8F
133 changed files with 14041 additions and 2 deletions

View File

@ -498,14 +498,14 @@ if(SUPPORT_TENSORRT)
set(CUDA_LIB_PATH ${CUDA_PATH}/lib64)
include_directories(${TENSORRT_PATH}/include)
include_directories(${CUDA_PATH}/include)
add_subdirectory(extendrt/delegate/tensorrt)
add_subdirectory(runtime/delegate/tensorrt)
endif()
target_link_libraries(mindspore-lite tensorrt_kernel_mid cuda_kernel_mid gpu_distribution_collective)
target_link_libraries(mindspore-lite_static tensorrt_kernel_mid cuda_kernel_mid gpu_distribution_collective)
else()
if(NOT MSLITE_ENABLE_CLOUD_FUSION_INFERENCE)
set(TENSORRT_STUB
${CMAKE_CURRENT_SOURCE_DIR}/extendrt/delegate/tensorrt/distribution/distribution_base.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/delegate/tensorrt/distribution/distribution_base.cc
)
add_library(tensorrt_stub OBJECT ${TENSORRT_STUB})
endif()

View File

@ -381,6 +381,10 @@ int TensorRTSubGraph::Prepare() {
return RET_ERROR;
}
int binding_num = this->engine_->getNbBindings();
if (binding_num < 0) {
MS_LOG(ERROR) << "invalid binding_num " << binding_num;
return RET_ERROR;
}
tensor_bindings_ = new (std::nothrow) void *[binding_num];
if (tensor_bindings_ == nullptr) {
MS_LOG(ERROR) << "malloc tensor binding array failed.";

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_
#include <vector>
#include "include/api/status.h"
namespace mindspore {
namespace cache {
struct CacheNoe {
CacheNoe(int _index, int _frequency, int _value) : key(_index), frequency(_frequency), value(_value) {}
int key; // host input index
int frequency;
int value; // cache index
};
class CacheAlgorithm {
public:
virtual ~CacheAlgorithm() {}
virtual int Get(int key) = 0;
virtual void Put(int key, int value) = 0;
virtual Status Init(size_t cache_size, int min_host_index, int max_host_index) = 0;
virtual Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index,
std::vector<int> *need_swap_indies, std::vector<int> *need_swap_indies_cache_index) = 0;
};
} // namespace cache
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_

View File

@ -0,0 +1,41 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_
#include <utility>
#include <memory>
namespace mindspore {
namespace cache {
class CacheMemBase {
public:
CacheMemBase() = default;
virtual ~CacheMemBase() = default;
virtual bool InitDevice(uint32_t device_id, const void *context) = 0;
virtual void *MallocMemory(size_t size) = 0;
virtual void FreeMemory(void *buf) = 0;
virtual bool SynchronizeStream() = 0;
virtual bool CopyHostMemToDevice(void *dst, const void *src, size_t size) = 0;
virtual bool CopyDeviceMemToHost(void *dst, const void *src, size_t size) = 0;
virtual bool HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr,
size_t cache_vocab_size, size_t embedding_size, size_t swap_out_size) = 0;
virtual bool HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr,
size_t cache_vocab_size, size_t embedding_size, size_t swap_in_size) = 0;
};
} // namespace cache
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_

View File

@ -0,0 +1,237 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/parameter_cache/embedding_cache.h"
#include <cuda_runtime.h>
#include <memory>
#include <vector>
#include <cmath>
#include <cstring>
#include <string>
#include "src/common/log_adapter.h"
#include "include/errorcode.h"
#include "src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h"
#include "src/runtime/delegate/parameter_cache/lfu_cache.h"
#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h"
namespace {
constexpr size_t kEmbeddingTensorShapeSize = 2;
}
namespace mindspore {
namespace cache {
void LookUpTableTask(size_t indices_lens, size_t first_dim_size, const char *input_addr, const int *indices_addr,
char *output_addr, size_t embedding_len, int min_host_index) {
for (size_t i = 0; i < indices_lens; ++i) {
int index = indices_addr[i] - min_host_index;
if (index >= 0 && index < static_cast<int>(first_dim_size)) {
size_t pos = index * embedding_len;
std::memcpy(output_addr, input_addr + pos, embedding_len);
} else {
memset(output_addr, 0, embedding_len);
}
output_addr += embedding_len;
}
}
EmbeddingCache::~EmbeddingCache() {
if (hash_swap_value_device_addr_ != nullptr) {
device_cache_->FreeMemory(hash_swap_value_device_addr_);
hash_swap_value_device_addr_ = nullptr;
}
if (hash_swap_value_addr_ != nullptr) {
free(hash_swap_value_addr_);
hash_swap_value_addr_ = nullptr;
}
if (hash_swap_index_addr_ != nullptr) {
device_cache_->FreeMemory(hash_swap_index_addr_);
hash_swap_index_addr_ = nullptr;
}
}
Status EmbeddingCache::Init(mindspore::MSTensor host_cache_tensor, mindspore::MSTensor device_tensor) {
MS_ASSERT(device_tensor.Shape().size() == kEmbeddingTensorShapeSize);
MS_ASSERT(host_cache_tensor.Shape().size() == kEmbeddingTensorShapeSize);
MS_ASSERT(device_tensor.DataType() == host_cache_tensor.DataType());
MS_ASSERT(host_cache_tensor.Data() != nullptr);
if (device_tensor.Shape()[1] != host_cache_tensor.Shape()[1]) {
MS_LOG(ERROR) << device_tensor.Name() << " embedding_size is invalid, device size is " << device_tensor.Shape()[1]
<< ", host size is " << host_cache_tensor.Shape()[1];
return kLiteError;
}
if (host_cache_size_ != host_cache_tensor.Shape()[0]) {
MS_LOG(ERROR) << device_tensor.Name() << " host_cache_size is invalid, host_cache_size:"
<< host_cache_tensor.Shape()[0] << ", index begin:" << min_host_index_
<< ", index end:" << max_host_index_ << ", rank_group_size_ num:" << rank_group_size_
<< ", rank id:" << rank_id_ << ", vocab_size_:" << vocab_size_;
return kLiteError;
}
data_type_ = device_tensor.DataType();
switch (data_type_) {
case DataType::kNumberTypeFloat32:
sizeof_data_type_ = sizeof(float);
break;
default:
MS_LOG(ERROR) << device_tensor.Name() << " unsupported data type " << static_cast<int>(data_type_);
return kLiteError;
}
host_addr_ = host_cache_tensor.MutableData();
embedding_size_ = device_tensor.Shape()[1];
device_start_index_ = device_cache_size_ * rank_id_;
// host cache tensor is device tensor
if (device_tensor.Shape()[0] == host_cache_tensor.Shape()[0]) {
device_start_index_ = min_host_index_;
}
return kSuccess;
}
Status EmbeddingCache::MallocCacheMemory() {
auto hash_swap_value_size = embedding_size_ * batch_elements_ * sizeof_data_type_;
hash_swap_value_device_addr_ = device_cache_->MallocMemory(hash_swap_value_size);
if (hash_swap_value_device_addr_ == nullptr) {
MS_LOG(ERROR) << "malloc hash_swap_value_device failed, malloc size " << hash_swap_value_size;
return kLiteMemoryFailed;
}
hash_swap_value_addr_ = malloc(hash_swap_value_size);
if (hash_swap_value_addr_ == nullptr) {
MS_LOG(ERROR) << "malloc hash_swap_value failed, malloc size " << hash_swap_value_size;
return kLiteMemoryFailed;
}
// data type of index
hash_swap_index_addr_ = static_cast<int *>(device_cache_->MallocMemory(batch_elements_ * sizeof(int)));
if (hash_swap_index_addr_ == nullptr) {
MS_LOG(ERROR) << "malloc hash_swap_index failed, malloc size " << batch_elements_ * sizeof(int);
return kLiteMemoryFailed;
}
return kSuccess;
}
Status EmbeddingCache::Init(uint32_t device_id, const void *context, mindspore::MSTensor host_cache_tensor,
mindspore::MSTensor device_tensor) {
auto ret = Init(host_cache_tensor, device_tensor);
if (ret != kSuccess) {
return ret;
}
cache_ = lite::FactoryManagerBase<std::string, cache::CacheAlgorithm>::Instance().GetProduct("lfu");
if (cache_ == nullptr) {
MS_LOG(ERROR) << "malloc LFUCacheAlgorithm failed";
return kLiteMemoryFailed;
}
ret = cache_->Init(device_cache_size_, min_host_index_, max_host_index_);
if (ret != kSuccess) {
MS_LOG(ERROR) << "init cache failed, " << ret.ToString();
return kLiteError;
}
device_cache_ = lite::FactoryManagerBase<std::string, cache::CacheMemBase>::Instance().GetProduct("gpu");
if (device_cache_ == nullptr) {
MS_LOG(ERROR) << "get cache failed";
return kLiteMemoryFailed;
}
if (!device_cache_->InitDevice(device_id, context)) {
MS_LOG(ERROR) << "init device failed";
return kLiteError;
}
ret = MallocCacheMemory();
if (ret != kSuccess) {
return ret;
}
MS_LOG(INFO) << "init succ, rank_group_size_ num:" << rank_group_size_ << ", rank id:" << rank_id_
<< ", vocab_size_:" << vocab_size_ << ", host_cache_size_:" << host_cache_size_
<< ", device_cache_size_:" << device_cache_size_ << ", embedding_size_:" << embedding_size_
<< ", batch_elements_:" << batch_elements_ << ", index begin:" << min_host_index_
<< ", index end:" << max_host_index_;
return kSuccess;
}
Status EmbeddingCache::SetHostCacheAddr(void *addr, size_t size) {
if (sizeof_data_type_ * host_cache_size_ * embedding_size_ != size) {
return kLiteParamInvalid;
}
host_addr_ = addr;
// copy part of host mem to device
auto ret =
device_cache_->CopyHostMemToDevice(device_addr_, addr, sizeof_data_type_ * device_cache_size_ * embedding_size_);
if (!ret) {
MS_LOG(ERROR) << "CopyHostMemToDevice failed, copy size "
<< sizeof_data_type_ * device_cache_size_ * embedding_size_;
return kLiteMemoryFailed;
}
// init cache
auto index_num = device_cache_size_;
for (size_t i = 0; i < index_num; i++) {
cache_->Put(min_host_index_ + i, i);
}
return kSuccess;
}
Status EmbeddingCache::SetDeviceCacheAddr(void *device_mem_addr, size_t size) {
if (sizeof_data_type_ * device_cache_size_ * embedding_size_ != size) {
return kLiteParamInvalid;
}
device_addr_ = device_mem_addr;
SetHostCacheAddr(host_addr_, sizeof_data_type_ * host_cache_size_ * embedding_size_);
return kSuccess;
}
Status EmbeddingCache::CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index) {
std::vector<int> need_swap_indies;
std::vector<int> need_swap_indies_cache_index;
auto ret =
cache_->CheckCacheHit(batch_ids, batch_ids_len, cache_index, &need_swap_indies, &need_swap_indies_cache_index);
if (ret != kSuccess) {
MS_LOG(ERROR) << "CheckCacheHit failed";
return ret;
}
auto swap_indices_size = need_swap_indies.size();
if (swap_indices_size > 0) {
LookUpTableTask(swap_indices_size, host_cache_size_, static_cast<char *>(host_addr_), need_swap_indies.data(),
static_cast<char *>(hash_swap_value_addr_), embedding_size_ * sizeof_data_type_, min_host_index_);
auto device_cache_ret = device_cache_->CopyHostMemToDevice(hash_swap_value_device_addr_, hash_swap_value_addr_,
swap_indices_size * embedding_size_ * sizeof_data_type_);
if (!device_cache_ret) {
MS_LOG(ERROR) << "copy swap value to device failed";
return kLiteMemoryFailed;
}
device_cache_ret = device_cache_->CopyHostMemToDevice(hash_swap_index_addr_, need_swap_indies_cache_index.data(),
swap_indices_size * sizeof(int));
if (!device_cache_ret) {
MS_LOG(ERROR) << "copy swap indies to device failed";
return kLiteMemoryFailed;
}
device_cache_ret = device_cache_->HashSwapIn(device_addr_, hash_swap_value_device_addr_, hash_swap_index_addr_,
device_cache_size_, embedding_size_, swap_indices_size);
if (!device_cache_ret) {
MS_LOG(ERROR) << "HashSwapIn failed";
return kLiteMemoryFailed;
}
}
return kSuccess;
}
} // namespace cache
} // namespace mindspore

View File

@ -0,0 +1,89 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_
#include <cmath>
#include <algorithm>
#include <memory>
#include "include/api/status.h"
#include "include/api/data_type.h"
#include "src/common/log_adapter.h"
#include "src/runtime/delegate/parameter_cache/cache_algorithm.h"
#include "src/runtime/delegate/parameter_cache/cache_mem_base.h"
namespace mindspore {
namespace cache {
class EmbeddingCache {
public:
EmbeddingCache(size_t vocab_size, size_t device_cache_size, size_t batch_elements, int rank_id, int rank_group_size)
: vocab_size_(vocab_size),
device_cache_size_(device_cache_size),
batch_elements_(batch_elements),
rank_id_(rank_id),
rank_group_size_(rank_group_size) {
MS_ASSERT(rank_group_size_ != 0);
auto local_shard_size = static_cast<int>(std::ceil(static_cast<float>(vocab_size_) / rank_group_size_));
min_host_index_ = local_shard_size * rank_id_;
max_host_index_ = std::min(min_host_index_ + local_shard_size, static_cast<int>(vocab_size_));
host_cache_size_ = max_host_index_ - min_host_index_;
MS_LOG(INFO) << "rank_group_size_ num:" << rank_group_size_ << ", rank id:" << rank_id_
<< ", vocab_size_:" << vocab_size_ << ", host_cache_size_:" << host_cache_size_
<< ", index begin:" << min_host_index_ << ", index end:" << max_host_index_;
}
~EmbeddingCache();
Status Init(uint32_t device_id, const void *context, mindspore::MSTensor host_cache_tensor,
mindspore::MSTensor device_tensor);
Status SetHostCacheAddr(void *addr, size_t size);
Status SetDeviceCacheAddr(void *host_mem_addr, size_t size);
Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *hash_index);
size_t GetDeviceStartIndex() { return device_start_index_; }
private:
Status Init(mindspore::MSTensor host_cache_tensor, mindspore::MSTensor device_tensor);
Status MallocCacheMemory();
private:
std::shared_ptr<cache::CacheMemBase> device_cache_{nullptr};
std::shared_ptr<CacheAlgorithm> cache_{nullptr};
size_t vocab_size_{0}; // total size
size_t host_cache_size_{0}; // local host size
size_t device_cache_size_{0}; // local device cache size
size_t device_start_index_{0};
size_t embedding_size_{0};
size_t batch_elements_{0};
DataType data_type_{DataType::kNumberTypeFloat32};
size_t sizeof_data_type_{0};
void *device_addr_{nullptr}; // hash_info.device_address.addr
void *host_addr_{nullptr};
int *hash_swap_index_addr_{nullptr}; // embedding_device_cache_->hash_swap_index_addr_
void *hash_swap_value_addr_{nullptr};
void *hash_swap_value_device_addr_{nullptr};
int rank_id_;
int rank_group_size_;
int min_host_index_{0};
int max_host_index_{0};
};
} // namespace cache
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_
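
For reference, a minimal standalone sketch of the shard arithmetic performed in the EmbeddingCache constructor above; the helper name and the sample sizes are illustrative only and are not part of this patch.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Hypothetical helper mirroring the constructor's shard math: each rank owns a
// contiguous [min_host_index, max_host_index) slice of the full vocabulary.
void PrintHostShard(size_t vocab_size, int rank_id, int rank_group_size) {
  auto local_shard_size = static_cast<int>(std::ceil(static_cast<float>(vocab_size) / rank_group_size));
  int min_host_index = local_shard_size * rank_id;
  int max_host_index = std::min(min_host_index + local_shard_size, static_cast<int>(vocab_size));
  std::printf("rank %d owns [%d, %d), host_cache_size=%d\n", rank_id, min_host_index, max_host_index,
              max_host_index - min_host_index);
}

int main() {
  // vocab_size = 10 split across 3 ranks -> ceil(10/3) = 4: [0,4), [4,8), [8,10)
  for (int rank = 0; rank < 3; ++rank) {
    PrintHostShard(10, rank, 3);
  }
  return 0;
}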

View File

@ -0,0 +1,194 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/parameter_cache/embedding_cache_manager.h"
#include <cuda_runtime.h>
#include <cmath>
#include <cstring>
#include "src/common/log_adapter.h"
#include "include/errorcode.h"
namespace {
constexpr size_t kGatherInputsSize = 3;
}
namespace mindspore {
namespace cache {
Status EmbeddingCacheManager::Init(const std::string &cache_model_path, size_t vocab_size, size_t device_cache_size) {
if (cache_model_path.empty() || vocab_size == 0 || device_cache_size >= vocab_size) {
MS_LOG(INFO) << "no cache model, vocab_size " << vocab_size << ", device_cache_size " << device_cache_size;
return kSuccess;
}
host_cache_model_ = std::make_shared<HostCacheModel>();
if (host_cache_model_ == nullptr) {
MS_LOG(ERROR) << "HostCacheModel malloc failed";
return kLiteMemoryFailed;
}
auto ret = host_cache_model_->LoadCache(cache_model_path);
if (ret != kSuccess) {
MS_LOG(ERROR) << "load cache failed";
return ret;
}
vocab_size_ = vocab_size;
device_cache_size_ = device_cache_size;
MS_LOG(INFO) << "cache manager init succ, cache model " << cache_model_path << ", vocab_size " << vocab_size
<< ", device_cache_size " << device_cache_size;
return ret;
}
Status EmbeddingCacheManager::Init(DelegateModel<schema::Primitive> *model, size_t vocab_size,
size_t device_cache_size) {
if (model == nullptr || vocab_size == 0 || device_cache_size >= vocab_size) {
MS_LOG(INFO) << "no cache model, vocab_size " << vocab_size << ", device_cache_size " << device_cache_size;
return kSuccess;
}
host_cache_model_ = std::make_shared<HostCacheModel>();
if (host_cache_model_ == nullptr) {
MS_LOG(ERROR) << "HostCacheModel malloc failed";
return kLiteMemoryFailed;
}
auto ret = host_cache_model_->LoadCache(model);
if (ret != kSuccess) {
MS_LOG(ERROR) << "load cache failed";
return ret;
}
vocab_size_ = vocab_size;
device_cache_size_ = device_cache_size;
MS_LOG(INFO) << "cache manager init succ, vocab_size " << vocab_size << ", device_cache_size " << device_cache_size;
return ret;
}
bool EmbeddingCacheManager::CheckIsCacheKernel(kernel::Kernel *kernel) {
if (host_cache_model_ == nullptr) {
return false;
}
return host_cache_model_->CheckIsCacheKernel(kernel);
}
Status EmbeddingCacheManager::InitCacheKernel(kernel::Kernel *kernel, uint32_t device_id, const void *context) {
if (host_cache_model_ == nullptr) {
MS_LOG(ERROR) << "cache model is nullptr, kernel " << kernel->name() << " init cache failed";
return kLiteError;
}
auto host_cache_tensor = host_cache_model_->GetHostCacheTensor(kernel);
if (host_cache_tensor == nullptr) {
MS_LOG(ERROR) << kernel->name() << ": invalid cache kernel";
return kLiteError;
}
// only support embedding cache
if (kernel->type() != schema::PrimitiveType_Gather) {
MS_LOG(ERROR) << kernel->name() << " is not embedding kernel";
return kLiteError;
}
MS_ASSERT(kernel->inputs().size() == kGatherInputsSize);
auto device_tensor = kernel->inputs()[0];
size_t batch_elements = kernel->inputs()[1].ElementNum();
auto cache =
std::make_shared<EmbeddingCache>(vocab_size_, device_cache_size_, batch_elements, rank_id_, rank_group_size_);
if (cache == nullptr) {
MS_LOG(ERROR) << kernel->name() << ": malloc EmbeddingCache failed";
return kLiteError;
}
auto ret = cache->Init(device_id, context, host_cache_tensor, device_tensor);
if (ret != kSuccess) {
MS_LOG(ERROR) << kernel->name() << ": EmbeddingCache init failed";
return kLiteError;
}
caches_[device_tensor.Name()] = cache;
MS_LOG(INFO) << kernel->name() << " is cache kernel, input tensor " << kernel->inputs()[1].Name() << ", cache tensor "
<< device_tensor.Name();
return kSuccess;
}
bool EmbeddingCacheManager::IsCacheTensor(mindspore::MSTensor tensor) {
if (host_cache_model_ == nullptr) {
return false;
}
auto cache = caches_.find(tensor.Name());
if (cache != caches_.end()) {
return true;
}
return false;
}
std::vector<int64_t> EmbeddingCacheManager::GetCacheShape(mindspore::MSTensor tensor) {
std::vector<int64_t> shape = tensor.Shape();
if (shape.size() > 0 && IsCacheTensor(tensor)) {
shape[0] = device_cache_size_;
}
return shape;
}
size_t EmbeddingCacheManager::GetCacheDataSize(mindspore::MSTensor tensor) {
auto data_size = tensor.DataSize();
auto &shape = tensor.Shape();
if (shape.size() > 0 && IsCacheTensor(tensor) && shape[0] > 0) {
data_size = data_size * device_cache_size_ / shape[0];
}
return data_size;
}
Status EmbeddingCacheManager::SetDeviceCacheAddr(const std::string &tensor_name, void *device_mem_addr, size_t size) {
auto cache_iter = caches_.find(tensor_name);
if (cache_iter == caches_.end() || cache_iter->second == nullptr) {
MS_LOG(ERROR) << "cache not found, " << tensor_name;
return kLiteError;
}
auto cache = cache_iter->second;
return cache->SetDeviceCacheAddr(device_mem_addr, size);
}
// device_addr is model input device addr
int EmbeddingCacheManager::CacheHandle(const std::string &tensor_name, mindspore::MSTensor model_input_tensor,
void *model_input_device_addr) {
auto cache_iter = caches_.find(tensor_name);
if (cache_iter == caches_.end()) {
MS_LOG(ERROR) << "cache not found, " << tensor_name;
return lite::RET_ERROR;
}
auto cache = cache_iter->second;
hash_indices_.resize(model_input_tensor.ElementNum());
auto ret = cache->CheckCacheHit(static_cast<int *>(model_input_tensor.MutableData()), hash_indices_.size(),
hash_indices_.data());
if (ret != kSuccess) {
MS_LOG(ERROR) << "CheckCacheHit failed, " << model_input_tensor.Name();
return lite::RET_ERROR;
}
for (size_t i = 0; i < hash_indices_.size(); i++) {
if (hash_indices_[i] != -1) {
hash_indices_[i] += cache->GetDeviceStartIndex();
}
}
auto cuda_ret = cudaMemcpy(model_input_device_addr, hash_indices_.data(), hash_indices_.size() * sizeof(int),
cudaMemcpyHostToDevice);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "copy mem failed, " << model_input_tensor.Name();
return lite::RET_ERROR;
}
MS_LOG(INFO) << "cache handle succ, " << model_input_tensor.Name() << "," << tensor_name;
return lite::RET_OK;
}
} // namespace cache
} // namespace mindspore
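
A usage sketch of the manager above from the delegate side, not part of this patch: the function name and the vocab_size/device_cache_size values are assumptions for illustration, and model, device_id, and stream are expected to come from the surrounding TensorRT delegate.

#include "src/runtime/delegate/parameter_cache/embedding_cache_manager.h"

mindspore::Status BuildWithCache(mindspore::DelegateModel<mindspore::schema::Primitive> *model,
                                 uint32_t device_id, const void *stream) {
  auto cache_manager = std::make_shared<mindspore::cache::EmbeddingCacheManager>();
  // vocab_size / device_cache_size values are illustrative only.
  auto ret = cache_manager->Init(model, /*vocab_size=*/100000, /*device_cache_size=*/10000);
  if (ret != mindspore::kSuccess) {
    return ret;
  }
  for (auto iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); ++iter) {
    auto *kernel = *iter;
    if (cache_manager->CheckIsCacheKernel(kernel)) {
      ret = cache_manager->InitCacheKernel(kernel, device_id, stream);
      if (ret != mindspore::kSuccess) {
        return ret;
      }
    }
  }
  return mindspore::kSuccess;
}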

View File

@ -0,0 +1,60 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_
#include <memory>
#include <map>
#include <string>
#include <vector>
#include "include/api/kernel.h"
#include "include/api/status.h"
#include "include/api/data_type.h"
#include "src/runtime/delegate/parameter_cache/embedding_cache.h"
#include "src/runtime/delegate/parameter_cache/load_host_cache_model.h"
#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h"
namespace mindspore {
namespace cache {
class EmbeddingCacheManager {
public:
EmbeddingCacheManager() {
rank_id_ = lite::GetRankID();
rank_group_size_ = lite::GetGPUGroupSize();
}
Status Init(const std::string &cache_model_path, size_t vocab_size, size_t device_cache_size);
Status Init(DelegateModel<schema::Primitive> *model, size_t vocab_size, size_t device_cache_size);
bool CheckIsCacheKernel(kernel::Kernel *kernel);
Status InitCacheKernel(kernel::Kernel *kernel, uint32_t device_id, const void *context);
bool IsCacheTensor(mindspore::MSTensor tensor);
int CacheHandle(const std::string &tensor_name, mindspore::MSTensor model_input_tensor, void *device_addr);
Status SetDeviceCacheAddr(const std::string &tensor_name, void *device_mem_addr, size_t size);
std::vector<int64_t> GetCacheShape(mindspore::MSTensor tensor);
size_t GetCacheDataSize(mindspore::MSTensor tensor);
private:
std::map<std::string, std::shared_ptr<EmbeddingCache>> caches_;
std::vector<int> hash_indices_;
int rank_id_{0};
int rank_group_size_{1};
std::shared_ptr<HostCacheModel> host_cache_model_;
size_t vocab_size_{0};
size_t device_cache_size_{0};
};
} // namespace cache
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_

View File

@ -0,0 +1,81 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_
#include <map>
#include <memory>
#include "include/api/status.h"
namespace mindspore {
namespace lite {
template <typename KEY, typename PRODUCT>
class ProcductRegistrar {
public:
virtual std::shared_ptr<PRODUCT> Create() = 0;
protected:
ProcductRegistrar() {}
virtual ~ProcductRegistrar() {}
private:
ProcductRegistrar(const ProcductRegistrar &);
const ProcductRegistrar &operator=(const ProcductRegistrar &);
};
template <typename KEY, typename PRODUCT>
class FactoryManagerBase {
public:
static FactoryManagerBase &Instance() {
static FactoryManagerBase<KEY, PRODUCT> instance;
return instance;
}
void RegProduct(const KEY &key, ProcductRegistrar<KEY, PRODUCT> *registrar) { registrars[key] = registrar; }
std::shared_ptr<PRODUCT> GetProduct(const KEY &key) {
auto registrar_iter = registrars.find(key);
if (registrar_iter != registrars.end()) {
if (registrar_iter->second != nullptr) {
return registrar_iter->second->Create();
}
}
return nullptr;
}
private:
FactoryManagerBase() = default;
~FactoryManagerBase() = default;
FactoryManagerBase(const FactoryManagerBase &);
const FactoryManagerBase &operator=(const FactoryManagerBase &);
private:
std::map<KEY, ProcductRegistrar<KEY, PRODUCT> *> registrars;
};
template <typename KEY, typename PRODUCT, typename PRODUCT_IMPL>
class CommonProcductRegistrar : public ProcductRegistrar<KEY, PRODUCT> {
public:
explicit CommonProcductRegistrar(const KEY &key) {
FactoryManagerBase<KEY, PRODUCT>::Instance().RegProduct(key, this);
}
std::shared_ptr<PRODUCT> Create() { return std::make_shared<PRODUCT_IMPL>(); }
};
#define RET_COMMON_PRODUCT_REGISTRAR(KEY, PRODUCT, PRODUCT_IMPL, key, name) \
static mindspore::lite::CommonProcductRegistrar<KEY, PRODUCT, PRODUCT_IMPL> g_commonProcductRegistrar##name(key);
} // namespace lite
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_
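
A minimal sketch of how the registrar macro and factory above work together; the Codec/JpegCodec names are illustrative and not part of this patch. The macro registers a creator under a key at static-initialization time, and GetProduct returns a fresh instance for that key.

#include <iostream>
#include <string>
#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h"

class Codec {
 public:
  virtual ~Codec() = default;
  virtual void Run() = 0;
};

class JpegCodec : public Codec {
 public:
  void Run() override { std::cout << "jpeg" << std::endl; }
};

// Registers JpegCodec under the key "jpeg" at static-initialization time.
RET_COMMON_PRODUCT_REGISTRAR(std::string, Codec, JpegCodec, "jpeg", JpegCodec);

int main() {
  auto codec = mindspore::lite::FactoryManagerBase<std::string, Codec>::Instance().GetProduct("jpeg");
  if (codec != nullptr) {
    codec->Run();
  }
  return 0;
}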

View File

@ -0,0 +1,158 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h"
#include <cuda_runtime.h>
#include <string>
#include "src/runtime/delegate/tensorrt/cuda_impl/hash.cuh"
#include "plugin/device/gpu/hal/device/cuda_driver.h"
#include "src/common/log_adapter.h"
#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h"
namespace mindspore {
namespace cache {
namespace gpu {
RET_COMMON_PRODUCT_REGISTRAR(std::string, cache::CacheMemBase, cache::gpu::GPUCacheMem, "gpu", GPUCacheMem);
bool GPUCacheMem::InitDevice(uint32_t device_id, const void *context) {
auto cuda_ret = cudaSetDevice(static_cast<int>(device_id));
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "Failed to set device id " << device_id << ", cuda_ret " << cuda_ret << " "
<< cudaGetErrorString(cuda_ret);
return false;
}
if (context != nullptr) {
stream_ = *(reinterpret_cast<const cudaStream_t *>(context));
return true;
}
cuda_ret = cudaStreamCreate(&stream_);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "Cuda create stream failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret);
return false;
}
return true;
}
void *GPUCacheMem::MallocMemory(size_t size) {
void *device_ptr = nullptr;
auto cuda_ret = cudaMalloc(&device_ptr, size);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "Cuda Malloc failed for size:" << size << ", cuda_ret " << cuda_ret << " "
<< cudaGetErrorString(cuda_ret);
return nullptr;
}
MS_LOG(DEBUG) << "cudaMalloc size: " << size;
return device_ptr;
}
void GPUCacheMem::FreeMemory(void *device_addr) {
auto cuda_ret = cudaFree(device_addr);
if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) {
MS_LOG(WARNING) << "free cuda memory failed, "
<< ", cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret);
}
}
bool GPUCacheMem::SynchronizeStream() {
auto cuda_ret = cudaStreamSynchronize(stream_);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "Cuda sync stream failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret);
return false;
}
return true;
}
bool GPUCacheMem::CopyHostMemToDevice(void *dst, const void *src, size_t size) {
if (dst == nullptr) {
MS_LOG(ERROR) << "dst is nullptr";
return false;
}
if (src == nullptr) {
MS_LOG(ERROR) << "src is nullptr";
return false;
}
auto cuda_ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream_);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "Cuda memcpy failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret);
return false;
}
return true;
}
bool GPUCacheMem::CopyDeviceMemToHost(void *dst, const void *src, size_t size) {
if (dst == nullptr) {
MS_LOG(ERROR) << "dst is nullptr";
return false;
}
if (src == nullptr) {
MS_LOG(ERROR) << "src is nullptr";
return false;
}
auto cuda_ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream_);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "Cuda memcpy failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret);
return false;
}
return true;
}
bool GPUCacheMem::HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr, size_t,
size_t embedding_size, size_t swap_out_size) {
if (hash_table_addr == nullptr) {
MS_LOG(ERROR) << "hash_table_addr is nullptr";
return false;
}
if (swap_out_value_addr == nullptr) {
MS_LOG(ERROR) << "swap_out_value_addr is nullptr";
return false;
}
if (swap_out_index_addr == nullptr) {
MS_LOG(ERROR) << "swap_out_index_addr is nullptr";
return false;
}
DoHashSwapOut(reinterpret_cast<float *>(hash_table_addr), reinterpret_cast<float *>(swap_out_value_addr),
reinterpret_cast<int *>(swap_out_index_addr), swap_out_size, embedding_size, stream_);
return true;
}
bool GPUCacheMem::HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr, size_t,
size_t embedding_size, size_t swap_in_size) {
if (hash_table_addr == nullptr) {
MS_LOG(ERROR) << "hash_table_addr is nullptr";
return false;
}
if (swap_in_value_addr == nullptr) {
MS_LOG(ERROR) << "swap_in_value_addr is nullptr";
return false;
}
if (swap_in_index_addr == nullptr) {
MS_LOG(ERROR) << "swap_in_index_addr is nullptr";
return false;
}
DoHashSwapIn(reinterpret_cast<float *>(hash_table_addr), reinterpret_cast<float *>(swap_in_value_addr),
reinterpret_cast<int *>(swap_in_index_addr), swap_in_size, embedding_size, stream_);
return true;
}
} // namespace gpu
} // namespace cache
} // namespace mindspore
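
An illustrative sketch, not part of this patch, of obtaining the "gpu" backend registered above through the factory and staging a host buffer on the device; UploadToDevice is a hypothetical name.

#include <cstddef>
#include <string>
#include <vector>
#include "src/runtime/delegate/parameter_cache/cache_mem_base.h"
#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h"

bool UploadToDevice(uint32_t device_id, const std::vector<float> &host_data) {
  auto mem = mindspore::lite::FactoryManagerBase<std::string, mindspore::cache::CacheMemBase>::Instance().GetProduct(
    "gpu");
  // Passing nullptr as context makes the backend create its own stream.
  if (mem == nullptr || !mem->InitDevice(device_id, nullptr)) {
    return false;
  }
  size_t size = host_data.size() * sizeof(float);
  void *device_ptr = mem->MallocMemory(size);
  if (device_ptr == nullptr) {
    return false;
  }
  // CopyHostMemToDevice is asynchronous on the backend's stream, so the copy
  // must be synchronized before the host buffer is reused or freed.
  bool ok = mem->CopyHostMemToDevice(device_ptr, host_data.data(), size) && mem->SynchronizeStream();
  mem->FreeMemory(device_ptr);
  return ok;
}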

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_
#include <cuda_runtime_api.h>
#include <memory>
#include "src/runtime/delegate/parameter_cache/cache_mem_base.h"
namespace mindspore {
namespace cache {
namespace gpu {
class GPUCacheMem : public cache::CacheMemBase {
public:
GPUCacheMem() = default;
~GPUCacheMem() override = default;
bool InitDevice(uint32_t device_id, const void *context) override;
void *MallocMemory(size_t size) override;
void FreeMemory(void *buf) override;
bool SynchronizeStream() override;
bool CopyHostMemToDevice(void *dst, const void *src, size_t size) override;
bool CopyDeviceMemToHost(void *dst, const void *src, size_t size) override;
bool HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr, size_t cache_vocab_size,
size_t embedding_size, size_t swap_out_size) override;
bool HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr, size_t cache_vocab_size,
size_t embedding_size, size_t swap_in_size) override;
private:
cudaStream_t stream_;
};
} // namespace gpu
} // namespace cache
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_

View File

@ -0,0 +1,243 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <vector>
#include <string>
#include "src/common/log_adapter.h"
#include "src/runtime/delegate/parameter_cache/lfu_cache.h"
#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h"
namespace mindspore {
namespace cache {
RET_COMMON_PRODUCT_REGISTRAR(std::string, cache::CacheAlgorithm, cache::LFUCacheAlgorithm, "lfu", LFUCacheAlgorithm);
LFUCacheAlgorithm::~LFUCacheAlgorithm() {
for (auto iter : key_table_) {
delete *(iter.second);
}
key_table_.clear();
frequency_table_.clear();
}
Status LFUCacheAlgorithm::Init(size_t cache_size, int min_host_index, int max_host_index) {
if (cache_size <= 0 || min_host_index < 0 || max_host_index <= 0) {
return kLiteParamInvalid;
}
cache_size_ = cache_size;
min_host_index_ = min_host_index;
max_host_index_ = max_host_index;
return kSuccess;
}
CacheNoe *LFUCacheAlgorithm::GetNode(int key) {
auto key_table_iter = key_table_.find(key);
if (key_table_iter == key_table_.end()) {
return nullptr;
}
auto node_iter = key_table_iter->second;
auto node = *node_iter;
auto node_list_iter = frequency_table_.find(node->frequency);
if (node_list_iter == frequency_table_.end()) {
return nullptr;
}
auto &node_list = node_list_iter->second;
node_list.erase(node_iter);
if (node_list.empty()) {
frequency_table_.erase(node_list_iter);
}
node->frequency += 1;
frequency_table_[node->frequency].emplace_front(node);
key_table_[key] = frequency_table_[node->frequency].begin();
return node;
}
int LFUCacheAlgorithm::Get(int key) {
auto node = GetNode(key);
if (node != nullptr) {
return node->value;
}
return -1;
}
void LFUCacheAlgorithm::Put(int key, int value) {
auto node = GetNode(key);
if (node != nullptr) {
node->value = value;
return;
}
if (cache_size_ == 0) {
return;
}
CacheNoe *add_node = nullptr;
if (key_table_.size() == cache_size_) {
add_node = frequency_table_.begin()->second.back();
key_table_.erase(add_node->key);
frequency_table_.begin()->second.pop_back();
if (frequency_table_.begin()->second.size() == 0) {
frequency_table_.erase(frequency_table_.begin()->first);
}
add_node->value = value;
add_node->key = key;
add_node->frequency = 1;
} else {
add_node = new CacheNoe(key, 1, value);
if (add_node == nullptr) {
return;
}
}
frequency_table_[1].emplace_front(add_node);
key_table_[key] = frequency_table_[1].begin();
}
void LFUCacheAlgorithm::GetHitNodesAndSwapIndex(const int *batch_ids, const size_t batch_ids_len, int *cache_index,
std::unordered_map<int, CacheNoe *> *hit_index_nodes,
std::unordered_map<int, std::vector<int>> *need_swap_map) {
// split the batch into hit ids and ids that need to be swapped in
for (size_t i = 0; i < batch_ids_len; i++) {
auto key = batch_ids[i];
if (key < min_host_index_ || key >= max_host_index_) {
cache_index[i] = -1;
// out of range
continue;
}
auto hit_iter = hit_index_nodes->find(key);
if (hit_iter != hit_index_nodes->end()) {
auto node = hit_iter->second;
node->frequency += 1;
cache_index[i] = node->value;
continue;
}
auto swap_iter = need_swap_map->find(key);
if (swap_iter != need_swap_map->end()) {
swap_iter->second.push_back(i);
continue;
}
auto node_iter_iter = key_table_.find(key);
if (node_iter_iter == key_table_.end()) {
(*need_swap_map)[key].push_back(i);
continue;
}
auto node_iter = node_iter_iter->second;
auto node = *node_iter;
auto node_list_iter = frequency_table_.find(node->frequency);
if (node_list_iter == frequency_table_.end()) {
continue;
}
auto &node_list = node_list_iter->second;
node_list.erase(node_iter);
if (node_list.empty()) {
frequency_table_.erase(node_list_iter);
}
// hit
node->frequency += 1;
cache_index[i] = node->value;
(*hit_index_nodes)[key] = node;
}
return;
}
std::list<CacheNoe *> LFUCacheAlgorithm::GetSwapNodes(const std::unordered_map<int, std::vector<int>> &need_swap_map) {
std::list<CacheNoe *> need_swap_nodes;
auto swap_size = need_swap_map.size();
while (swap_size > 0 && !frequency_table_.empty()) {
auto node_list_iter = frequency_table_.begin();
if (node_list_iter->second.size() > swap_size) {
auto iter = node_list_iter->second.begin();
std::advance(iter, swap_size);
need_swap_nodes.splice(need_swap_nodes.end(), node_list_iter->second, node_list_iter->second.begin(), iter);
swap_size = 0;
} else {
swap_size -= node_list_iter->second.size();
need_swap_nodes.splice(need_swap_nodes.end(), node_list_iter->second);
frequency_table_.erase(node_list_iter);
}
}
return need_swap_nodes;
}
Status LFUCacheAlgorithm::CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index,
std::vector<int> *need_swap_indies,
std::vector<int> *need_swap_indies_cache_index) {
if (batch_ids == nullptr) {
MS_LOG(ERROR) << "batch_ids is nullptr";
return kLiteNullptr;
}
if (cache_index == nullptr) {
MS_LOG(ERROR) << "cache_index is nullptr";
return kLiteNullptr;
}
std::unordered_map<int, std::vector<int>> need_swap_map;
std::unordered_map<int, CacheNoe *> hit_index_nodes;
GetHitNodesAndSwapIndex(batch_ids, batch_ids_len, cache_index, &hit_index_nodes, &need_swap_map);
// pick need_swap_map.size() least frequently used nodes to evict
std::list<CacheNoe *> need_swap_nodes = GetSwapNodes(need_swap_map);
// reuse the evicted nodes for the new keys
{
if (need_swap_map.size() != need_swap_nodes.size()) {
MS_LOG(ERROR) << " need_swap_map.size() " << need_swap_map.size() << " != need_swap_nodes.size() "
<< need_swap_nodes.size();
return kLiteError;
}
need_swap_indies_cache_index->reserve(need_swap_map.size());
auto need_swap_map_iter = need_swap_map.begin();
for (auto iter = need_swap_nodes.begin();
iter != need_swap_nodes.end() && need_swap_map_iter != need_swap_map.end(); iter++, need_swap_map_iter++) {
auto node = *iter;
key_table_.erase(node->key);
node->key = need_swap_map_iter->first;
node->frequency = 1;
for (auto index : need_swap_map_iter->second) {
cache_index[index] = node->value;
}
need_swap_indies->push_back(need_swap_map_iter->first);
need_swap_indies_cache_index->push_back(node->value);
MS_LOG(INFO) << "device index " << node->value << ", for host index " << need_swap_map_iter->first;
key_table_[(*iter)->key] = iter;
}
auto node_list_iter = frequency_table_.begin();
if (node_list_iter != frequency_table_.end() && !node_list_iter->second.empty()) {
auto iter = node_list_iter->second.begin();
if ((*iter)->frequency == 1) {
node_list_iter->second.splice(node_list_iter->second.begin(), need_swap_nodes);
} else {
frequency_table_[1] = need_swap_nodes;
}
} else {
frequency_table_[1] = need_swap_nodes;
}
}
for (auto node_iter : hit_index_nodes) {
auto node = node_iter.second;
frequency_table_[node->frequency].emplace_front(node);
key_table_[node->key] = frequency_table_[node->frequency].begin();
}
return kSuccess;
}
} // namespace cache
} // namespace mindspore
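
An illustrative driver for the algorithm above, not part of this patch. The cache must be pre-populated with Put, as EmbeddingCache::SetHostCacheAddr does, before CheckCacheHit is called; ids outside [min_host_index, max_host_index) are reported as -1.

#include <vector>
#include "src/runtime/delegate/parameter_cache/lfu_cache.h"

void LfuExample() {
  mindspore::cache::LFUCacheAlgorithm lfu;
  // 4 device slots caching host ids drawn from [0, 100).
  if (lfu.Init(4, 0, 100) != mindspore::kSuccess) {
    return;
  }
  // Pre-populate the device slots, mirroring EmbeddingCache::SetHostCacheAddr.
  for (int slot = 0; slot < 4; ++slot) {
    lfu.Put(slot, slot);
  }
  std::vector<int> batch_ids = {1, 7, 1, 200};
  std::vector<int> cache_index(batch_ids.size(), -1);
  std::vector<int> need_swap_indies;
  std::vector<int> need_swap_indies_cache_index;
  auto ret = lfu.CheckCacheHit(batch_ids.data(), batch_ids.size(), cache_index.data(), &need_swap_indies,
                               &need_swap_indies_cache_index);
  // On success: id 1 hits, id 7 misses (it appears in need_swap_indies together
  // with the device slot it evicts), and id 200 is out of range so its
  // cache_index entry stays -1.
  (void)ret;
}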

View File

@ -0,0 +1,55 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_
#include <map>
#include <unordered_map>
#include <list>
#include <vector>
#include "include/api/status.h"
#include "src/runtime/delegate/parameter_cache/cache_algorithm.h"
namespace mindspore {
namespace cache {
class LFUCacheAlgorithm : public CacheAlgorithm {
public:
LFUCacheAlgorithm() {}
~LFUCacheAlgorithm() override;
int Get(int key) override;
void Put(int key, int value) override;
Status Init(size_t cache_size, int min_host_index, int max_host_index) override;
Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index,
std::vector<int> *need_swap_indies, std::vector<int> *need_swap_indies_cache_index) override;
private:
CacheNoe *GetNode(int key);
void GetHitNodesAndSwapIndex(const int *batch_ids, const size_t batch_ids_len, int *cache_index,
std::unordered_map<int, CacheNoe *> *hit_index_nodes,
std::unordered_map<int, std::vector<int>> *need_swap_map);
std::list<CacheNoe *> GetSwapNodes(const std::unordered_map<int, std::vector<int>> &need_swap_map);
std::unordered_map<int, std::list<CacheNoe *>::iterator> key_table_;
std::map<int, std::list<CacheNoe *>> frequency_table_;
size_t cache_size_{0};
int min_host_index_{0};
int max_host_index_{1};
};
} // namespace cache
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_

View File

@ -0,0 +1,148 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "src/runtime/delegate/parameter_cache/load_host_cache_model.h"
#include "src/common/log_adapter.h"
#include "src/common/common.h"
#include "include/errorcode.h"
#include "src/common/file_utils.h"
namespace {
constexpr size_t kGatherInputsSize = 3;
}
namespace mindspore {
namespace cache {
HostCacheModel::~HostCacheModel() {
if (cache_model_ != nullptr) {
delete cache_model_;
cache_model_ = nullptr;
}
}
MSTensor *SchemaTensorToMSTensor(lite::SchemaTensorWrapper *schema_tensor_wrapper,
mindspore::schema::Tensor *schema_tensor) {
std::vector<int64_t> shape;
for (size_t j = 0; j < schema_tensor->dims()->size(); j++) {
shape.push_back(schema_tensor->dims()->data()[j]);
}
std::string tensor_name;
if (schema_tensor->name() != nullptr) {
tensor_name = schema_tensor->name()->str();
}
return MSTensor::CreateRefTensor(tensor_name, (DataType)schema_tensor->dataType(), shape,
schema_tensor_wrapper->data(), schema_tensor_wrapper->length());
}
Status HostCacheModel::LoadCache(const std::string &model_path) {
cache_model_ = lite::LiteImportFromPath(model_path.c_str());
if (cache_model_ == nullptr) {
MS_LOG(ERROR) << "Import model failed";
return kLiteGraphFileError;
}
auto allTensors = cache_model_->graph_.all_tensors_;
for (auto node : cache_model_->graph_.all_nodes_) {
// only support embedding cache
if (node == nullptr || node->node_type_ != schema::PrimitiveType_Gather) {
continue;
}
auto input_index = node->input_indices_[0];
if (input_index >= allTensors.size()) {
MS_LOG(ERROR) << "invalid kernel input, input_index " << input_index << ",allTensors.size() "
<< allTensors.size();
return kLiteOutOfTensorRange;
}
auto schema_tensor_wrapper = cache_model_->GetSchemaTensor(input_index);
if (schema_tensor_wrapper == nullptr) {
MS_LOG(ERROR) << "invalid kernel input, input_index " << input_index;
return kLiteOutOfTensorRange;
}
auto schema_tensor = allTensors[input_index];
if (schema_tensor != nullptr && schema_tensor_wrapper->data() != nullptr) {
auto tensor = SchemaTensorToMSTensor(schema_tensor_wrapper, schema_tensor);
if (tensor == nullptr) {
return kLiteMemoryFailed;
}
cache_tensor_[tensor->Name()] = *tensor;
MS_LOG(INFO) << tensor->Name() << " is cache tensor, and the node is [" << node->name_ << "]";
delete tensor;
}
}
return kSuccess;
}
size_t GetVocabSize(kernel::Kernel *kernel) {
size_t vocab_size = 0;
auto cache_config = kernel->GetConfig(lite::kMSCache);
auto vocab_size_iter = cache_config.find(lite::kMSCacheVocabSize);
if (vocab_size_iter == cache_config.end()) {
return vocab_size;
}
auto vocab_size_opt = lite::GenericParseValue<size_t>(vocab_size_iter->second);
if (!vocab_size_opt.IsNone()) {
vocab_size = vocab_size_opt.Get();
}
return vocab_size;
}
Status HostCacheModel::LoadCache(DelegateModel<schema::Primitive> *model) {
KernelIter from, end;
for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) {
kernel::Kernel *kernel = *iter;
// only support embedding cache
if (kernel->type() != schema::PrimitiveType_Gather) {
continue;
}
MS_ASSERT(kernel->inputs().size() == kGatherInputsSize);
auto tensor = kernel->inputs()[0];
if (tensor.Data() == nullptr) {
continue;
}
size_t vocab_size = GetVocabSize(kernel);
if (vocab_size == 0) {
continue;
}
cache_tensor_[tensor.Name()] = tensor;
}
return mindspore::kSuccess;
}
bool HostCacheModel::CheckIsCacheKernel(kernel::Kernel *kernel) {
if (GetHostCacheTensor(kernel) == nullptr) {
return false;
}
return true;
}
MSTensor HostCacheModel::GetHostCacheTensor(kernel::Kernel *kernel) {
if (kernel != nullptr && kernel->inputs().size() > 0) {
auto iter = cache_tensor_.find(kernel->inputs()[0].Name());
if (iter != cache_tensor_.end()) {
return iter->second;
}
}
return MSTensor(nullptr);
}
} // namespace cache
} // namespace mindspore

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LOAD_HOST_CACHE_MODEL_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LOAD_HOST_CACHE_MODEL_H_
#include <map>
#include <string>
#include "include/api/status.h"
#include "include/api/data_type.h"
#include "include/api/types.h"
#include "include/api/kernel.h"
#include "include/api/delegate.h"
#include "src/runtime/lite_model.h"
namespace mindspore {
namespace cache {
class HostCacheModel {
public:
HostCacheModel() = default;
~HostCacheModel();
Status LoadCache(const std::string &model_path);
Status LoadCache(DelegateModel<schema::Primitive> *model);
bool CheckIsCacheKernel(kernel::Kernel *kernel);
MSTensor GetHostCacheTensor(kernel::Kernel *kernel);
private:
std::map<std::string, MSTensor> cache_tensor_;
mindspore::lite::LiteModel *cache_model_{nullptr};
char *model_buf_{nullptr};
size_t model_size_;
};
} // namespace cache
} // namespace mindspore
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LOAD_HOST_CACHE_MODEL_H_

View File

@ -0,0 +1,95 @@
include_directories(${TENSORRT_PATH}/include)
include_directories(${CUDA_PATH}/include)
include_directories(${CUDA_PATH})
include_directories(${CCSRC_DIR}/plugin/device/cpu/kernel)
include_directories(${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops)
if(DEFINED ENV{MS_ENABLE_CUDA_DISTRIBUTION})
set(MS_ENABLE_CUDA_DISTRIBUTION $ENV{MS_ENABLE_CUDA_DISTRIBUTION})
else()
set(MS_ENABLE_CUDA_DISTRIBUTION "off")
endif()
set(NCCL_MPI_SRC_STUB
${CMAKE_CURRENT_SOURCE_DIR}/distribution/distribution_collective.cc
${CMAKE_CURRENT_SOURCE_DIR}/distribution/distribution_base.cc
)
# nccl mpi
if(MS_ENABLE_CUDA_DISTRIBUTION STREQUAL "on")
message("enable cuda gpu distribution collective")
file(GLOB NCCL_MPI_SRC LIST_DIRECTORIES false
${CMAKE_CURRENT_SOURCE_DIR}/distribution/*.cc
${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/collective_wrapper.cc
${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/mpi_wrapper.cc
${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/nccl_wrapper.cc
)
list(REMOVE_ITEM NCCL_MPI_SRC ${NCCL_MPI_SRC_STUB})
add_compile_definitions(LITE_CUDA_DISTRIBUTION)
include(${TOP_DIR}/cmake/external_libs/ompi.cmake)
include(${TOP_DIR}/cmake/external_libs/nccl.cmake)
add_library(gpu_distribution_collective OBJECT ${NCCL_MPI_SRC})
add_library(mindspore::nccl ALIAS nccl::nccl)
add_library(mindspore::ompi ALIAS ompi::mpi)
target_link_libraries(gpu_distribution_collective PRIVATE mindspore::ompi mindspore::nccl)
else()
add_library(gpu_distribution_collective OBJECT ${NCCL_MPI_SRC_STUB})
endif()
add_dependencies(gpu_distribution_collective fbs_src)
file(GLOB TENSORRT_RUNTIME_SRC LIST_DIRECTORIES false
${CMAKE_CURRENT_SOURCE_DIR}/*.cc
${CMAKE_CURRENT_SOURCE_DIR}/op/*.cc
${CMAKE_CURRENT_SOURCE_DIR}/cuda_impl/*.cc
${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime/delegate/delegate_utils.cc
${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.cc
)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache)
set(TENSORRT_RUNTIME_SRC
${TENSORRT_RUNTIME_SRC}
${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/embedding_cache_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/load_host_cache_model.cc
${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/lfu_cache.cc
${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/embedding_cache.cc
${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/gpu/gpu_cache_mem.cc
)
link_libraries(${CUDA_LIB_PATH}/libcudnn.so)
link_libraries(${CUDA_LIB_PATH}/libnvrtc.so)
link_libraries(${CUDA_LIB_PATH}/libcublasLt.so)
add_library(libcudart SHARED IMPORTED)
set_target_properties(libcudart PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcudart.so)
add_library(libnvinfer SHARED IMPORTED)
set_target_properties(libnvinfer PROPERTIES IMPORTED_LOCATION ${TENSORRT_LIB_PATH}/libnvinfer.so)
add_library(libcublas SHARED IMPORTED)
set_target_properties(libcublas PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcublas.so)
add_library(tensorrt_kernel_mid OBJECT ${TENSORRT_RUNTIME_SRC})
add_dependencies(tensorrt_kernel_mid fbs_src)
target_link_libraries(
tensorrt_kernel_mid
libcudart
libcublas
libnvinfer
)
# cuda
find_package(CUDA)
file(GLOB_RECURSE CUDA_KERNEL_SRC
${CMAKE_CURRENT_SOURCE_DIR}/cuda_impl/*.cu
${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu
${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/swish_impl.cu
)
set_source_files_properties(${CUDA_KERNEL_SRC} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -fPIC")
SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++14;)
cuda_add_library(cuda_kernel_mid STATIC ${CUDA_KERNEL_SRC})

View File

@ -0,0 +1,56 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/activation.cuh"
#include <stdio.h>
#include <math.h>
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
template <typename T>
__global__ void SigmoidKernel(const T *input1, T *output, int element_cnt) {
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) {
output[pos] = static_cast<T>(1) / (static_cast<T>(1) + exp(-input1[pos]));
}
}
template <typename T>
__global__ void GeluKernel(const T *input_addr, T *output_addr, int size) {
// formula:
// gelu(x) = 0.5 * x * (1.0 + tanh(y))
// tanh(y) = 2 / (1 + exp(-2y)) - 1
// y = sqrt(2/pi) * (x + 0.044715 * x^3)
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
float x = input_addr[pos];
float tanh_res = tanh(0.7978845608f * (x + 0.044715f * x * x * x));
output_addr[pos] = 0.5f * x * (1.0f + tanh_res);
}
}
template <typename T>
void Sigmoid(const T *input1, T *output, int element_cnt, cudaStream_t stream) {
SigmoidKernel<<<GET_BLOCKS(element_cnt), GET_THREADS, 0, stream>>>(input1, output, element_cnt);
return;
}
template <typename T>
void Gelu(const T *input1, T *output, int element_cnt, cudaStream_t stream) {
GeluKernel<<<GET_BLOCKS(element_cnt), GET_THREADS, 0, stream>>>(input1, output, element_cnt);
return;
}
template void Sigmoid(const float *input1, float *output, int element_cnt, cudaStream_t stream);
template void Gelu(const float *input1, float *output, int element_cnt, cudaStream_t stream);
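
An illustrative host-side sketch, not part of this patch, of driving the float Sigmoid wrapper above; RunSigmoid is a hypothetical name and error checking is omitted for brevity.

#include <cuda_runtime.h>
#include <vector>
#include "src/runtime/delegate/tensorrt/cuda_impl/activation.cuh"

void RunSigmoid(const std::vector<float> &host_in, std::vector<float> *host_out, cudaStream_t stream) {
  const int element_cnt = static_cast<int>(host_in.size());
  const size_t bytes = element_cnt * sizeof(float);
  float *device_in = nullptr;
  float *device_out = nullptr;
  cudaMalloc(&device_in, bytes);
  cudaMalloc(&device_out, bytes);
  cudaMemcpyAsync(device_in, host_in.data(), bytes, cudaMemcpyHostToDevice, stream);
  Sigmoid(device_in, device_out, element_cnt, stream);  // enqueues SigmoidKernel on `stream`
  host_out->resize(element_cnt);
  cudaMemcpyAsync(host_out->data(), device_out, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  cudaFree(device_in);
  cudaFree(device_out);
}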

View File

@ -0,0 +1,26 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_ACTIVATION_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_ACTIVATION_H_
#include <cuda_runtime.h>
template <typename T>
void Sigmoid(const T *input1, T *output, int element_cnt, cudaStream_t stream);
template <typename T>
void Gelu(const T *input1, T *output, int element_cnt, cudaStream_t stream);
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_ACTIVATION_H_

View File

@ -0,0 +1,49 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh"
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
// Generic cast
template <typename S, typename T>
__device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) {
*output_addr = static_cast<T>((*input_addr));
}
template <typename S, typename T>
__global__ void CastKernel(const int input_size, const S *input_addr, T *output_addr) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < input_size; pos += blockDim.x * gridDim.x) {
CastBase(input_addr + pos, output_addr + pos);
}
}
template <typename S, typename T>
void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream) {
CastKernel<<<GET_BLOCKS(input_size), GET_THREADS, 0, stream>>>(input_size, input_addr, output_addr);
}
template void Cast(const int input_size, const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const int8_t *input_addr, int32_t *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const int8_t *input_addr, float *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const int32_t *input_addr, int8_t *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const int32_t *input_addr, float *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const int32_t *input_addr, bool *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const float *input_addr, int8_t *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const float *input_addr, int32_t *output_addr, cudaStream_t stream);
template void Cast(const int input_size, const float *input_addr, float *output_addr, cudaStream_t stream);

View File

@ -0,0 +1,23 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CAST_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CAST_H_
#include <cuda_runtime.h>
template <typename S, typename T>
void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream);
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CAST_H_

View File

@ -0,0 +1,70 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h"
namespace mindspore::lite {
void Cublas2DTranspose(const float *in_addr, float *out_addr, const int *params, cublasHandle_t cublas_handle) {
const int m = params[0];
const int n = params[1];
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK_VOID(
cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, &alpha, in_addr, n, &beta, out_addr, m, out_addr, m));
}
void CublasMM1Batch(const void *a_addr, const void *b_addr, void *c_addr, const int *params,
const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle) {
const int m = params[0];
const int n = params[1];
const int k = params[2];
cublasOperation_t trans_a = operations[0];
cublasOperation_t trans_b = operations[1];
const int lda = (trans_a == CUBLAS_OP_N) ? k : m;
const int ldb = (trans_b == CUBLAS_OP_N) ? n : k;
const int ldc = n;
cudaDataType type_a = data_types[0];
cudaDataType type_b = data_types[1];
cudaDataType type_c = data_types[2];
cudaDataType compute_type = data_types[3];
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK_VOID(cublasGemmEx(cublas_handle, trans_b, trans_a, n, m, k, &alpha, b_addr, type_b, ldb, a_addr, type_a,
lda, &beta, c_addr, type_c, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
void CublasMMBatched(void **a_addrs, void **b_addrs, void **c_addrs, const int *params,
const cublasOperation_t *operations, const cudaDataType *data_types,
cublasHandle_t cublas_handle) {
cublasOperation_t trans_a = operations[0];
cublasOperation_t trans_b = operations[1];
const int m = params[0];
const int n = params[1];
const int k = params[2];
const int batch = params[3];
const int lda = (trans_a == CUBLAS_OP_N) ? k : m;
const int ldb = (trans_b == CUBLAS_OP_N) ? n : k;
const int ldc = n;
cudaDataType type_a = data_types[0];
cudaDataType type_b = data_types[1];
cudaDataType type_c = data_types[2];
cudaDataType compute_type = data_types[3];
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK_VOID(cublasGemmBatchedEx(cublas_handle, trans_b, trans_a, n, m, k, &alpha, b_addrs, type_b, ldb, a_addrs,
type_a, lda, &beta, c_addrs, type_c, ldc, batch, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
} // namespace mindspore::lite

View File

@ -0,0 +1,62 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_
#include <cublas_v2.h>
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
#include "src/common/log_util.h"
// cublas API error checking
#define CUBLAS_CHECK_VOID(err) \
do { \
cublasStatus_t cublas_err = (err); \
if (cublas_err != CUBLAS_STATUS_SUCCESS) { \
MS_LOG(ERROR) << "cublas error " << cublas_err; \
return; \
} \
} while (0)
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t cublas_err = (err); \
if (cublas_err != CUBLAS_STATUS_SUCCESS) { \
MS_LOG(ERROR) << "cublas error " << cublas_err; \
return -1; \
} \
} while (0)
namespace mindspore::lite {
// a: m * n
// params order: m, n
void Cublas2DTranspose(const float *in_addr, float *out_addr, const int *params, cublasHandle_t cublas_handle);
// a: m * k, b: k * n, c: m * n
// params order: m, n, k
// operations order: trans_a, trans_b
// data_types: type_a, type_b, type_c, compute type
void CublasMM1Batch(const void *a_addr, const void *b_addr, void *c_addr, const int *params,
const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle);
// a: batch * m * k, b: batch * k * n, c: batch * m * n
// params order: m, n, k, batch
// operations order: trans_a, trans_b
// data_types: type_a, type_b, type_c, compute type
void CublasMMBatched(void **a_addrs, void **b_addrs, void **c_addrs, const int *params,
const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle);
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_
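To make the parameter-order conventions documented above concrete, here is a minimal, hypothetical usage sketch (not part of this commit; the shapes, variable names, and lack of error checking are assumptions for illustration) that multiplies a row-major 2x4 matrix by a 4x3 matrix in FP32 through CublasMM1Batch:

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <vector>
#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h"

int main() {
  const int m = 2, n = 3, k = 4;
  std::vector<float> a(m * k, 1.0f), b(k * n, 1.0f), c(m * n, 0.0f);
  float *a_dev = nullptr, *b_dev = nullptr, *c_dev = nullptr;
  cudaMalloc(&a_dev, a.size() * sizeof(float));
  cudaMalloc(&b_dev, b.size() * sizeof(float));
  cudaMalloc(&c_dev, c.size() * sizeof(float));
  cudaMemcpy(a_dev, a.data(), a.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(b_dev, b.data(), b.size() * sizeof(float), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);
  const int params[] = {m, n, k};                                     // params order: m, n, k
  const cublasOperation_t operations[] = {CUBLAS_OP_N, CUBLAS_OP_N};  // trans_a, trans_b
  const cudaDataType data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F};  // a, b, c, compute
  mindspore::lite::CublasMM1Batch(a_dev, b_dev, c_dev, params, operations, data_types, handle);

  cudaMemcpy(c.data(), c_dev, c.size() * sizeof(float), cudaMemcpyDeviceToHost);
  // With all-ones inputs every element of c should come back as k (4.0f).
  cublasDestroy(handle);
  cudaFree(a_dev);
  cudaFree(b_dev);
  cudaFree(c_dev);
  return 0;
}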

View File

@ -0,0 +1,48 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
#include <cmath>
#include "src/common/log_util.h"
CudaHelper &CudaHelper::GetInstance() {
static CudaHelper instance;
return instance;
}
int CudaHelper::GetThreadNum() const { return threads_per_block_; }
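// Round the requested block size up to a whole number of warps (a multiple of 32) and cap it at the device limit.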
int CudaHelper::GetThreadNum(const int block_size) const {
return std::min(threads_per_block_, ((block_size - 1) / 32 + 1) * 32);
}
int CudaHelper::GetBlocksNum(const int total_threads) const {
return std::min(((total_threads - 1) / threads_per_block_) + 1, max_blocks_);
}
int CudaHelper::GetBlocksNum(const int total_threads, const int block_size) const {
int valid_block_size = std::min(block_size, threads_per_block_);
if (valid_block_size == 0) {
MS_LOG(ERROR) << "invalid input of block_size: " << block_size;
return 0;
}
return std::min(((total_threads - 1) / valid_block_size) + 1, max_blocks_);
}
CudaHelper::CudaHelper() {
int device_id = 0;
(void)cudaGetDevice(&device_id);
cudaDeviceProp prop;
(void)cudaGetDeviceProperties(&prop, device_id);
threads_per_block_ = prop.maxThreadsPerBlock;
max_blocks_ = prop.multiProcessorCount;
}

View File

@ -0,0 +1,63 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_
#include <cuda_runtime.h>
#include <algorithm>
class CudaHelper {
public:
int GetThreadNum() const;
int GetThreadNum(const int block_size) const;
int GetBlocksNum(const int total_threads) const;
int GetBlocksNum(const int total_threads, const int block_size) const;
static CudaHelper &GetInstance();
private:
CudaHelper();
~CudaHelper() = default;
CudaHelper(const CudaHelper &) = delete;
CudaHelper &operator=(const CudaHelper &) = delete;
int max_blocks_;
int threads_per_block_;
};
#define GET_BLOCKS(total_threads) CudaHelper::GetInstance().GetBlocksNum(total_threads)
#define GET_BLOCKS_CAL(total_threads, block_size) CudaHelper::GetInstance().GetBlocksNum(total_threads, block_size)
#define GET_THREADS CudaHelper::GetInstance().GetThreadNum()
#define GET_THREADS_CAL(block_size) CudaHelper::GetInstance().GetThreadNum(block_size)
#define CUDA_CHECK(ret) \
do { \
cudaError_t cuda_ret = (ret); \
if ((cuda_ret) != cudaSuccess) { \
return -1; \
} \
} while (0)
#define CUDA_CHECK_VOID(ret) \
do { \
cudaError_t cuda_ret = (ret); \
if ((cuda_ret) != cudaSuccess) { \
return; \
} \
} while (0)
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_

View File

@ -0,0 +1,41 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h"
#include <unordered_map>
namespace mindspore::lite {
cudnnDataType_t ConvertCudnnDataType(nvinfer1::DataType trt_datatype) {
std::unordered_map<nvinfer1::DataType, cudnnDataType_t> data_types = {{nvinfer1::DataType::kFLOAT, CUDNN_DATA_FLOAT},
{nvinfer1::DataType::kHALF, CUDNN_DATA_HALF},
{nvinfer1::DataType::kINT32, CUDNN_DATA_INT32},
{nvinfer1::DataType::kINT8, CUDNN_DATA_INT8}};
if (data_types.find(trt_datatype) != data_types.end()) {
return data_types[trt_datatype];
} else {
MS_LOG(ERROR) << "invalid datatype for cudnn: " << static_cast<int>(trt_datatype);
}
return CUDNN_DATA_FLOAT;
}
int CudnnActivation(cudnnHandle_t handle, cudnnActivationDescriptor_t activation_desc,
const cudnnTensorDescriptor_t x_dsc, const void *x, const cudnnTensorDescriptor_t y_dsc, void *y) {
float alpha = 1.0f;
float beta = 0.0f;
CUDNN_CHECK(cudnnActivationForward(handle, activation_desc, &alpha, x_dsc, x, &beta, y_dsc, y));
return 0;
}
} // namespace mindspore::lite

View File

@ -0,0 +1,48 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_
#include <cudnn.h>
#include <NvInfer.h>
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
#include "src/common/log_util.h"
#define CUDNN_CHECK_VOID(err) \
do { \
cudnnStatus_t cudnn_err = (err); \
if (cudnn_err != CUDNN_STATUS_SUCCESS) { \
MS_LOG(ERROR) << "cudnn error " << cudnnGetErrorString(cudnn_err); \
return; \
} \
} while (0)
#define CUDNN_CHECK(err) \
do { \
cudnnStatus_t cudnn_err = (err); \
if (cudnn_err != CUDNN_STATUS_SUCCESS) { \
MS_LOG(ERROR) << "cudnn error " << cudnnGetErrorString(cudnn_err); \
return -1; \
} \
} while (0)
namespace mindspore::lite {
cudnnDataType_t ConvertCudnnDataType(nvinfer1::DataType trt_datatype);
int CudnnActivation(cudnnHandle_t handle, cudnnActivationDescriptor_t activation_desc,
const cudnnTensorDescriptor_t x_dsc, const void *x, const cudnnTensorDescriptor_t y_dsc, void *y);
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_
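A rough illustration (a hypothetical sketch, not code from this commit; RunReluExample and its parameters are invented for the example) of how CudnnActivation and the CUDNN_CHECK macro are intended to be driven, here applying a ReLU to an NCHW float tensor already resident on the device:

#include <cuda_runtime.h>
#include <cudnn.h>
#include "src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h"

int RunReluExample(const float *x_dev, float *y_dev, int n, int c, int h, int w, cudaStream_t stream) {
  cudnnHandle_t handle;
  CUDNN_CHECK(cudnnCreate(&handle));
  CUDNN_CHECK(cudnnSetStream(handle, stream));

  // One descriptor is reused for input and output, mirroring ActivationOptPlugin.
  cudnnTensorDescriptor_t desc;
  CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc));
  CUDNN_CHECK(cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w));

  // Describe the activation: ReLU, no NaN propagation, coefficient unused for this mode.
  cudnnActivationDescriptor_t act_desc;
  CUDNN_CHECK(cudnnCreateActivationDescriptor(&act_desc));
  CUDNN_CHECK(cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));

  int ret = mindspore::lite::CudnnActivation(handle, act_desc, desc, x_dev, desc, y_dev);

  CUDNN_CHECK(cudnnDestroyActivationDescriptor(act_desc));
  CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc));
  CUDNN_CHECK(cudnnDestroy(handle));
  return ret;
}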

View File

@ -0,0 +1,35 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/equal.cuh"
#include <stdio.h>
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
template <typename T>
__global__ void EqualKernel(const T *input1, const T *input2, T *output, int element_cnt) {
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) {
output[pos] = (input1[pos] - input2[pos] < 1e-6 && input1[pos] - input2[pos] > -1e-6);
}
}
template <typename T>
void Equal(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) {
EqualKernel<<<GET_BLOCKS(element_cnt), GET_THREADS, 0, stream>>>(input1, input2, output, element_cnt);
return;
}
template void Equal(const float *input1, const float *input2, float *output, int element_cnt, cudaStream_t stream);
template void Equal(const int *input1, const int *input2, int *output, int element_cnt, cudaStream_t stream);

View File

@ -0,0 +1,23 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_EQUAL_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_EQUAL_H_
#include <cuda_runtime.h>
template <typename T>
void Equal(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream);
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_EQUAL_H_

View File

@ -0,0 +1,64 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/hash.cuh"
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
template <typename T>
__global__ void HashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size,
const int hash_dim) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < index_size; i += blockDim.x * gridDim.x) {
int hash_index = swap_out_index[i];
for (int j = 0; j < hash_dim; j++) {
swap_out_value[i * hash_dim + j] = hash_table[hash_index * hash_dim + j];
}
}
return;
}
template <typename T>
__global__ void HashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size,
const int hash_dim) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < index_size; i += blockDim.x * gridDim.x) {
int hash_index = swap_in_index[i];
for (int j = 0; j < hash_dim; j++) {
hash_table[hash_index * hash_dim + j] = swap_in_value[i * hash_dim + j];
}
}
return;
}
template <typename T>
void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size,
const int hash_dim, cudaStream_t cuda_stream) {
HashSwapOut<<<GET_BLOCKS(index_size), GET_THREADS, 0, cuda_stream>>>(hash_table, swap_out_value, swap_out_index,
index_size, hash_dim);
return;
}
template <typename T>
void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size,
const int hash_dim, cudaStream_t cuda_stream) {
HashSwapIn<<<GET_BLOCKS(index_size), GET_THREADS, 0, cuda_stream>>>(hash_table, swap_in_value, swap_in_index,
index_size, hash_dim);
return;
}
template void DoHashSwapOut<float>(const float *hash_table, float *swap_out_value, const int *swap_out_index,
const int index_size, const int hash_dim, cudaStream_t cuda_stream);
template void DoHashSwapIn<float>(float *hash_table, const float *swap_in_value, const int *swap_in_index,
const int index_size, const int hash_dim, cudaStream_t cuda_stream);

View File

@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_HASH_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_HASH_H_
#include <cuda_runtime.h>
template <typename T>
void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size,
const int hash_dim, cudaStream_t cuda_stream);
template <typename T>
void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size,
const int hash_dim, cudaStream_t cuda_stream);
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_HASH_H_

View File

@ -0,0 +1,63 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh"
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
template <typename T>
__global__ void LogicalNotKernel(const T *input1, T *output, int element_cnt) {
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) {
output[pos] = static_cast<T>(input1[pos] == 0);
}
}
template <typename T>
__global__ void LogicalAndKernel(const T *input_addr1, const T *input_addr2, T *output, int size) {
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
output[pos] = input_addr1[pos] * input_addr2[pos];
}
}
template <typename T>
__global__ void LogicalOrKernel(const T *input_addr1, const T *input_addr2, T *output, int size) {
for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
T sum = input_addr1[pos] + input_addr2[pos];
output[pos] = static_cast<T>(sum > 0);
}
}
template <typename T>
void LogicalNot(const T *input1, T *output, int element_cnt, cudaStream_t stream) {
LogicalNotKernel<<<GET_BLOCKS(element_cnt), GET_THREADS, 0, stream>>>(input1, output, element_cnt);
}
template <typename T>
void LogicalAnd(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) {
LogicalAndKernel<<<GET_BLOCKS(element_cnt), GET_THREADS, 0, stream>>>(input1, input2, output, element_cnt);
}
template <typename T>
void LogicalOr(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) {
LogicalOrKernel<<<GET_BLOCKS(element_cnt), GET_THREADS, 0, stream>>>(input1, input2, output, element_cnt);
}
template void LogicalNot(const int32_t *input1, int32_t *output, int element_cnt, cudaStream_t stream);
template void LogicalAnd(const int32_t *input1, const int32_t *input2, int32_t *output, int element_cnt,
cudaStream_t stream);
template void LogicalOr(const int32_t *input1, const int32_t *input2, int32_t *output, int element_cnt,
cudaStream_t stream);

View File

@ -0,0 +1,29 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_LOGICAL_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_LOGICAL_H_
#include <cuda_runtime.h>
template <typename T>
void LogicalAnd(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream);
template <typename T>
void LogicalOr(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream);
template <typename T>
void LogicalNot(const T *input1, T *output, int element_cnt, cudaStream_t stream);
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_LOGICAL_H_

View File

@ -0,0 +1,98 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh"
#include <stdio.h>
#include <math.h>
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/utils.cuh"
template <typename T>
__global__ void NormalizeKernel(const T *input, const T *gamma, const T *beta, T *output, size_t n, float epsilon,
int dim_before_axis) {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int block_loop = (dim_before_axis - 1) / gridDim.x + 1;
const int element_cnt = dim_before_axis * n;
__shared__ float s_mean[2048];
__shared__ float s_variance[2048];
float sum = 0.0f;
float variance = 0.0f;
for (int block = 0; block < block_loop; block++) {
float local_sum = 0.0f;
int mean_index = bid + block * gridDim.x;
int num_index = bid * n + block * gridDim.x * blockDim.x;
for (int i = tid; i < n; i += blockDim.x) {
if (num_index + i >= element_cnt) {
break;
}
local_sum += static_cast<float>(input[num_index + i]);
}
sum = blockReduceSum(local_sum);
if (tid == 0) {
s_mean[mean_index] = sum / n;
}
}
__syncthreads();
for (int block = 0; block < block_loop; block++) {
float local_var_sum = 0.0f;
int var_index = bid + block * gridDim.x;
int num_index = bid * n + block * gridDim.x * blockDim.x;
for (int i = tid; i < n; i += blockDim.x) {
if (num_index + i >= element_cnt) {
break;
}
float diff = static_cast<float>(input[num_index + i]) - s_mean[var_index];
local_var_sum += diff * diff;
}
variance = blockReduceSum(local_var_sum);
if (tid == 0) {
s_variance[var_index] = rsqrtf(variance / n + epsilon);
}
}
__syncthreads();
for (int block = 0; block < block_loop; block++) {
int var_index = bid + block * gridDim.x;
int num_index = bid * n + block * gridDim.x * blockDim.x;
for (int i = tid; i < n; i += blockDim.x) {
if (num_index + i >= element_cnt) {
break;
}
float beta_val = (beta == nullptr) ? 0.0f : static_cast<float>(beta[i]);
output[num_index + i] =
static_cast<T>(((static_cast<float>(input[num_index + i]) - s_mean[var_index]) * s_variance[var_index]) *
static_cast<float>(gamma[i]) +
beta_val);
}
}
}
template <typename T>
void Normalize(const T *input, const T *gamma, const T *beta, T *output, size_t dim_at_axis, float epsilon,
int element_cnt, cudaStream_t stream) {
int thread_num = GET_THREADS_CAL(dim_at_axis);
int block_num = GET_BLOCKS_CAL(element_cnt, thread_num);
int dim_before_axis = element_cnt / dim_at_axis;
NormalizeKernel<<<block_num, thread_num, 0, stream>>>(input, gamma, beta, output, dim_at_axis, epsilon,
dim_before_axis);
return;
}
template void Normalize(const float *input, const float *gamma, const float *beta, float *output, size_t dim_at_axis,
float epsilon, int element_cnt, cudaStream_t stream);
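NormalizeKernel above is effectively a layer-normalization kernel: each slice of n = dim_at_axis contiguous elements is reduced to a mean and variance with the block-wide reductions from utils.cuh, then normalized and scaled. In formula form (with beta_i taken as 0 when beta is null):
\[
\mu = \frac{1}{n}\sum_{i=1}^{n} x_i,\qquad
\sigma^{2} = \frac{1}{n}\sum_{i=1}^{n}\bigl(x_i-\mu\bigr)^{2},\qquad
y_i = \gamma_i\,\frac{x_i-\mu}{\sqrt{\sigma^{2}+\epsilon}} + \beta_i .
\]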

View File

@ -0,0 +1,24 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_NORMALIZE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_NORMALIZE_H_
#include <cuda_runtime.h>
template <typename T>
void Normalize(const T *input, const T *gamma, const T *beta, T *output, size_t dim_at_axis, float epsilon,
int element_cnt, cudaStream_t stream);
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_NORMALIZE_H_

View File

@ -0,0 +1,41 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#include <curand_kernel.h>
#define FINAL_MASK 0xffffffff
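// Warp-level butterfly reduction: each __shfl_xor_sync step exchanges partial sums between lanes whose indices
// differ by mask, so after five halvings every lane of the warp holds the warp-wide sum.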
template <typename T>
__device__ T warpedReduceSum(T val) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
}
return val;
}
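// Block-level sum: reduce within each warp, stage one partial result per warp in shared memory, then have the
// first warp reduce those partials; the lanes of warp 0 end up holding the block-wide sum (thread 0 consumes it).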
template <typename T>
__device__ T blockReduceSum(T val) {
static __shared__ T shared[32];
int warped = threadIdx.x & 0x1f;
val = warpedReduceSum<T>(val);
if (warped == 0) shared[threadIdx.x >> 5] = val;
__syncthreads();
val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[warped] : static_cast<T>(0.0);
val = warpedReduceSum<T>(val);
return val;
}

View File

@ -0,0 +1,23 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h"
namespace mindspore::lite {
int GetGPUGroupSize() { return 1; }
int GetRankID() { return 0; }
} // namespace mindspore::lite

View File

@ -0,0 +1,31 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_
#include <string>
#include "src/common/log_adapter.h"
#include "include/errorcode.h"
namespace mindspore::lite {
constexpr char NCCL_WORLD_GROUP[] = "nccl_world_group";
int GetGPUGroupSize();
int GetRankID();
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_

View File

@ -0,0 +1,28 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h"
#include <unistd.h>
#include <thread>
#include <string>
#include "plugin/device/gpu/hal/device/distribution/collective_wrapper.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
int GetGPUGroupSize() { return GetGroupSize(NCCL_WORLD_GROUP); }
int GetRankID() { return GetRankIDByGroup(NCCL_WORLD_GROUP); }
} // namespace mindspore::lite

View File

@ -0,0 +1,38 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h"
namespace mindspore::lite {
DistributionCollective::DistributionCollective() {}
DistributionCollective &DistributionCollective::instance() {
static DistributionCollective instance;
return instance;
}
int DistributionCollective::ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count,
nvinfer1::DataType data_type, schema::ReduceMode reduce_type,
cudaStream_t stream, const std::string &group) {
return RET_OK;
}
int DistributionCollective::AllGatherWrapper(const void *input_addr, void *output_addr, size_t count,
nvinfer1::DataType data_type, cudaStream_t stream,
const std::string &group_name) {
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -0,0 +1,45 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_
#include <string>
#include "NvInfer.h"
#include "schema/ops_types_generated.h"
#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h"
namespace mindspore::lite {
class DistributionCollective {
public:
DistributionCollective(DistributionCollective const &) = delete;
DistributionCollective &operator=(const DistributionCollective &) = delete;
static DistributionCollective &instance();
int ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count, nvinfer1::DataType data_type,
schema::ReduceMode reduce_type, cudaStream_t stream, const std::string &group);
int AllGatherWrapper(const void *input_addr, void *output_addr, size_t count, nvinfer1::DataType data_type,
cudaStream_t stream, const std::string &group_name);
private:
DistributionCollective();
~DistributionCollective() = default;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_

View File

@ -0,0 +1,72 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h"
#include <unistd.h>
#include <thread>
#include <string>
#include "plugin/device/gpu/hal/device/distribution/collective_wrapper.h"
#include "src/runtime/delegate/tensorrt/distribution/distribution_utils.h"
#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h"
namespace mindspore::lite {
DistributionCollective::DistributionCollective() {
InitMPI();
InitNCCLComm();
}
DistributionCollective &DistributionCollective::instance() {
static DistributionCollective instance;
return instance;
}
int DistributionCollective::ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count,
nvinfer1::DataType data_type, schema::ReduceMode reduce_type,
cudaStream_t stream, const std::string &group) {
int rank_id = GetRankID();
MS_LOG(DEBUG) << "ReduceScatter on rank: " << rank_id;
ncclResult_t ret = ReduceScatter(input_addr, output_addr, count, ConvertNCCLDataType(data_type),
ConvertNCCLReduceMode(reduce_type), stream, group);
if (ret != ncclSuccess) {
MS_LOG(ERROR) << "ReduceScatter failed: " << static_cast<int>(ret);
return RET_ERROR;
}
auto cuda_ret = cudaStreamSynchronize(stream);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "cudaStreamSynchronize failed: " << static_cast<int>(cuda_ret);
return RET_ERROR;
}
return RET_OK;
}
int DistributionCollective::AllGatherWrapper(const void *input_addr, void *output_addr, size_t count,
nvinfer1::DataType data_type, cudaStream_t stream,
const std::string &group_name) {
int rank_id = GetRankID();
MS_LOG(DEBUG) << "AllGather on rank: " << rank_id;
ncclResult_t ret = AllGather(input_addr, output_addr, count, ConvertNCCLDataType(data_type), stream, group_name);
if (ret != ncclSuccess) {
MS_LOG(ERROR) << "AllGather failed: " << static_cast<int>(ret);
return RET_ERROR;
}
auto cuda_ret = cudaStreamSynchronize(stream);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "cudaStreamSynchronize failed: " << static_cast<int>(cuda_ret);
return RET_ERROR;
}
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -0,0 +1,58 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/distribution/distribution_utils.h"
#include <unordered_map>
#include "src/common/log_adapter.h"
namespace mindspore::lite {
ncclDataType_t ConvertNCCLDataType(nvinfer1::DataType type_id) {
std::unordered_map<nvinfer1::DataType, ncclDataType_t> data_type_map = {
{nvinfer1::DataType::kINT8, ncclInt8},
{nvinfer1::DataType::kINT32, ncclInt32},
{nvinfer1::DataType::kFLOAT, ncclFloat32},
{nvinfer1::DataType::kHALF, ncclHalf},
};
auto iter = data_type_map.find(type_id);
ncclDataType_t data_type;
if (iter != data_type_map.end()) {
data_type = iter->second;
} else {
data_type = ncclFloat32;
MS_LOG(WARNING) << "invalid data_type for NCCL, need check: " << static_cast<int>(type_id);
}
return data_type;
}
ncclRedOp_t ConvertNCCLReduceMode(schema::ReduceMode mode) {
std::unordered_map<schema::ReduceMode, ncclRedOp_t> reduce_ops_ = {
// {schema::ReduceMode::ReduceMode_ReduceMean, ncclAvg} requires a newer NCCL version, so mean is not registered here.
{schema::ReduceMode::ReduceMode_ReduceMax, ncclMax},
{schema::ReduceMode::ReduceMode_ReduceMin, ncclMin},
{schema::ReduceMode::ReduceMode_ReduceProd, ncclProd},
{schema::ReduceMode::ReduceMode_ReduceSum, ncclSum},
};
auto iter = reduce_ops_.find(mode);
ncclRedOp_t nccl_mode;
if (iter != reduce_ops_.end()) {
nccl_mode = iter->second;
} else {
nccl_mode = ncclSum;
MS_LOG(WARNING) << "invalid reduce for NCCL, need check: " << static_cast<int>(mode);
}
return nccl_mode;
}
} // namespace mindspore::lite

View File

@ -0,0 +1,32 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_
#include <nccl.h>
#include "include/errorcode.h"
#include "NvInfer.h"
#include "schema/ops_types_generated.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
namespace mindspore::lite {
ncclDataType_t ConvertNCCLDataType(nvinfer1::DataType type_id);
ncclRedOp_t ConvertNCCLReduceMode(schema::ReduceMode mode);
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_

View File

@ -0,0 +1,116 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#include <numeric>
#include <memory>
#include <vector>
#include <functional>
#include <unordered_map>
#include <algorithm>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "NvInferRuntimeCommon.h"
#include "src/runtime/delegate/tensorrt/op/activation_opt_plugin.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/activation.cuh"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/swish_impl.cuh"
namespace mindspore::lite {
REGISTER_TENSORRT_PLUGIN(ActivationOptPluginCreater);
template class TensorRTPluginCreater<ActivationOptPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int ActivationOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) noexcept {
return RunCudaActivation(inputDesc, inputs, outputs, stream);
}
bool ActivationOptPlugin::needResize(const int *current_dims, const int *last_dims) {
for (int i = 0; i < infer_dims_cnt_; i++) {
if (current_dims[i] != last_dims[i]) {
return true;
}
}
return false;
}
int ActivationOptPlugin::RunCuDNNActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs,
void *const *outputs, cudaStream_t stream) {
if (needResize(infer_dims_, inputDesc[0].dims.d)) {
if (input_desc_ != nullptr) {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_));
input_desc_ = nullptr;
}
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc_));
for (int i = 0; i < inputDesc[0].dims.nbDims; i++) {
infer_dims_[i] = inputDesc[0].dims.d[i];
}
CUDNN_CHECK(cudnnSetTensorNdDescriptor(input_desc_, ConvertCudnnDataType(inputDesc[0].type), infer_dims_cnt_,
infer_dims_, infer_stride_));
}
CHECK_NULL_RETURN(cudnn_handle_);
CHECK_NULL_RETURN(activation_desc_);
CHECK_NULL_RETURN(input_desc_);
CUDNN_CHECK(cudnnSetStream(cudnn_handle_, stream));
auto ret = CudnnActivation(cudnn_handle_, activation_desc_, input_desc_, inputs[0], input_desc_, outputs[0]);
if (ret != RET_OK) {
MS_LOG(ERROR) << "cudnn activation func call failed " << layer_name_;
return ret;
}
return RET_OK;
}
int ActivationOptPlugin::RunCudaActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs,
void *const *outputs, cudaStream_t stream) {
switch (activation_type_) {
case (schema::ActivationType::ActivationType_SIGMOID): {
Sigmoid(static_cast<const float *>(inputs[0]), static_cast<float *>(outputs[0]), GetDimsVolume(inputDesc[0].dims),
stream);
break;
}
case (schema::ActivationType::ActivationType_GELU): {
Gelu(static_cast<const float *>(inputs[0]), static_cast<float *>(outputs[0]), GetDimsVolume(inputDesc[0].dims),
stream);
break;
}
case (schema::ActivationType::ActivationType_SWISH): {
CalSwish(GetDimsVolume(inputDesc[0].dims), static_cast<const float *>(inputs[0]),
static_cast<float *>(outputs[0]), stream, device_id_);
break;
}
default: {
MS_LOG(ERROR) << "invalid activation type: " << static_cast<int>(activation_type_);
return RET_ERROR;
}
}
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *ActivationOptPlugin::clone() const noexcept {
auto *plugin = new ActivationOptPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
size_t ActivationOptPlugin::getSerializationSize() const noexcept { return sizeof(schema::ActivationType); }
void ActivationOptPlugin::serialize(void *buffer) const noexcept {
SerializeValue(&buffer, &activation_type_, sizeof(schema::ActivationType));
}
} // namespace mindspore::lite

View File

@ -0,0 +1,72 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_
#include <string>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h"
namespace mindspore::lite {
constexpr const char *ACTIVATION_OPT_PLUGIN_NAME{"ActivationOptPlugin"};
class ActivationOptPlugin : public TensorRTPlugin {
public:
ActivationOptPlugin(const std::string name, schema::ActivationType activation_type, uint32_t device_id)
: TensorRTPlugin(name, std::string(ACTIVATION_OPT_PLUGIN_NAME), device_id), activation_type_(activation_type) {}
ActivationOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(ACTIVATION_OPT_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
activation_type_ = static_cast<const schema::ActivationType *>(fields[0].data)[0];
}
ActivationOptPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(ACTIVATION_OPT_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &activation_type_, sizeof(schema::ActivationType));
}
ActivationOptPlugin() = delete;
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void *buffer) const noexcept override;
private:
bool needResize(const int *current_dims, const int *last_dims);
int RunCudaActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs,
cudaStream_t stream);
int RunCuDNNActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs,
cudaStream_t stream);
const std::string layer_name_;
std::string name_space_;
schema::ActivationType activation_type_;
cudnnHandle_t cudnn_handle_{nullptr};
cudnnActivationDescriptor_t activation_desc_{nullptr};
cudnnTensorDescriptor_t input_desc_{nullptr};
int infer_dims_[5]{1, 1, 1, 1, 1};
int infer_stride_[5]{1, 1, 1, 1, 1};
int infer_dims_cnt_{0};
};
class ActivationOptPluginCreater : public TensorRTPluginCreater<ActivationOptPlugin> {
public:
ActivationOptPluginCreater() : TensorRTPluginCreater(std::string(ACTIVATION_OPT_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_

View File

@ -0,0 +1,153 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
#include <cfloat>
#include <memory>
#include <unordered_set>
#include "src/runtime/delegate/tensorrt/op/cast_tensorrt.h"
#include "src/runtime/delegate/tensorrt/op/activation_opt_plugin.h"
namespace mindspore::lite {
namespace {
bool HasCustomActivationPlugin(schema::ActivationType type) {
std::unordered_set<schema::ActivationType> plugin_activation = {schema::ActivationType::ActivationType_SIGMOID,
schema::ActivationType::ActivationType_GELU,
schema::ActivationType::ActivationType_SWISH};
return plugin_activation.find(type) != plugin_activation.end();
}
} // namespace
int ActivationTensorRT::IsSupport(const schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
auto activation_op = this->op_primitive_->value_as_Activation();
if (activation_op == nullptr) {
MS_LOG(ERROR) << "op convert failed";
return RET_ERROR;
}
auto activation_params_opt = TryConvertActivationType(activation_op->activation_type());
bool has_custom_plugin = HasCustomActivationPlugin(activation_op->activation_type());
if (!activation_params_opt && !has_custom_plugin) {
MS_LOG(ERROR) << "Unsupported op action type for TensorRT: " << activation_op->activation_type();
return RET_ERROR;
}
return RET_OK;
}
int ActivationTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx->network() == nullptr) {
MS_LOG(ERROR) << "network is invalid";
return RET_ERROR;
}
auto activation_op = this->op_primitive_->value_as_Activation();
if (activation_op == nullptr) {
MS_LOG(ERROR) << "op convert failed";
return RET_ERROR;
}
float alpha = activation_op->alpha();
nvinfer1::ITensor *activation_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getType() == nvinfer1::DataType::kINT32) {
activation_input =
TRTTensorCast(ctx, tensorrt_in_tensors_[0].trt_tensor_, nvinfer1::DataType::kFLOAT, op_name_ + "_cast_in");
}
auto activation_layer =
ActivationTensorRT::AddActivation(ctx, activation_op->activation_type(), alpha,
std::isfinite(activation_op->min_val()) ? activation_op->min_val() : FLT_MIN,
std::isfinite(activation_op->max_val()) ? activation_op->max_val() : FLT_MAX,
activation_input, device_id_, quant_type_);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "add activation op failed for TensorRT.";
return RET_ERROR;
}
activation_layer->setName(op_name_.c_str());
// cast to origin type
nvinfer1::ITensor *out_tensor = activation_layer->getOutput(0);
if (out_tensor->getType() != ConvertDataType(out_tensors_[0].DataType())) {
out_tensor = TRTTensorCast(ctx, activation_layer->getOutput(0), ConvertDataType(out_tensors_[0].DataType()),
op_name_ + "_cast_out");
}
out_tensor->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(
ITensorHelper{out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
this->layer_ = activation_layer;
return RET_OK;
}
nvinfer1::ILayer *ActivationTensorRT::AddActivation(TensorRTContext *ctx, schema::ActivationType activation_type,
float alpha, float min_value, float max_value,
nvinfer1::ITensor *trt_in_tensor, uint32_t device_id,
schema::QuantType quant_type) {
bool has_custom_plugin = HasCustomActivationPlugin(activation_type);
// TensorRT's native sigmoid has precision issues, so use the custom activation plugin for supported types.
if (quant_type == schema::QuantType_QUANT_NONE && has_custom_plugin) {
std::string layer_name = std::string(trt_in_tensor->getName()) + "_activation";
auto plugin = std::make_shared<ActivationOptPlugin>(layer_name.c_str(), activation_type, device_id);
MS_LOG(INFO) << "using opt plugin for " << layer_name;
if (plugin == nullptr) {
MS_LOG(ERROR) << "create ActivationOptPlugin failed for " << layer_name;
return nullptr;
}
nvinfer1::ITensor *inputTensors[] = {trt_in_tensor};
nvinfer1::IPluginV2Layer *activation_opt_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin);
activation_opt_layer->setName(layer_name.c_str());
return activation_opt_layer;
}
// Fall back to TensorRT's built-in activation layers; activation types without a known mapping are rejected.
auto action_param_opt = TryConvertActivationType(activation_type);
if (!action_param_opt) {
MS_LOG(ERROR) << "Unsupported op action type for TensorRT: " << activation_type;
return nullptr;
}
auto action_param = action_param_opt.value();
nvinfer1::IActivationLayer *activation_layer =
ctx->network()->addActivation(*trt_in_tensor, action_param.activation_type);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "add activation op failed for TensorRT.";
return nullptr;
}
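  // For HARD_TANH the output is clamped to [min_value, max_value]; the layer's alpha/beta carry the lower/upper bounds.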
if (activation_type == schema::ActivationType_HARD_TANH) {
activation_layer->setAlpha(min_value);
activation_layer->setBeta(max_value);
return activation_layer;
}
if (action_param.has_alpha) {
activation_layer->setAlpha(alpha);
}
if (action_param.has_beta) {
activation_layer->setBeta(action_param.beta);
}
return activation_layer;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Activation, ActivationTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class ActivationTensorRT : public TensorRTOp {
public:
ActivationTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~ActivationTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
static nvinfer1::ILayer *AddActivation(TensorRTContext *ctx, schema::ActivationType activation_type, float alpha,
float min_value, float max_value, nvinfer1::ITensor *trt_in_tensor,
uint32_t device_id = 0,
schema::QuantType quant_type = schema::QuantType_QUANT_NONE);
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_

View File

@ -0,0 +1,113 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/allgather_tensorrt.h"
#include <numeric>
#include "NvInferRuntimeCommon.h"
namespace mindspore::lite {
REGISTER_TENSORRT_PLUGIN(AllGatherPluginCreater);
template class TensorRTPluginCreater<AllGatherPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int AllGatherTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
#ifndef LITE_CUDA_DISTRIBUTION
MS_LOG(ERROR)
<< "Unsupported package for gpu distribution feature, please recompile with MS_ENABLE_CUDA_DISTRIBUTION set to on.";
return RET_ERROR;
#else
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != 1) {
MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size();
return RET_ERROR;
}
dynamic_shape_params_.support_hw_dynamic_ = false;
return RET_OK;
#endif
}
int AllGatherTensorRT::AddInnerOp(TensorRTContext *ctx) {
nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_};
auto allgather_op = op_primitive_->value_as_AllGather();
if (allgather_op == nullptr) {
MS_LOG(ERROR) << "convert failed for " << op_name_;
return RET_ERROR;
}
int rank = GetGPUGroupSize();
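  // rank here is the communication group size; every rank contributes one slice that is gathered along dim 0.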
auto plugin = std::make_shared<AllGatherPlugin>(op_name_, rank, device_id_);
MS_LOG(INFO) << op_name_ << " group size: " << rank << ", rank id: " << GetRankID();
nvinfer1::IPluginV2Layer *allgather_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin);
if (allgather_layer == nullptr) {
MS_LOG(ERROR) << "create AllGather layer failed for: " << op_name_;
return RET_ERROR;
}
nvinfer1::ITensor *allgather_out = allgather_layer->getOutput(0);
allgather_layer->setName(op_name_.c_str());
allgather_out->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(
ITensorHelper{allgather_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
this->layer_ = allgather_layer;
return RET_OK;
}
// AllGatherPlugin
int AllGatherPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace,
cudaStream_t stream) noexcept {
MS_LOG(INFO) << "all gather run at rank id: " << GetRankID() << " stream: " << stream;
nvinfer1::Dims input_dims = inputDesc[0].dims;
int send_element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies<int64_t>());
const void *input = inputs[0];
void *output = outputs[0];
auto ret = DistributionCollective::instance().AllGatherWrapper(input, output, send_element_cnt, inputDesc->type,
stream, NCCL_WORLD_GROUP);
if (ret != RET_OK) {
MS_LOG(ERROR) << "AllGather nccl run failed for " << layer_name_;
return ret;
}
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *AllGatherPlugin::clone() const noexcept {
auto *plugin = new AllGatherPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
nvinfer1::DimsExprs AllGatherPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept {
nvinfer1::DimsExprs out_dims{};
out_dims.nbDims = inputs->nbDims;
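  // AllGather concatenates the per-rank tensors along dim 0, so the output's first dim is the input dim scaled by the group size.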
auto rank_dim = exprBuilder.constant(rank_);
out_dims.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs->d[0], *rank_dim);
for (int i = 1; i < inputs->nbDims; i++) {
out_dims.d[i] = inputs->d[i];
}
return out_dims;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AllGather, AllGatherTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,75 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_
#include <string>
#include <vector>
#include <memory>
#include <functional>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h"
namespace mindspore::lite {
constexpr const char *ALLGATHER_PLUGIN_NAME{"AllGatherPlugin"};
class AllGatherTensorRT : public TensorRTOp {
public:
AllGatherTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~AllGatherTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
};
class AllGatherPlugin : public TensorRTPlugin {
public:
AllGatherPlugin(const std::string name, int rank, uint32_t device_id)
: TensorRTPlugin(name, std::string(ALLGATHER_PLUGIN_NAME), device_id), rank_(rank) {}
AllGatherPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(ALLGATHER_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
rank_ = static_cast<const int *>(fields[0].data)[0];
}
AllGatherPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(ALLGATHER_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &rank_, sizeof(int));
}
AllGatherPlugin() = delete;
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
private:
int rank_{0};
};
class AllGatherPluginCreater : public TensorRTPluginCreater<AllGatherPlugin> {
public:
AllGatherPluginCreater() : TensorRTPluginCreater(std::string(ALLGATHER_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_

View File

@ -0,0 +1,83 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/cast_plugin.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh"
#include <cuda_runtime.h>
#include <numeric>
#include <memory>
#include <functional>
namespace mindspore::lite {
REGISTER_TENSORRT_PLUGIN(CastPluginCreater);
template class TensorRTPluginCreater<CastPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int CastPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace,
cudaStream_t stream) noexcept {
nvinfer1::Dims input_dims = inputDesc[0].dims;
int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies<int64_t>());
if (inputDesc->type == outputDesc->type) {
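    // Source and destination types match: a plain device-to-device copy is enough, no cast kernel is needed.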
int element_size = (outputDesc->type == nvinfer1::DataType::kFLOAT)
? sizeof(float)
: ((outputDesc->type == nvinfer1::DataType::kINT32) ? sizeof(int) : 0);
auto cuda_ret = cudaMemcpy(outputs[0], inputs[0], element_cnt * element_size, cudaMemcpyDeviceToDevice);
if (cuda_ret != cudaSuccess) {
MS_LOG(ERROR) << "copy mem failed for " << layer_name_;
return RET_ERROR;
}
return RET_OK;
}
if (inputDesc->type == nvinfer1::DataType::kINT32 && dest_datatype_ == nvinfer1::DataType::kFLOAT) {
auto input = static_cast<const int *>(inputs[0]);
auto output = static_cast<float *>(outputs[0]);
Cast(element_cnt, input, output, stream);
} else if (inputDesc->type == nvinfer1::DataType::kFLOAT && dest_datatype_ == nvinfer1::DataType::kINT32) {
auto input = static_cast<const float *>(inputs[0]);
auto output = static_cast<int *>(outputs[0]);
Cast(element_cnt, input, output, stream);
  } else {
    MS_LOG(ERROR) << "unsupported data type cast " << layer_name_;
    return RET_ERROR;
  }
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *CastPlugin::clone() const noexcept {
auto *plugin = new CastPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
nvinfer1::DataType CastPlugin::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const
noexcept {
return dest_datatype_;
}
size_t CastPlugin::getSerializationSize() const noexcept {
// origin_datatype_ and dest_datatype_
return sizeof(nvinfer1::DataType) * 2;
}
void CastPlugin::serialize(void *buffer) const noexcept {
SerializeValue(&buffer, &origin_datatype_, sizeof(nvinfer1::DataType));
SerializeValue(&buffer, &dest_datatype_, sizeof(nvinfer1::DataType));
}
} // namespace mindspore::lite

View File

@ -0,0 +1,67 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
namespace mindspore::lite {
constexpr const char *CAST_PLUGIN_NAME{"CastPluginCreater"};
class CastPlugin : public TensorRTPlugin {
public:
CastPlugin(const std::string name, nvinfer1::DataType origin_datatype, nvinfer1::DataType dest_datatype,
uint32_t device_id = 0)
: TensorRTPlugin(name, std::string(CAST_PLUGIN_NAME), device_id),
origin_datatype_(origin_datatype),
dest_datatype_(dest_datatype) {}
CastPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(CAST_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
origin_datatype_ = static_cast<const nvinfer1::DataType *>(fields[0].data)[0];
dest_datatype_ = static_cast<const nvinfer1::DataType *>(fields[1].data)[0];
}
CastPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(CAST_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &origin_datatype_, sizeof(nvinfer1::DataType));
DeserializeValue(&serialData, &serialLength, &dest_datatype_, sizeof(nvinfer1::DataType));
}
CastPlugin() = delete;
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const
noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void *buffer) const noexcept override;
private:
nvinfer1::DataType origin_datatype_;
nvinfer1::DataType dest_datatype_;
};
class CastPluginCreater : public TensorRTPluginCreater<CastPlugin> {
public:
CastPluginCreater() : TensorRTPluginCreater(std::string(CAST_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_

View File

@ -0,0 +1,79 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/cast_tensorrt.h"
#include "src/runtime/delegate/tensorrt/op/cast_plugin.h"
#include <cuda_runtime.h>
#include <numeric>
#include <memory>
#include <functional>
namespace mindspore::lite {
int CastTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2) {
MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size();
return RET_ERROR;
}
return RET_OK;
}
int CastTensorRT::AddInnerOp(TensorRTContext *ctx) {
  // the second input tensor holds the destination data type
auto type_tensor = in_tensors_[1];
if (type_tensor.Data() == nullptr) {
MS_LOG(ERROR) << "unknown cast type of " << op_name_;
return RET_ERROR;
}
auto type_data = static_cast<const int *>(type_tensor.Data().get());
DataType data_type = static_cast<DataType>(type_data[0]);
  MS_LOG(DEBUG) << op_name_ << " cast to data type (43 is float): " << type_data[0];
nvinfer1::DataType dest_datatype = ConvertDataType(data_type);
auto trt_tensor = tensorrt_in_tensors_[0].trt_tensor_;
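  // TensorRT 7.2+ can cast through an identity layer with setOutputType; older versions fall back to the custom CastPlugin.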
#if TRT_VERSION_GE(7, 2)
dest_datatype = (dest_datatype == nvinfer1::DataType::kBOOL ? nvinfer1::DataType::kINT32 : dest_datatype);
auto cast_layer = ctx->network()->addIdentity(*trt_tensor);
#else
auto plugin = std::make_shared<CastPlugin>(op_name_, trt_tensor->getType(), dest_datatype);
nvinfer1::ITensor *inputTensors[] = {trt_tensor};
nvinfer1::IPluginV2Layer *cast_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin);
#endif
if (cast_layer == nullptr) {
MS_LOG(ERROR) << "create cast layer failed for: " << op_name_;
return RET_ERROR;
}
#if TRT_VERSION_GE(7, 2)
cast_layer->setOutputType(0, dest_datatype);
#endif
cast_layer->setName(op_name_.c_str());
nvinfer1::ITensor *cast_out = cast_layer->getOutput(0);
cast_out->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(
ITensorHelper{cast_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
this->layer_ = cast_layer;
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Cast, CastTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh"
namespace mindspore::lite {
class CastTensorRT : public TensorRTOp {
public:
CastTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~CastTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
// CastTensorRT
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_

View File

@ -0,0 +1,158 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/concate_tensorrt.h"
#include <experimental/optional>
#include <algorithm>
namespace mindspore::lite {
int ConcateTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (type_ != schema::PrimitiveType_Stack && type_ != schema::PrimitiveType_Concat) {
MS_LOG(ERROR) << "Unsupported op :" << op_name_ << " , type: " << type_;
return RET_ERROR;
}
  if (in_tensors.size() == 0 || (in_tensors.size() < INPUT_SIZE2 && type_ != schema::PrimitiveType_Stack)) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
int input_nbDims = in_tensors_[0].Shape().size();
if (axis_ == -1) {
axis_ = input_nbDims - 1;
}
  if (axis_ < 0 || axis_ > input_nbDims || (axis_ == input_nbDims && type_ != schema::PrimitiveType_Stack)) {
    MS_LOG(ERROR) << "concate_op invalid axis: " << axis_ << " , input dims : " << input_nbDims;
return RET_ERROR;
}
return RET_OK;
}
int ConcateTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
if (tensorrt_in_tensors_.size() != in_tensors_.size()) {
MS_LOG(ERROR) << "concate_op in tensor is invalid, trt tensor has " << tensorrt_in_tensors_.size()
<< ", but origin ms tensor has " << in_tensors_.size();
return RET_ERROR;
}
nvinfer1::ITensor *trt_input_tensors[tensorrt_in_tensors_.size()];
int ret = PreProcessInputs(ctx, trt_input_tensors);
if (ret != RET_OK) {
MS_LOG(ERROR) << "PreProcessInputs failed for " << op_name_;
return ret;
}
if (!same_format_) {
if (trt_input_tensors[0]->getDimensions().nbDims == DIMENSION_4D && out_format_ == Format::NCHW) {
// when inputs all NCHW, change axis
axis_ = ConvertAxisFromNHWC2NCHW(axis_);
MS_LOG(DEBUG) << "concate axis change to " << axis_ << " when using NCHW format.";
} else {
      MS_LOG(WARNING) << "input tensor formats are inconsistent; failed to convert concat axis for " << op_name_;
}
}
if (type_ == schema::PrimitiveType_Stack) {
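    // Stack is realized as an unsqueeze (insert a new axis at axis_) on every input followed by a concat along that axis.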
for (size_t i = 0; i != tensorrt_in_tensors_.size(); ++i) {
auto shuffle_layer = ctx->network()->addShuffle(*trt_input_tensors[i]);
if (shuffle_layer == nullptr) {
MS_LOG(ERROR) << "addShuffle failed for TensorRT.";
return RET_ERROR;
}
auto shuffer_dims_opt = UnsqueezeDims(trt_input_tensors[i]->getDimensions(), axis_, 1);
if (!shuffer_dims_opt) {
MS_LOG(ERROR) << "UnsqueezeDims failed.";
return RET_ERROR;
}
shuffle_layer->setReshapeDimensions(shuffer_dims_opt.value());
trt_input_tensors[i] = shuffle_layer->getOutput(0);
}
}
nvinfer1::IConcatenationLayer *concate_layer =
ctx->network()->addConcatenation(trt_input_tensors, static_cast<int>(tensorrt_in_tensors_.size()));
if (concate_layer == nullptr) {
MS_LOG(ERROR) << "addConcatenation failed for TensorRT.";
return RET_ERROR;
}
if (axis_ != RET_INVALID_OP_ATTR) {
concate_layer->setAxis(axis_);
}
concate_layer->setName(op_name_.c_str());
auto concat_output = concate_layer->getOutput(0);
concat_output->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{concat_output, out_format_, same_format_});
this->layer_ = concate_layer;
return RET_OK;
}
int ConcateTensorRT::PreProcessInputs(TensorRTContext *ctx, nvinfer1::ITensor *trt_input_tensors[]) {
int input_nbDims = tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims;
out_format_ = tensorrt_in_tensors_[0].format_;
same_format_ = tensorrt_in_tensors_[0].same_format_;
for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) {
if (tensorrt_in_tensors_[i].trt_tensor_->getDimensions().nbDims != input_nbDims) {
      MS_LOG(ERROR) << "dims of inputs are invalid for " << op_name_;
return RET_ERROR;
}
    // keep the origin format if all inputs share it; otherwise fall back to NHWC
if (input_nbDims == DIMENSION_4D && tensorrt_in_tensors_[i].format_ != out_format_) {
out_format_ = Format::NHWC;
}
}
// make sure all inputs are same format
if (input_nbDims == DIMENSION_4D) {
for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) {
if (tensorrt_in_tensors_[i].format_ == out_format_) {
trt_input_tensors[i] = tensorrt_in_tensors_[i].trt_tensor_;
MS_LOG(DEBUG) << "concate input " << GetTensorFormat(tensorrt_in_tensors_[i]);
} else {
nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *tensorrt_in_tensors_[i].trt_tensor_);
if (transpose_layer == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
trt_input_tensors[i] = transpose_layer->getOutput(0);
this->transpose_layer_ = transpose_layer;
same_format_ = true;
MS_LOG(DEBUG) << "concate input " << GetTensorFormat(trt_input_tensors[i], Format::NHWC, true);
}
}
} else {
for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) {
trt_input_tensors[i] = tensorrt_in_tensors_[i].trt_tensor_;
MS_LOG(DEBUG) << "concate input " << GetTensorFormat(tensorrt_in_tensors_[i]);
}
}
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Concat, ConcateTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Stack, ConcateTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,50 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class ConcateTensorRT : public TensorRTOp {
public:
ConcateTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {
type_ = primitive->value_type();
axis_ = (type_ == schema::PrimitiveType_Concat ? primitive->value_as_Concat()->axis()
: primitive->value_as_Stack()->axis());
}
~ConcateTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
int PreProcessInputs(TensorRTContext *ctx, nvinfer1::ITensor *trt_input_tensors[]);
Format out_format_{Format::NHWC};
bool same_format_{true};
schema::PrimitiveType type_;
int axis_;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_

View File

@ -0,0 +1,187 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/convolution_tensorrt.h"
#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
namespace mindspore::lite {
constexpr int BIAS_INDEX = 2;
int ConvolutionTensorRT::IsSupport(const schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) {
MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format();
return RET_ERROR;
}
return RET_OK;
}
int ConvolutionTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
const schema::Conv2DFusion *conv_op = this->op_primitive_->value_as_Conv2DFusion();
if (conv_op == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
nvinfer1::ITensor *conv_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
this->transpose_layer_ = transpose_layer_in;
conv_input = transpose_layer_in->getOutput(0);
}
// transpose weight
const mindspore::MSTensor &weight_tensor = in_tensors_[1];
nvinfer1::Weights kernelWeights = lite::TransposeWeight4D(weight_tensor, &pack_weight_);
// conv
int nbOutputMaps = weight_tensor.Shape()[0];
if (nbOutputMaps <= 0) {
MS_LOG(ERROR) << "out_channel is invalid";
return RET_ERROR;
}
auto kernel_size = conv_op->kernel_size();
if (kernel_size == nullptr) {
MS_LOG(ERROR) << "kernel_size is null";
return RET_ERROR;
}
nvinfer1::Dims kernelSize = lite::ConvertCudaDims(std::vector<int64_t>(kernel_size->begin(), kernel_size->end()));
if (kernelSize.nbDims == -1) {
MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_;
return RET_ERROR;
}
// bias
nvinfer1::Weights biasWeights{};
if (in_tensors_.size() >= INPUT_SIZE3) {
biasWeights = lite::ConvertWeight(in_tensors_[BIAS_INDEX]);
} else {
biasWeights.type = ConvertDataType(weight_tensor.DataType());
biasWeights.count = 0;
biasWeights.values = nullptr;
}
nvinfer1::IConvolutionLayer *conv_layer =
ctx->network()->addConvolutionNd(*conv_input, nbOutputMaps, kernelSize, kernelWeights, biasWeights);
if (conv_layer == nullptr) {
MS_LOG(ERROR) << "ConvolutionLayer failed";
return RET_ERROR;
}
conv_layer->setName((op_name_ + "_conv").c_str());
this->layer_ = conv_layer;
// add params
SetAttributes(conv_op, conv_layer);
// add activation
nvinfer1::ILayer *activation_layer = nullptr;
if (conv_op->activation_type() == schema::ActivationType::ActivationType_NO_ACTIVATION) {
activation_layer = conv_layer;
} else {
activation_layer =
ActivationTensorRT::AddActivation(ctx, conv_op->activation_type(), 0, 0, 0, conv_layer->getOutput(0), device_id_);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "addActivation for conv failed";
return RET_ERROR;
}
activation_layer->setName((op_name_ + "_activation").c_str());
}
activation_layer->getOutput(0)->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW, false});
return RET_OK;
}
void ConvolutionTensorRT::SetAttributes(const schema::Conv2DFusion *conv_op, nvinfer1::IConvolutionLayer *conv_layer) {
auto stride = conv_op->stride();
if (stride != nullptr) {
auto stride_val = std::vector<int64_t>(stride->begin(), stride->end());
auto dims = ConvertCudaDims(stride_val);
if (dims.nbDims == -1) {
MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_;
return;
}
conv_layer->setStrideNd(dims);
}
auto dilation = conv_op->dilation();
if (dilation != nullptr) {
auto dilation_val = std::vector<int64_t>(dilation->begin(), dilation->end());
auto dims = ConvertCudaDims(dilation_val);
if (dims.nbDims == -1) {
MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_;
return;
}
conv_layer->setDilationNd(dims);
}
int nbGroups = conv_op->group();
if (nbGroups > 0) {
conv_layer->setNbGroups(nbGroups);
}
schema::PadMode pad_mode = conv_op->pad_mode();
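  // SAME padding maps to TensorRT's kSAME_UPPER mode; otherwise the explicit pad_list values are applied.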
if (pad_mode == schema::PadMode::PadMode_SAME) {
conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
} else {
auto padding = conv_op->pad_list();
if (padding != nullptr && padding->size() == DIMENSION_4D) {
auto padding_val = std::vector<int64_t>(padding->begin(), padding->end());
if (padding_val[0] != padding_val[1] || padding_val[DIMENSION_2D] != padding_val[DIMENSION_3D]) {
        MS_LOG(WARNING) << op_name_ << " has asymmetric padding values";
}
nvinfer1::Dims2 dims(padding_val[0], padding_val[DIMENSION_2D]);
conv_layer->setPaddingNd(dims);
} else if (padding == nullptr || padding->size() == 0) {
nvinfer1::Dims2 dims;
conv_layer->setPaddingNd(dims);
} else {
MS_LOG(WARNING) << "pad list is invalid for " << op_name_;
}
}
}
ConvolutionTensorRT::~ConvolutionTensorRT() {
if (pack_weight_ != nullptr) {
free(pack_weight_);
pack_weight_ = nullptr;
}
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Conv2DFusion, ConvolutionTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class ConvolutionTensorRT : public TensorRTOp {
public:
ConvolutionTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~ConvolutionTensorRT() override;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
void SetAttributes(const schema::Conv2DFusion *ms_op, nvinfer1::IConvolutionLayer *current_layer_);
void *pack_weight_{nullptr};
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_

View File

@ -0,0 +1,199 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h"
#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
#include "nnacl/pack.h"
namespace mindspore::lite {
int DeconvolutionTensorRT::IsSupport(const schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) {
MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format();
return RET_ERROR;
}
return RET_OK;
}
int DeconvolutionTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
const schema::Conv2dTransposeFusion *deconv_op = this->op_primitive_->value_as_Conv2dTransposeFusion();
if (deconv_op == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
nvinfer1::ITensor *deconv_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
this->transpose_layer_ = transpose_layer_in;
deconv_input = transpose_layer_in->getOutput(0);
}
// transpose weight
const mindspore::MSTensor &weight_tensor = in_tensors_[1];
nvinfer1::Weights kernelWeights = lite::TransposeWeight4D(weight_tensor, &pack_weight_);
// deconv basic params
int nbOutputMaps = weight_tensor.Shape()[0];
if (nbOutputMaps <= 0) {
MS_LOG(ERROR) << "out_channel is invalid";
return RET_ERROR;
}
auto kernel_size = deconv_op->kernel_size();
if (kernel_size == nullptr) {
MS_LOG(ERROR) << "kernel_size is null";
return RET_ERROR;
}
nvinfer1::Dims kernelSize = lite::ConvertCudaDims(std::vector<int64_t>(kernel_size->begin(), kernel_size->end()));
if (kernelSize.nbDims == -1) {
MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_;
return RET_ERROR;
}
// bias
nvinfer1::Weights biasWeights{};
if (in_tensors_.size() >= INPUT_SIZE3) {
biasWeights = lite::ConvertWeight(in_tensors_[INPUT_SIZE3 - 1]);
} else {
biasWeights.type = ConvertDataType(weight_tensor.DataType());
biasWeights.count = 0;
biasWeights.values = nullptr;
}
nvinfer1::IDeconvolutionLayer *deconv_layer =
ctx->network()->addDeconvolutionNd(*deconv_input, nbOutputMaps, kernelSize, kernelWeights, biasWeights);
if (deconv_layer == nullptr) {
MS_LOG(ERROR) << "DeconvolutionLayer failed";
return RET_ERROR;
}
deconv_layer->setName((op_name_ + "_deconv").c_str());
this->layer_ = deconv_layer;
// set extra params
SetAttributes(deconv_op, deconv_layer);
// add activation
nvinfer1::ILayer *activation_layer = nullptr;
if (deconv_op->activation_type() == schema::ActivationType::ActivationType_NO_ACTIVATION) {
activation_layer = deconv_layer;
} else {
activation_layer = ActivationTensorRT::AddActivation(ctx, deconv_op->activation_type(), 0, 0, 0,
deconv_layer->getOutput(0), device_id_);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "addActivation for conv failed";
return RET_ERROR;
}
activation_layer->setName((op_name_ + "_activation").c_str());
}
activation_layer->getOutput(0)->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW, false});
return RET_OK;
}
void DeconvolutionTensorRT::SetAttributes(const schema::Conv2dTransposeFusion *ms_op,
nvinfer1::IDeconvolutionLayer *decon_layer) {
// kernel_size
auto kernel_size = ms_op->kernel_size();
if (kernel_size != nullptr) {
auto kernel_size_val = std::vector<int64_t>(kernel_size->begin(), kernel_size->end());
nvinfer1::Dims kernel_size_dims = lite::ConvertCudaDims(kernel_size_val);
if (kernel_size_dims.nbDims == -1) {
MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_;
return;
}
decon_layer->setKernelSizeNd(kernel_size_dims);
}
// nbOutputMaps
int32_t nbOutputMaps = static_cast<int32_t>(ms_op->out_channel());
decon_layer->setNbOutputMaps(nbOutputMaps);
// stride
auto stride = ms_op->stride();
if (stride != nullptr) {
auto stride_val = std::vector<int64_t>(stride->begin(), stride->end());
nvinfer1::Dims stride_dims = lite::ConvertCudaDims(stride_val);
if (stride_dims.nbDims == -1) {
MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_;
return;
}
decon_layer->setStrideNd(stride_dims);
}
// nbGroups
int32_t nbGroups = static_cast<int32_t>(ms_op->group());
decon_layer->setNbGroups(nbGroups);
// padding
schema::PadMode pad_mode = ms_op->pad_mode();
if (pad_mode == schema::PadMode::PadMode_SAME) {
decon_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
} else {
auto padding = ms_op->pad_list();
auto out_pad = ms_op->output_paddings();
if (padding == nullptr || out_pad == nullptr) {
      MS_LOG(WARNING) << "no pad value for " << op_name_;
return;
}
auto padding_val = std::vector<int64_t>(padding->begin(), padding->end());
auto out_pad_val = std::vector<int64_t>(out_pad->begin(), out_pad->end()); // h, w
if (out_pad_val.size() != DIMENSION_2D || padding_val.size() != DIMENSION_4D) {
MS_LOG(ERROR) << "invalid size of pad " << op_name_;
return;
}
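    // TensorRT pre-padding covers the top/left side and post-padding the bottom/right side; output_paddings shrinks the post side.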
nvinfer1::Dims dims_pre{};
dims_pre.nbDims = DIMENSION_2D;
dims_pre.d[0] = padding_val[0]; // up
dims_pre.d[1] = padding_val[2]; // left
decon_layer->setPrePadding(dims_pre);
nvinfer1::Dims dims_post{};
dims_post.nbDims = DIMENSION_2D;
dims_post.d[0] = padding_val[1] - out_pad_val[0]; // down
dims_post.d[1] = padding_val[3] - out_pad_val[1]; // right
decon_layer->setPostPadding(dims_post);
}
}
DeconvolutionTensorRT::~DeconvolutionTensorRT() {
if (pack_weight_ != nullptr) {
free(pack_weight_);
pack_weight_ = nullptr;
}
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Conv2dTransposeFusion, DeconvolutionTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,43 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class DeconvolutionTensorRT : public TensorRTOp {
public:
DeconvolutionTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~DeconvolutionTensorRT() override;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
void SetAttributes(const schema::Conv2dTransposeFusion *ms_op, nvinfer1::IDeconvolutionLayer *decon_layer);
void *pack_weight_{nullptr};
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_

View File

@ -0,0 +1,312 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cmath>
#include <unordered_map>
#include <unordered_set>
#include "src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
namespace mindspore::lite {
namespace {
std::unordered_map<schema::PrimitiveType, nvinfer1::ElementWiseOperation> NOT_BOOL_PRIM2NV_ELEM_OP = {
#if TRT_VERSION_GE(7, 2)
{schema::PrimitiveType_Less, nvinfer1::ElementWiseOperation::kLESS},
{schema::PrimitiveType_Greater, nvinfer1::ElementWiseOperation::kGREATER},
#endif
{schema::PrimitiveType_AddFusion, nvinfer1::ElementWiseOperation::kSUM},
{schema::PrimitiveType_PowFusion, nvinfer1::ElementWiseOperation::kPOW},
{schema::PrimitiveType_DivFusion, nvinfer1::ElementWiseOperation::kDIV},
{schema::PrimitiveType_RealDiv, nvinfer1::ElementWiseOperation::kDIV},
{schema::PrimitiveType_FloorDiv, nvinfer1::ElementWiseOperation::kFLOOR_DIV},
{schema::PrimitiveType_SubFusion, nvinfer1::ElementWiseOperation::kSUB},
{schema::PrimitiveType_MulFusion, nvinfer1::ElementWiseOperation::kPROD},
{schema::PrimitiveType_Minimum, nvinfer1::ElementWiseOperation::kMIN},
{schema::PrimitiveType_Maximum, nvinfer1::ElementWiseOperation::kMAX},
{schema::PrimitiveType_BiasAdd, nvinfer1::ElementWiseOperation::kSUM},
#if TRT_VERSION_GE(7, 2)
{schema::PrimitiveType_Equal, nvinfer1::ElementWiseOperation::kEQUAL},
#endif
};
} // namespace
int ElementWiseTensorRT::IsSupport(const schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2) {
    MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
    MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size();
return RET_ERROR;
}
// if constant tensor is scalar, it needs to know another input tensor's shape to broadcast
if ((in_tensors[0].Shape().size() > 0 && in_tensors[0].Shape()[0] == -1 && in_tensors[1].Shape().size() == 0) ||
(in_tensors[1].Shape().size() > 0 && in_tensors[1].Shape()[0] == -1 && in_tensors[0].Shape().size() == 0)) {
    MS_LOG(ERROR) << "a scalar input cannot be broadcast against an unknown-shape input for: " << op_name_;
return RET_ERROR;
}
bool is_not_bool_arith = NOT_BOOL_PRIM2NV_ELEM_OP.find(type_) != NOT_BOOL_PRIM2NV_ELEM_OP.end();
if (is_not_bool_arith) {
if (std::any_of(in_tensors.begin(), in_tensors.end(),
[](const mindspore::MSTensor &tensor) { return tensor.DataType() == DataType::kNumberTypeBool; })) {
MS_LOG(ERROR) << "invalid input type for : " << op_name_;
return RET_ERROR;
}
element_wise_op_ = NOT_BOOL_PRIM2NV_ELEM_OP[type_];
}
if (!is_not_bool_arith) {
// PrimitiveType_Eltwise
auto eltwise_op = op_primitive_->value_as_Eltwise();
if (eltwise_op == nullptr) {
MS_LOG(ERROR) << "convert to Eltwise failed: " << op_name_;
return RET_ERROR;
}
schema::EltwiseMode eltwiseMode = eltwise_op->mode();
std::map<schema::EltwiseMode, nvinfer1::ElementWiseOperation> eltwise_modes = {
{schema::EltwiseMode::EltwiseMode_SUM, nvinfer1::ElementWiseOperation::kSUM},
{schema::EltwiseMode::EltwiseMode_PROD, nvinfer1::ElementWiseOperation::kPROD},
{schema::EltwiseMode::EltwiseMode_MAXIMUM, nvinfer1::ElementWiseOperation::kMAX},
};
auto iter_mode = eltwise_modes.find(eltwiseMode);
if (iter_mode != eltwise_modes.end()) {
element_wise_op_ = iter_mode->second;
} else {
      MS_LOG(ERROR) << "unsupported eltwise mode for ElementWise op " << op_name_;
return RET_ERROR;
}
}
return RET_OK;
}
int ElementWiseTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
    MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
ITensorHelper x_input;
ITensorHelper y_input;
int ret = PreprocessInputTensors(ctx, &x_input, &y_input);
if (ret != RET_OK) {
MS_LOG(ERROR) << "PreprocessInputTensors failed.";
return RET_ERROR;
}
nvinfer1::IElementWiseLayer *cal_layer =
ctx->network()->addElementWise(*x_input.trt_tensor_, *y_input.trt_tensor_, element_wise_op_);
if (cal_layer == nullptr) {
MS_LOG(ERROR) << "addElementWise failed for TensorRT.";
return RET_ERROR;
}
cal_layer->setName(op_name_.c_str());
this->layer_ = cal_layer;
nvinfer1::ITensor *op_out_tensor = cal_layer->getOutput(0);
if (op_out_tensor == nullptr) {
MS_LOG(ERROR) << "addElementWise out tensor is nullptr.";
return RET_ERROR;
}
// add activation
nvinfer1::ITensor *activation_out_tensor = AddActivation(ctx, op_out_tensor);
op_out_tensor = (activation_out_tensor == nullptr) ? op_out_tensor : activation_out_tensor;
// scale and shift
if (type_ == schema::PrimitiveType_PowFusion) {
auto pow_op = op_primitive_->value_as_PowFusion();
if (pow_op == nullptr) {
MS_LOG(ERROR) << "PowFusion convert failed.";
return RET_ERROR;
}
float scale = pow_op->scale();
float shift = pow_op->shift();
    if (std::fabs(scale - 1) >= 1.0e-05 || std::fabs(shift) >= 1.0e-05) {
      MS_LOG(WARNING) << "scale and shift of pow op are not applied yet for " << op_name_;
}
}
#if TRT_VERSION_GE(7, 2)
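  // Comparison ops produce kBOOL on TRT 7.2+; cast the result to INT32 so downstream ops and outputs see an integer tensor.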
std::unordered_set<schema::PrimitiveType> bool_producer_ops = {
schema::PrimitiveType_Equal, schema::PrimitiveType_Greater, schema::PrimitiveType_Less};
if (bool_producer_ops.find(type_) != bool_producer_ops.end()) {
auto cast_layer = ctx->network()->addIdentity(*op_out_tensor);
if (cast_layer == nullptr) {
MS_LOG(ERROR) << "create cast layer failed for: " << op_name_;
return RET_ERROR;
}
cast_layer->setOutputType(0, nvinfer1::DataType::kINT32);
op_out_tensor = cast_layer->getOutput(0);
MS_LOG(INFO) << "bool result cast to int32" << op_name_;
}
#endif
op_out_tensor->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{op_out_tensor, x_input.format_, x_input.same_format_});
MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]);
return RET_OK;
}
int ElementWiseTensorRT::PreprocessInputTensors(TensorRTContext *ctx, ITensorHelper *x_input, ITensorHelper *y_input) {
int input_x_index = SameTensor(tensorrt_in_tensors_[0].trt_tensor_, &in_tensors_[0]) ? 0 : 1;
if (in_tensors_[0].Shape() == in_tensors_[1].Shape() && in_tensors_[0].IsConst()) {
input_x_index = 1;
}
if (this->tensorrt_in_tensors_.size() != INPUT_SIZE2) {
int ret = AddConstTensor(ctx);
if (ret != RET_OK) {
return ret;
}
}
*x_input = tensorrt_in_tensors_[input_x_index];
*y_input = tensorrt_in_tensors_[1 - input_x_index];
MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(*x_input);
MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(*y_input);
if (x_input->trt_tensor_->getDimensions().nbDims == DIMENSION_4D && x_input->format_ != y_input->format_) {
// when inputs format are different, change to NHWC
auto need_trans = x_input->format_ == Format::NCHW ? x_input : y_input;
nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *need_trans->trt_tensor_);
if (transpose_layer == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer->setName((op_name_ + "_input_transpose2NHWC").c_str());
need_trans->trt_tensor_ = transpose_layer->getOutput(0);
need_trans->format_ = Format::NHWC;
need_trans->same_format_ = true;
}
MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(*x_input);
MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(*y_input);
if (GetDimsVolume(x_input->trt_tensor_->getDimensions()) == GetDimsVolume(y_input->trt_tensor_->getDimensions()) &&
x_input->trt_tensor_->getDimensions().nbDims != y_input->trt_tensor_->getDimensions().nbDims) {
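    // Same element count but different rank: reshape the lower-rank tensor to the higher-rank shape so the element-wise op lines up.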
bool x_large = x_input->trt_tensor_->getDimensions().nbDims > y_input->trt_tensor_->getDimensions().nbDims;
auto input_tensor = x_large ? y_input : x_input;
auto output_dim = x_large ? x_input->trt_tensor_->getDimensions() : y_input->trt_tensor_->getDimensions();
auto reshape_layer = ctx->network()->addShuffle(*input_tensor->trt_tensor_);
if (reshape_layer == nullptr) {
MS_LOG(ERROR) << "add reshape failed for " << op_name_;
return RET_ERROR;
}
reshape_layer->setReshapeDimensions(output_dim);
input_tensor->trt_tensor_ = reshape_layer->getOutput(0);
}
return RET_OK;
}
nvinfer1::ITensor *ElementWiseTensorRT::AddActivation(TensorRTContext *ctx, nvinfer1::ITensor *in_tensor) {
schema::ActivationType activation = schema::ActivationType::ActivationType_NO_ACTIVATION;
switch (type_) {
case schema::PrimitiveType_AddFusion: {
auto sum_op = op_primitive_->value_as_AddFusion();
if (sum_op == nullptr) {
MS_LOG(ERROR) << "AddFusion convert failed.";
return nullptr;
}
activation = sum_op->activation_type();
break;
}
case schema::PrimitiveType_DivFusion: {
auto div_op = op_primitive_->value_as_DivFusion();
if (div_op == nullptr) {
MS_LOG(ERROR) << "DivFusion convert failed.";
return nullptr;
}
activation = div_op->activation_type();
break;
}
case schema::PrimitiveType_SubFusion: {
auto sub_op = op_primitive_->value_as_SubFusion();
if (sub_op == nullptr) {
MS_LOG(ERROR) << "SubFusion convert failed.";
return nullptr;
}
activation = sub_op->activation_type();
break;
}
case schema::PrimitiveType_MulFusion: {
auto mul_op = op_primitive_->value_as_MulFusion();
if (mul_op == nullptr) {
MS_LOG(ERROR) << "MulFusion convert failed.";
return nullptr;
}
activation = mul_op->activation_type();
break;
}
default:
      MS_LOG(DEBUG) << "no activation needed for: " << op_name_;
}
nvinfer1::ITensor *activation_out_tensor = nullptr;
if (activation != schema::ActivationType::ActivationType_NO_ACTIVATION) {
auto activation_layer = ActivationTensorRT::AddActivation(ctx, activation, 0, 0, 0, in_tensor, device_id_);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "addActivation for element wise failed";
return nullptr;
}
activation_layer->setName((op_name_ + "_activation").c_str());
activation_out_tensor = activation_layer->getOutput(0);
}
return activation_out_tensor;
}
int ElementWiseTensorRT::AddConstTensor(TensorRTContext *ctx) {
int const_tensor_index = (in_tensors_[0].Data() != nullptr && in_tensors_[0].IsConst()) ? 0 : 1;
nvinfer1::ITensor *constant_input = ConvertConstantTensorWithDims(
ctx, in_tensors_[const_tensor_index], in_tensors_[1 - const_tensor_index].Shape(), op_name_);
CHECK_NULL_RETURN(constant_input);
AddInnerInTensors(ITensorHelper{constant_input, tensorrt_in_tensors_[0].format_, true});
return RET_OK;
}
bool ElementWiseTensorRT::SameTensor(nvinfer1::ITensor *trt_tensor, mindspore::MSTensor *ms_tensor) {
if (SameDims(trt_tensor->getDimensions(), ms_tensor->Shape())) {
return true;
}
if (ms_tensor->Shape().size() == DIMENSION_4D) {
// nhwc nchw
auto nchw_shape = NHWC2NCHW(ms_tensor->Shape());
if (SameDims(trt_tensor->getDimensions(), nchw_shape)) {
return true;
}
}
auto str_name = strstr(trt_tensor->getName(), ms_tensor->Name().c_str());
if (str_name != nullptr) {
return true;
}
str_name = strstr(ms_tensor->Name().c_str(), trt_tensor->getName());
if (str_name != nullptr) {
return true;
}
return false;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_SubFusion, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_DivFusion, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_RealDiv, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PowFusion, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AddFusion, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MulFusion, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Eltwise, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Minimum, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Maximum, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_BiasAdd, ElementWiseTensorRT)
#if TRT_VERSION_GE(7, 2)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Equal, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Less, ElementWiseTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Greater, ElementWiseTensorRT)
#endif
} // namespace mindspore::lite
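// --- Illustrative sketch (standalone example, not part of the delegate) ---
// The reshape branch in PreprocessInputTensors above handles two inputs that
// hold the same number of elements but differ in rank: the lower-rank tensor
// is reshaped to the higher-rank tensor's dimensions so that IElementWiseLayer
// sees matching nbDims. A minimal host-side illustration of that rule, using
// std::vector shapes instead of nvinfer1::Dims:
#include <cassert>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

static int Volume(const std::vector<int> &shape) {
  return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
}

int main() {
  std::vector<int> x_shape{1, 3, 4};  // higher-rank input
  std::vector<int> y_shape{3, 4};     // same element count, lower rank
  assert(Volume(x_shape) == Volume(y_shape));
  // Same rule as the TensorRT path above: the lower-rank tensor simply adopts
  // the higher-rank tensor's dimensions (valid because the volumes match).
  std::vector<int> y_reshaped = x_shape;
  for (int d : y_reshaped) printf("%d ", d);  // prints: 1 3 4
  printf("\n");
  return 0;
}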

View File

@ -0,0 +1,50 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_
#include <string>
#include <vector>
#include <map>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class ElementWiseTensorRT : public TensorRTOp {
public:
ElementWiseTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~ElementWiseTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
nvinfer1::ITensor *AddActivation(TensorRTContext *ctx, nvinfer1::ITensor *in_tensor);
int AddConstTensor(TensorRTContext *ctx);
bool SameTensor(nvinfer1::ITensor *trt_tensor, mindspore::MSTensor *ms_tensor);
int PreprocessInputTensors(TensorRTContext *ctx, ITensorHelper *x_input, ITensorHelper *y_input);
nvinfer1::ElementWiseOperation element_wise_op_;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_

View File

@ -0,0 +1,96 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/equal_tensorrt.h"
#include <numeric>
#include <memory>
#include <functional>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "NvInferRuntimeCommon.h"
namespace mindspore::lite {
REGISTER_TENSORRT_PLUGIN(EqualPluginCreater);
template class TensorRTPluginCreater<EqualPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int EqualTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2) {
MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size();
return RET_ERROR;
}
return RET_OK;
}
int EqualTensorRT::AddInnerOp(TensorRTContext *ctx) {
nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[1].trt_tensor_};
auto plugin = std::make_shared<EqualPlugin>(op_name_, device_id_);
nvinfer1::IPluginV2Layer *equal_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin);
if (equal_layer == nullptr) {
MS_LOG(ERROR) << "create equal layer failed for: " << op_name_;
return RET_ERROR;
}
layer_ = equal_layer;
nvinfer1::ITensor *equal_out = equal_layer->getOutput(0);
equal_layer->setName(op_name_.c_str());
equal_out->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(
ITensorHelper{equal_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
return RET_OK;
}
int EqualPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace,
cudaStream_t stream) noexcept {
nvinfer1::Dims input_dims = inputDesc[0].dims;
int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies<int64_t>());
if (inputDesc->type == nvinfer1::DataType::kINT32) {
const int *input1 = static_cast<const int *>(inputs[0]);
const int *input2 = static_cast<const int *>(inputs[1]);
int *output = static_cast<int *>(outputs[0]);
Equal(input1, input2, output, element_cnt, stream);
} else if (inputDesc->type == nvinfer1::DataType::kFLOAT) {
const float *input1 = static_cast<const float *>(inputs[0]);
const float *input2 = static_cast<const float *>(inputs[1]);
float *output = static_cast<float *>(outputs[0]);
Equal(input1, input2, output, element_cnt, stream);
} else {
MS_LOG(ERROR) << "unsupported equal data type";
}
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *EqualPlugin::clone() const noexcept {
auto *plugin = new EqualPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
#if TRT_VERSION_LS(7, 2)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Equal, EqualTensorRT)
#endif
} // namespace mindspore::lite
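// --- Illustrative sketch (standalone example, not part of the delegate) ---
// EqualPlugin::enqueue above flattens the input to element_cnt values and
// launches the Equal kernel from equal.cuh. A host-side reference of the
// elementwise semantics that kernel is expected to compute (output[i] is 1
// when the inputs match, 0 otherwise), shown here for int32 inputs:
#include <cstdio>
#include <vector>

template <typename T>
std::vector<T> EqualReference(const std::vector<T> &a, const std::vector<T> &b) {
  std::vector<T> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    out[i] = static_cast<T>(a[i] == b[i]);
  }
  return out;
}

int main() {
  std::vector<int> a{1, 2, 3, 4};
  std::vector<int> b{1, 0, 3, 9};
  for (int v : EqualReference(a, b)) printf("%d ", v);  // prints: 1 0 1 0
  printf("\n");
  return 0;
}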

View File

@ -0,0 +1,63 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/equal.cuh"
namespace mindspore::lite {
constexpr const char *EQUAL_PLUGIN_NAME{"EqualPlugin"};
class EqualTensorRT : public TensorRTOp {
public:
EqualTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~EqualTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
};
class EqualPlugin : public TensorRTPlugin {
public:
EqualPlugin(const std::string name, uint32_t device_id)
: TensorRTPlugin(name, std::string(EQUAL_PLUGIN_NAME), device_id) {}
EqualPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(EQUAL_PLUGIN_NAME)) {}
EqualPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(EQUAL_PLUGIN_NAME)) {}
EqualPlugin() = delete;
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
};
class EqualPluginCreater : public TensorRTPluginCreater<EqualPlugin> {
public:
EqualPluginCreater() : TensorRTPluginCreater(std::string(EQUAL_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_

View File

@ -0,0 +1,106 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
namespace mindspore::lite {
constexpr int BIAS_INDEX = 2;
int FullyConnectedTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
return RET_OK;
}
int FullyConnectedTensorRT::AddInnerOp(TensorRTContext *ctx) {
auto primitive = op_primitive_->value_as_FullConnection();
CHECK_NULL_RETURN(primitive);
activation_ = primitive->activation_type();
int axis = primitive->axis();
  if (axis < 0 || axis >= static_cast<int>(out_tensors_[0].Shape().size())) {
MS_LOG(ERROR) << "axis: " << axis << " is invalid for " << op_name_;
return RET_ERROR;
}
ITensorHelper fc_input;
auto ret = PreprocessInputs(ctx, &fc_input);
if (ret != RET_OK) {
MS_LOG(ERROR) << "PreprocessInputs failed for " << op_name_;
return ret;
}
auto kernel_weight = ConvertWeight(in_tensors_[1].Data().get() == nullptr ? in_tensors_[0] : in_tensors_[1]);
nvinfer1::Weights bias_weight{};
if (primitive->has_bias()) {
bias_weight = ConvertWeight(in_tensors_[BIAS_INDEX]);
}
nvinfer1::IFullyConnectedLayer *fc_layer = ctx->network()->addFullyConnected(
*(fc_input.trt_tensor_), out_tensors_[0].Shape()[axis], kernel_weight, bias_weight);
if (fc_layer == nullptr) {
MS_LOG(ERROR) << "addFullyConnected failed for " << op_name_;
return RET_ERROR;
}
this->layer_ = fc_layer;
fc_layer->setName(op_name_.c_str());
nvinfer1::ITensor *out_tensor = fc_layer->getOutput(0);
if (out_tensor->getDimensions().nbDims != out_tensors_[0].Shape().size()) {
std::vector<int64_t> squeeze_dim(out_tensors_[0].Shape());
squeeze_dim[0] = out_tensor->getDimensions().d[0] == -1 ? -1 : squeeze_dim[0];
out_tensor = Reshape(ctx, out_tensor, squeeze_dim);
}
// add activation
if (activation_ != schema::ActivationType::ActivationType_NO_ACTIVATION) {
nvinfer1::ILayer *activation_layer =
ActivationTensorRT::AddActivation(ctx, activation_, 0, 0, 0, out_tensor, device_id_);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "addActivation for matmul failed";
return RET_ERROR;
}
activation_layer->setName((op_name_ + "_activation").c_str());
out_tensor = activation_layer->getOutput(0);
}
out_tensor->setName((op_name_ + "_output").c_str());
MS_LOG(DEBUG) << "output " << GetTensorFormat(out_tensor);
this->AddInnerOutTensors(ITensorHelper{out_tensor, fc_input.format_});
return RET_OK;
}
int FullyConnectedTensorRT::PreprocessInputs(TensorRTContext *ctx, ITensorHelper *fc_input) {
auto ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], fc_input);
if (ret != RET_OK) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim failed for " << op_name_;
return ret;
}
auto origin_dims = fc_input->trt_tensor_->getDimensions();
if (origin_dims.nbDims != DIMENSION_4D) {
std::vector<int64_t> expand_dim(origin_dims.d, origin_dims.d + origin_dims.nbDims);
for (int i = 0; i < DIMENSION_4D - origin_dims.nbDims; i++) {
expand_dim.push_back(1);
}
fc_input->trt_tensor_ = Reshape(ctx, fc_input->trt_tensor_, expand_dim);
}
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_FullConnection, FullyConnectedTensorRT)
} // namespace mindspore::lite
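// --- Illustrative sketch (standalone example, not part of the delegate) ---
// PreprocessInputs above pads the input shape with trailing 1s up to 4D before
// calling addFullyConnected, and AddInnerOp squeezes the result back to the
// expected output rank. A host-side illustration of the padding step:
#include <cstdio>
#include <vector>

static std::vector<int64_t> ExpandTo4D(std::vector<int64_t> shape) {
  while (shape.size() < 4) {
    shape.push_back(1);  // trailing 1s, matching the loop in PreprocessInputs
  }
  return shape;
}

int main() {
  std::vector<int64_t> in_shape{8, 256};  // hypothetical [batch, in_features] input
  for (int64_t d : ExpandTo4D(in_shape)) printf("%lld ", static_cast<long long>(d));
  printf("\n");  // prints: 8 256 1 1
  return 0;
}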

View File

@ -0,0 +1,45 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_
#include <string>
#include <vector>
#include <map>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class FullyConnectedTensorRT : public TensorRTOp {
public:
FullyConnectedTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~FullyConnectedTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
int PreprocessInputs(TensorRTContext *ctx, ITensorHelper *fc_input);
schema::ActivationType activation_{schema::ActivationType::ActivationType_NO_ACTIVATION};
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_

View File

@ -0,0 +1,139 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h"
#include <cuda_runtime.h>
#include <numeric>
#include <memory>
#include <functional>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
REGISTER_TENSORRT_PLUGIN(GatherDPluginCreater);
template class TensorRTPluginCreater<GatherDPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int GatherDTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported gatherd input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "invalid gatherd input tensor size: " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "invalid gatherd output tensor size: " << out_tensors.size();
return RET_ERROR;
}
return RET_OK;
}
int GatherDTensorRT::AddInnerOp(TensorRTContext *ctx) {
nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[2].trt_tensor_};
auto dim_tensor = static_cast<const int *>(in_tensors_[1].Data().get());
if (dim_tensor == nullptr) {
MS_LOG(ERROR) << op_name_ << " gatherd dim_tensor is null!";
return RET_ERROR;
}
size_t dim = static_cast<size_t>(dim_tensor[0]);
auto plugin = std::make_shared<GatherDPlugin>(op_name_, dim, device_id_);
nvinfer1::IPluginV2Layer *gatherd_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin);
if (gatherd_layer == nullptr) {
MS_LOG(ERROR) << "create gatherd failed for: " << op_name_;
return RET_ERROR;
}
nvinfer1::ITensor *gatherd_out = gatherd_layer->getOutput(0);
gatherd_layer->setName(op_name_.c_str());
gatherd_out->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(
ITensorHelper{gatherd_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
this->layer_ = gatherd_layer;
return RET_OK;
}
int GatherDPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace,
cudaStream_t stream) noexcept {
nvinfer1::Dims input_dims = inputDesc[0].dims;
int dims = input_dims.nbDims;
if (axis_ < 0) {
axis_ += dims;
}
if (inputDesc->type == nvinfer1::DataType::kINT32) {
auto input = static_cast<const int *>(inputs[0]);
auto index = static_cast<const int *>(inputs[1]);
auto output = static_cast<int *>(outputs[0]);
Reshape(inputDesc, outputDesc);
Gather<int, int>(input, index, output, dim_before_axis_, dim_at_axis_input_, dim_at_axis_output_, dim_after_axis_,
stream, device_id_);
} else if (inputDesc->type == nvinfer1::DataType::kFLOAT) {
auto input = static_cast<const float *>(inputs[0]);
auto index = static_cast<const int *>(inputs[1]);
auto output = static_cast<float *>(outputs[0]);
Reshape(inputDesc, outputDesc);
Gather<float, int>(input, index, output, dim_before_axis_, dim_at_axis_input_, dim_at_axis_output_, dim_after_axis_,
stream, device_id_);
} else {
MS_LOG(ERROR) << "unsupported data type gatherd" << layer_name_;
}
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *GatherDPlugin::clone() const noexcept {
auto *plugin = new GatherDPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
nvinfer1::DimsExprs GatherDPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) noexcept {
nvinfer1::DimsExprs out_dims{};
out_dims.nbDims = inputs[1].nbDims;
for (int i = 0; i < inputs[1].nbDims; i++) {
out_dims.d[i] = inputs[1].d[i];
}
return out_dims;
}
void GatherDPlugin::Reshape(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc) {
nvinfer1::Dims input_dims = inputDesc[0].dims;
nvinfer1::Dims output_dims = outputDesc[0].dims;
size_t dim_before_axis = 1;
for (size_t i = 0; i < IntToSize(axis_); i++) {
dim_before_axis *= output_dims.d[i];
}
size_t dim_at_axis_input = input_dims.d[IntToSize(axis_)];
size_t dim_at_axis_output = output_dims.d[IntToSize(axis_)];
size_t dim_after_axis = 1;
for (size_t i = IntToSize(axis_) + 1; i < output_dims.nbDims; i++) {
dim_after_axis *= output_dims.d[i];
}
dim_before_axis_ = dim_before_axis;
dim_at_axis_input_ = dim_at_axis_input;
dim_at_axis_output_ = dim_at_axis_output;
dim_after_axis_ = dim_after_axis;
return;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_GatherD, GatherDTensorRT)
} // namespace mindspore::lite
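// --- Illustrative sketch (standalone example, not part of the delegate) ---
// GatherDPlugin::Reshape above flattens the shapes into
// (dim_before_axis, dim_at_axis, dim_after_axis) blocks; the Gather kernel is
// then expected to compute, for every output position,
//   output[b][j][a] = input[b][ index[b][j][a] ][a]
// along the chosen axis. A host-side reference using the same flattening:
#include <cstdio>
#include <vector>

std::vector<float> GatherDReference(const std::vector<float> &input, const std::vector<int> &index,
                                    size_t dim_before, size_t dim_at_input, size_t dim_at_output,
                                    size_t dim_after) {
  std::vector<float> output(dim_before * dim_at_output * dim_after);
  for (size_t b = 0; b < dim_before; ++b) {
    for (size_t j = 0; j < dim_at_output; ++j) {
      for (size_t a = 0; a < dim_after; ++a) {
        size_t out_pos = (b * dim_at_output + j) * dim_after + a;
        size_t in_pos = (b * dim_at_input + static_cast<size_t>(index[out_pos])) * dim_after + a;
        output[out_pos] = input[in_pos];
      }
    }
  }
  return output;
}

int main() {
  // input shape [2, 3], gather along axis 1 with index shape [2, 2].
  std::vector<float> input{10, 11, 12, 20, 21, 22};
  std::vector<int> index{2, 0, 1, 1};
  auto out = GatherDReference(input, index, 2, 3, 2, 1);
  for (float v : out) printf("%.0f ", v);  // prints: 12 10 21 21
  printf("\n");
  return 0;
}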

View File

@ -0,0 +1,80 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
constexpr const char *GATHER_D_PLUGIN_NAME{"GatherDPluginCreater"};
class GatherDTensorRT : public TensorRTOp {
public:
GatherDTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~GatherDTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
};
class GatherDPlugin : public TensorRTPlugin {
public:
GatherDPlugin(const std::string name, size_t dim, uint32_t device_id)
: TensorRTPlugin(name, std::string(GATHER_D_PLUGIN_NAME), device_id), axis_(dim) {}
GatherDPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(GATHER_D_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
axis_ = static_cast<const int *>(fields[0].data)[0];
}
GatherDPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(GATHER_D_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &axis_, sizeof(int));
}
GatherDPlugin() = delete;
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
private:
int axis_;
size_t dim_before_axis_;
size_t dim_at_axis_input_;
size_t dim_at_axis_output_;
size_t dim_after_axis_;
void Reshape(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc);
};
class GatherDPluginCreater : public TensorRTPluginCreater<GatherDPlugin> {
public:
GatherDPluginCreater() : TensorRTPluginCreater(std::string(GATHER_D_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_

View File

@ -0,0 +1,108 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/gather_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
constexpr int AXIS_INDEX = 2;
int GatherTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size();
return RET_ERROR;
}
if (in_tensors[1].DataType() != DataType::kNumberTypeInt32) {
MS_LOG(ERROR) << "Gather indices only support Int32";
return RET_ERROR;
}
if (in_tensors[AXIS_INDEX].ElementNum() == 1) {
MS_ASSERT(in_tensors[AXIS_INDEX].Data().get());
axis_ = static_cast<const int *>(in_tensors[AXIS_INDEX].Data().get())[0];
} else {
MS_LOG(ERROR) << "TensorRT axis is attribute.";
return RET_ERROR;
}
return RET_OK;
}
int GatherTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
if (tensorrt_in_tensors_.size() < INPUT_SIZE2 && in_tensors_.size() >= INPUT_SIZE2) {
int const_ms_tensor_index = in_tensors_[0].IsConst() ? 0 : 1;
auto const_input = ConvertConstantTensor(ctx, in_tensors_[const_ms_tensor_index], op_name_);
if (const_input == nullptr) {
MS_LOG(ERROR) << "add const input tensor failed for " << op_name_;
return RET_ERROR;
}
tensorrt_in_tensors_.push_back(ITensorHelper{const_input});
}
int indices_tensor_index = tensorrt_in_tensors_[0].trt_tensor_->getType() == nvinfer1::DataType::kINT32 ? 0 : 1;
ITensorHelper gather_input;
int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - indices_tensor_index], &gather_input);
if (ret != RET_OK || gather_input.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim gather failed for " << op_name_;
return RET_ERROR;
}
ITensorHelper indices_tensor;
ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[indices_tensor_index], &indices_tensor);
if (ret != RET_OK || indices_tensor.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim indices failed for " << op_name_;
return RET_ERROR;
}
nvinfer1::IGatherLayer *gather_layer =
ctx->network()->addGather(*gather_input.trt_tensor_, *indices_tensor.trt_tensor_, axis_);
if (gather_layer == nullptr) {
MS_LOG(ERROR) << "addGather failed for TensorRT.";
return RET_ERROR;
}
this->layer_ = gather_layer;
gather_layer->setName(op_name_.c_str());
nvinfer1::ITensor *op_output = gather_layer->getOutput(0);
// keep shape
if (in_tensors_[1].Shape().empty()) {
auto squeeze = ctx->network()->addShuffle(*op_output);
if (squeeze == nullptr) {
MS_LOG(ERROR) << "add output squeeze failed for " << op_name_;
return RET_ERROR;
}
squeeze->setName((op_name_ + "_squeeze_out").c_str());
auto old_shape = ConvertMSShape(op_output->getDimensions());
old_shape.erase(old_shape.begin() + axis_);
squeeze->setReshapeDimensions(ConvertCudaDims(old_shape));
op_output = squeeze->getOutput(0);
}
op_output->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{op_output, gather_input.format_, gather_input.same_format_});
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Gather, GatherTensorRT)
} // namespace mindspore::lite
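// --- Illustrative sketch (standalone example, not part of the delegate) ---
// When the indices tensor is a scalar (empty shape), the shuffle added above
// removes the gathered axis so the TensorRT output rank matches the MS Lite
// output. A host-side illustration of dropping that axis from the shape:
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> gathered_shape{4, 1, 16};  // hypothetical gather output, axis_ == 1
  int axis = 1;
  gathered_shape.erase(gathered_shape.begin() + axis);  // same erase as in AddInnerOp
  for (int64_t d : gathered_shape) printf("%lld ", static_cast<long long>(d));
  printf("\n");  // prints: 4 16
  return 0;
}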

View File

@ -0,0 +1,42 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class GatherTensorRT : public TensorRTOp {
public:
GatherTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~GatherTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
int axis_{0};
mindspore::MSTensor indices_;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_

View File

@ -0,0 +1,119 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#include <numeric>
#include <memory>
#include <vector>
#include <functional>
#include <unordered_map>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "NvInferRuntimeCommon.h"
#include "src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh"
namespace mindspore::lite {
int LogicalNotTensorRT::IsSupport(const schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
  if (in_tensors.size() != 1) {
    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
    return RET_ERROR;
  }
  if (out_tensors.size() != 1) {
    MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
    return RET_ERROR;
  }
return RET_OK;
}
int LogicalNotTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr || this->tensorrt_in_tensors_.size() != 1) {
MS_LOG(ERROR) << "network or input tensor is invalid";
return RET_ERROR;
}
if (tensorrt_in_tensors_[0].trt_tensor_->getType() != nvinfer1::DataType::kINT32) {
auto cast_layer = ctx->network()->addIdentity(*tensorrt_in_tensors_[0].trt_tensor_);
if (cast_layer == nullptr) {
MS_LOG(ERROR) << "create cast layer failed for: " << op_name_;
return RET_ERROR;
}
cast_layer->setOutputType(0, nvinfer1::DataType::kINT32);
tensorrt_in_tensors_[0].trt_tensor_ = cast_layer->getOutput(0);
}
auto plugin = std::make_shared<LogicalNotPlugin>(op_name_, op_primitive_->value_type());
if (plugin == nullptr) {
MS_LOG(ERROR) << "create ActivationOptPlugin failed for " << op_name_;
return RET_ERROR;
}
nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_};
  nvinfer1::IPluginV2Layer *logical_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin);
  if (logical_layer == nullptr) {
    MS_LOG(ERROR) << "add logical not plugin layer failed for: " << op_name_;
    return RET_ERROR;
  }
  this->layer_ = logical_layer;
  nvinfer1::ITensor *op_out_tensor = logical_layer->getOutput(0);
  if (op_out_tensor == nullptr) {
    MS_LOG(ERROR) << "logical not plugin output tensor is nullptr.";
    return RET_ERROR;
  }
op_out_tensor->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(
ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
return RET_OK;
}
REGISTER_TENSORRT_PLUGIN(LogicalNotPluginCreater);
template class TensorRTPluginCreater<LogicalNotPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int LogicalNotPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace,
cudaStream_t stream) noexcept {
return RunCudaLogical(inputDesc, inputs, outputs, stream);
}
int LogicalNotPlugin::RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs,
void *const *outputs, cudaStream_t stream) {
switch (primitive_type_) {
case (schema::PrimitiveType_LogicalNot): {
LogicalNot(static_cast<const int *>(inputs[0]), static_cast<int *>(outputs[0]), GetDimsVolume(inputDesc[0].dims),
stream);
break;
}
default: {
MS_LOG(ERROR) << "invalid logical type: " << static_cast<int>(primitive_type_);
return RET_ERROR;
}
}
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *LogicalNotPlugin::clone() const noexcept {
auto *plugin = new LogicalNotPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
size_t LogicalNotPlugin::getSerializationSize() const noexcept { return sizeof(schema::PrimitiveType); }
void LogicalNotPlugin::serialize(void *buffer) const noexcept {
SerializeValue(&buffer, &primitive_type_, sizeof(schema::PrimitiveType));
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalNot, LogicalNotTensorRT)
} // namespace mindspore::lite
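// --- Illustrative sketch (standalone example, not part of the delegate) ---
// The input is cast to int32 before the plugin runs, and the LogicalNot kernel
// from logical.cuh is expected to produce 1 where the input is 0 and 0
// elsewhere. A host-side reference of that semantics:
#include <cstdio>
#include <vector>

std::vector<int> LogicalNotReference(const std::vector<int> &in) {
  std::vector<int> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = (in[i] == 0) ? 1 : 0;
  }
  return out;
}

int main() {
  std::vector<int> in{0, 1, 5, 0};
  for (int v : LogicalNotReference(in)) printf("%d ", v);  // prints: 1 0 0 1
  printf("\n");
  return 0;
}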

View File

@ -0,0 +1,78 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class LogicalNotTensorRT : public TensorRTOp {
public:
LogicalNotTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~LogicalNotTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
};
constexpr const char *LOGICAL_NOT_PLUGIN_NAME{"LogicalNotPlugin"};
class LogicalNotPlugin : public TensorRTPlugin {
public:
LogicalNotPlugin(const std::string name, schema::PrimitiveType primitive_type)
: TensorRTPlugin(name, std::string(LOGICAL_NOT_PLUGIN_NAME)), primitive_type_(primitive_type) {}
LogicalNotPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(LOGICAL_NOT_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
primitive_type_ = static_cast<const schema::PrimitiveType *>(fields[0].data)[0];
}
LogicalNotPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(LOGICAL_NOT_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &primitive_type_, sizeof(schema::PrimitiveType));
}
LogicalNotPlugin() = delete;
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void *buffer) const noexcept override;
private:
int RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs,
cudaStream_t stream);
schema::PrimitiveType primitive_type_;
};
class LogicalNotPluginCreater : public TensorRTPluginCreater<LogicalNotPlugin> {
public:
LogicalNotPluginCreater() : TensorRTPluginCreater(std::string(LOGICAL_NOT_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_

View File

@ -0,0 +1,129 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#include <numeric>
#include <memory>
#include <vector>
#include <functional>
#include <unordered_map>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "NvInferRuntimeCommon.h"
#include "src/runtime/delegate/tensorrt/op/logical_tensorrt.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh"
namespace mindspore::lite {
int LogicalTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
return RET_OK;
}
int LogicalTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "network or input tensor is invalid";
return RET_ERROR;
}
  for (size_t i = 0; i < tensorrt_in_tensors_.size(); ++i) {
    if (tensorrt_in_tensors_[i].trt_tensor_->getType() != nvinfer1::DataType::kINT32) {
      auto cast_layer = ctx->network()->addIdentity(*tensorrt_in_tensors_[i].trt_tensor_);
      if (cast_layer == nullptr) {
        MS_LOG(ERROR) << "create cast layer failed for: " << op_name_;
        return RET_ERROR;
      }
      cast_layer->setOutputType(0, nvinfer1::DataType::kINT32);
      tensorrt_in_tensors_[i].trt_tensor_ = cast_layer->getOutput(0);
    }
  }
auto plugin = std::make_shared<LogicalPlugin>(op_name_, op_primitive_->value_type());
if (plugin == nullptr) {
MS_LOG(ERROR) << "create ActivationOptPlugin failed for " << op_name_;
return RET_ERROR;
}
nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[1].trt_tensor_};
  nvinfer1::IPluginV2Layer *logical_layer = ctx->network()->addPluginV2(inputTensors, 2, *plugin);
  if (logical_layer == nullptr) {
    MS_LOG(ERROR) << "add logical plugin layer failed for: " << op_name_;
    return RET_ERROR;
  }
  this->layer_ = logical_layer;
  nvinfer1::ITensor *op_out_tensor = logical_layer->getOutput(0);
  if (op_out_tensor == nullptr) {
    MS_LOG(ERROR) << "logical plugin output tensor is nullptr.";
    return RET_ERROR;
  }
op_out_tensor->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(
ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
return RET_OK;
}
REGISTER_TENSORRT_PLUGIN(LogicalPluginCreater);
template class TensorRTPluginCreater<LogicalPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int LogicalPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace,
cudaStream_t stream) noexcept {
return RunCudaLogical(inputDesc, inputs, outputs, stream);
}
int LogicalPlugin::RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs,
void *const *outputs, cudaStream_t stream) {
switch (primitive_type_) {
case (schema::PrimitiveType_LogicalAnd): {
LogicalAnd(static_cast<const int *>(inputs[0]), static_cast<const int *>(inputs[1]),
static_cast<int *>(outputs[0]), GetDimsVolume(inputDesc[0].dims), stream);
break;
}
case (schema::PrimitiveType_LogicalOr): {
LogicalOr(static_cast<const int *>(inputs[0]), static_cast<const int *>(inputs[1]),
static_cast<int *>(outputs[0]), GetDimsVolume(inputDesc[0].dims), stream);
break;
}
default: {
MS_LOG(ERROR) << "invalid logical type: " << static_cast<int>(primitive_type_);
return RET_ERROR;
}
}
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *LogicalPlugin::clone() const noexcept {
auto *plugin = new LogicalPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
size_t LogicalPlugin::getSerializationSize() const noexcept { return sizeof(schema::PrimitiveType); }
void LogicalPlugin::serialize(void *buffer) const noexcept {
SerializeValue(&buffer, &primitive_type_, sizeof(schema::PrimitiveType));
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalOr, LogicalTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalAnd, LogicalTensorRT)
} // namespace mindspore::lite
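// --- Illustrative sketch (standalone example, not part of the delegate) ---
// RunCudaLogical above dispatches to the LogicalAnd / LogicalOr kernels from
// logical.cuh on int32 buffers. A host-side reference of the expected
// elementwise semantics (non-zero is treated as true):
#include <cstdio>
#include <vector>

std::vector<int> LogicalBinaryReference(const std::vector<int> &a, const std::vector<int> &b, bool is_and) {
  std::vector<int> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    bool lhs = a[i] != 0;
    bool rhs = b[i] != 0;
    out[i] = (is_and ? (lhs && rhs) : (lhs || rhs)) ? 1 : 0;
  }
  return out;
}

int main() {
  std::vector<int> a{0, 1, 1, 0};
  std::vector<int> b{0, 0, 1, 1};
  for (int v : LogicalBinaryReference(a, b, true)) printf("%d ", v);   // AND: 0 0 1 0
  printf("\n");
  for (int v : LogicalBinaryReference(a, b, false)) printf("%d ", v);  // OR:  0 1 1 1
  printf("\n");
  return 0;
}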

View File

@ -0,0 +1,78 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
namespace mindspore::lite {
class LogicalTensorRT : public TensorRTOp {
public:
LogicalTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~LogicalTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
};
constexpr const char *LOGICAL_PLUGIN_NAME{"LogicalPlugin"};
class LogicalPlugin : public TensorRTPlugin {
public:
LogicalPlugin(const std::string name, schema::PrimitiveType primitive_type)
: TensorRTPlugin(name, std::string(LOGICAL_PLUGIN_NAME)), primitive_type_(primitive_type) {}
LogicalPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(LOGICAL_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
primitive_type_ = static_cast<const schema::PrimitiveType *>(fields[0].data)[0];
}
LogicalPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(LOGICAL_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &primitive_type_, sizeof(schema::PrimitiveType));
}
LogicalPlugin() = delete;
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void *buffer) const noexcept override;
private:
int RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs,
cudaStream_t stream);
schema::PrimitiveType primitive_type_;
};
class LogicalPluginCreater : public TensorRTPluginCreater<LogicalPlugin> {
public:
LogicalPluginCreater() : TensorRTPluginCreater(std::string(LOGICAL_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_

View File

@ -0,0 +1,493 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/lstm_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h"
namespace mindspore::lite {
int LSTMTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
#if TRT_VERSION_GE(7, 0)
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() < INPUT_TENSOR_SIZE) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != OUTPUT_TENSOR_SIZE) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT];
hidden_init_name_ = hidden_in_init.Name() + "_hidden_init";
mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT];
cell_init_name_ = cell_in_init.Name() + "_cell_init";
dynamic_shape_params_.support_dynamic_ = false;
dynamic_shape_params_.support_hw_dynamic_ = false;
return RET_OK;
#else
MS_LOG(WARNING) << "low TensorRT version don't support LSTM op, please upgrade TensorRT version to 7 or higher";
return RET_ERROR;
#endif
}
int LSTMTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
int input_data_dims_cnt = tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims;
if (input_data_dims_cnt != DIMENSION_3D) {
MS_LOG(ERROR) << "invalid input data shape dims for " << op_name_;
return RET_ERROR;
}
network_ = ctx->network();
int ret = PreProcess();
if (ret != RET_OK) {
MS_LOG(ERROR) << "PreProcess for " << op_name_;
return ret;
}
ret = AddLSTMLayers();
if (ret != RET_OK) {
MS_LOG(ERROR) << "AddLSTMLayers for " << op_name_;
return RET_ERROR;
}
if (op_data_out_ == nullptr) {
MS_LOG(ERROR) << "layers final output tensor is invalid for " << op_name_;
return RET_ERROR;
}
op_data_out_->setName((op_name_ + "_output").c_str());
MS_LOG(DEBUG) << "lstm op_data_out_ " << GetTensorFormat(op_data_out_);
MS_LOG(DEBUG) << "lstm op_hidden_out_ " << GetTensorFormat(op_hidden_out_);
MS_LOG(DEBUG) << "lstm op_cell_out_ " << GetTensorFormat(op_cell_out_);
this->AddInnerOutTensors(ITensorHelper{op_data_out_});
this->AddInnerOutTensors(ITensorHelper{op_hidden_out_});
this->AddInnerOutTensors(ITensorHelper{op_cell_out_});
return RET_OK;
}
int LSTMTensorRT::PreProcess() {
auto ms_input_shape = in_tensors_[0].Shape();
params_.sequence_size_ = ms_input_shape[0];
params_.batch_size_ = ms_input_shape[1];
params_.input_data_size_ = ms_input_shape[INPUT_SIZE_INDEX];
if (params_.batch_size_ != 1) {
MS_LOG(WARNING) << op_name_ << " lstm has batchsize " << params_.batch_size_ << ", needs further verify";
}
// ms: 0 sequence size, 1 batch size, 2 input size -> tensorrt: 0 batch size, 1 sequence size, 2 input size
auto transpose_in_layer = network_->addShuffle(*tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_in_layer == nullptr) {
MS_LOG(ERROR) << "create transpose_in_layer failed for " << op_name_;
return RET_ERROR;
}
nvinfer1::Permutation transpose_perm{{1, 0, INPUT_SIZE_INDEX}};
transpose_in_layer->setFirstTranspose(transpose_perm);
transpose_in_layer->setName((op_name_ + "transpose_in").c_str());
input_data_ = transpose_in_layer->getOutput(0);
MS_LOG(DEBUG) << "lstm input " << GetTensorFormat(input_data_);
auto lstm_op = op_primitive_->value_as_LSTM();
params_.layer_count_ = lstm_op->num_layers() == 0 ? 1 : lstm_op->num_layers();
params_.hidden_size_ = lstm_op->hidden_size();
params_.directional_cnt_ = lstm_op->bidirectional() ? BIDIRECTIONAL : 1;
params_.data_type_ = ConvertDataType(in_tensors_[1].DataType());
return RET_OK;
}
int LSTMTensorRT::AddLSTMLayers() {
mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT];
mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT];
nvinfer1::ITensor *data_out{nullptr};
nvinfer1::ITensor *hidden_init = network_->addInput(
hidden_init_name_.c_str(), nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3(params_.layer_count_ * params_.directional_cnt_, params_.batch_size_, params_.hidden_size_));
if (hidden_init == nullptr) {
MS_LOG(ERROR) << "add hidden_init input tensor failed for " << op_name_;
return RET_ERROR;
}
op_binding_tensor_.push_back(BindingHelper{hidden_init_name_, hidden_in_init.MutableData(),
nvinfer1::DataType::kFLOAT, hidden_in_init.DataSize()});
nvinfer1::ITensor *cell_init = network_->addInput(
cell_init_name_.c_str(), nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3(params_.layer_count_ * params_.directional_cnt_, params_.batch_size_, params_.hidden_size_));
if (cell_init == nullptr) {
MS_LOG(ERROR) << "add cell_init input tensor failed for " << op_name_;
return RET_ERROR;
}
op_binding_tensor_.push_back(
BindingHelper{cell_init_name_, cell_in_init.MutableData(), nvinfer1::DataType::kFLOAT, cell_in_init.DataSize()});
sequence_size_input_ =
network_->addInput((op_name_ + "_seq_input").c_str(), nvinfer1::DataType::kINT32, nvinfer1::Dims{});
if (sequence_size_input_ == nullptr) {
MS_LOG(ERROR) << "add sequence_size_input_ input tensor failed for " << op_name_;
return RET_ERROR;
}
op_binding_tensor_.push_back(
BindingHelper{(op_name_ + "_seq_input"), &params_.sequence_size_, nvinfer1::DataType::kINT32, sizeof(int)});
nvinfer1::ITensor *max_sequence_size =
network_->addConstant(nvinfer1::Dims{}, nvinfer1::Weights{nvinfer1::DataType::kINT32, &params_.sequence_size_, 1})
->getOutput(0);
if (max_sequence_size == nullptr) {
MS_LOG(ERROR) << "add max_sequence_size constant tensor failed for " << op_name_;
return RET_ERROR;
}
LstmState next_state{input_data_, nullptr, nullptr}; // init states
std::vector<nvinfer1::ITensor *> hidden_outputs;
std::vector<nvinfer1::ITensor *> cell_outputs;
int input_weight_offset = 0;
int state_weight_offset = 0;
int bias_offset = 0;
if (params_.layer_count_ != 1) {
MS_LOG(WARNING) << op_name_ << " needs verify for layer cnt: " << params_.layer_count_;
}
for (int i = 0; i < params_.layer_count_; i++) {
LstmState layer_input_states[BIDIRECTIONAL];
LstmWeights layer_weights[BIDIRECTIONAL];
layer_weights[0].max_seq_size_ = max_sequence_size;
int ret = ParseLSTMCellInputs(i, hidden_init, cell_init, layer_input_states, &input_weight_offset,
&state_weight_offset, &bias_offset, layer_weights, next_state);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ParseLSTMCellInputs failed for " << op_name_;
return RET_ERROR;
}
data_out = AddLSTMCell(layer_input_states, layer_weights, &next_state);
hidden_outputs.push_back(next_state.hidden_);
cell_outputs.push_back(next_state.cell_);
if (data_out == nullptr || next_state.hidden_ == nullptr || next_state.cell_ == nullptr) {
MS_LOG(ERROR) << "AddLSTMCell failed for " << op_name_;
return RET_ERROR;
}
}
op_hidden_out_ = ConcateAll(hidden_outputs);
if (op_hidden_out_ == nullptr) {
MS_LOG(ERROR) << "concat hidden output failed for " << op_name_;
return RET_ERROR;
}
op_hidden_out_->setName(out_tensors_[OUTPUT_HIDDEN_INDEX].Name().c_str());
op_cell_out_ = ConcateAll(cell_outputs);
if (op_cell_out_ == nullptr) {
MS_LOG(ERROR) << "concat cell output failed for " << op_name_;
return RET_ERROR;
}
op_cell_out_->setName(out_tensors_[OUTPUT_CELL_INDEX].Name().c_str());
op_data_out_ = data_out;
return RET_OK;
}
int LSTMTensorRT::ParseLSTMCellInputs(int layer_index, nvinfer1::ITensor *hidden_init, nvinfer1::ITensor *cell_init,
LstmState *layer_input_states, int *input_weight_offset, int *state_weight_offset,
int *bias_offset, LstmWeights *layer_weights, const LstmState &next_state) {
nvinfer1::Dims2 dim_input_weight(LSTM_GATE_NUM * params_.hidden_size_, params_.input_data_size_);
nvinfer1::Dims2 dim_state_weight(LSTM_GATE_NUM * params_.hidden_size_, params_.hidden_size_);
nvinfer1::Dims dim_bias{1, {LSTM_GATE_NUM * params_.hidden_size_}};
mindspore::MSTensor &input_weight = in_tensors_[INPUT_WEIGHT];
mindspore::MSTensor &state_weight = in_tensors_[STATE_WEIGHT];
mindspore::MSTensor &bias = in_tensors_[BIAS];
nvinfer1::Dims dimW = layer_index == 0 ? dim_input_weight : dim_state_weight;
for (int direction_index = 0; direction_index < params_.directional_cnt_; direction_index++) {
nvinfer1::ITensor *index =
network_
->addConstant(nvinfer1::Dims{},
nvinfer1::Weights{nvinfer1::DataType::kINT32,
&INDICES[layer_index * params_.directional_cnt_ + direction_index], 1})
->getOutput(0);
MS_ASSERT(index);
layer_input_states[direction_index].data_ = next_state.data_;
layer_input_states[direction_index].hidden_ = network_->addGather(*hidden_init, *index, 0)->getOutput(0);
layer_input_states[direction_index].cell_ = network_->addGather(*cell_init, *index, 0)->getOutput(0);
MS_ASSERT(layer_input_states[direction_index].hidden_);
MS_ASSERT(layer_input_states[direction_index].cell_);
// weight order: input, output, forget, cell
if (params_.data_type_ != nvinfer1::DataType::kFLOAT) {
MS_LOG(WARNING) << "more data type need to be done";
return RET_ERROR;
}
const float *input_weight_ptr = static_cast<const float *>(input_weight.Data().get());
const float *state_weight_ptr = static_cast<const float *>(state_weight.Data().get());
const float *bias_ptr = static_cast<const float *>(bias.Data().get());
nvinfer1::Weights slice_input_weight{params_.data_type_, input_weight_ptr + *input_weight_offset,
GetDimsVolume(dimW)};
(*input_weight_offset) += slice_input_weight.count;
nvinfer1::Weights slice_state_weight{params_.data_type_, state_weight_ptr + *state_weight_offset,
GetDimsVolume(dim_state_weight)};
(*state_weight_offset) += slice_state_weight.count;
layer_weights[direction_index].input_weights_ = network_->addConstant(dimW, slice_input_weight)->getOutput(0);
layer_weights[direction_index].state_weights_ =
network_->addConstant(dim_state_weight, slice_state_weight)->getOutput(0);
MS_ASSERT(layer_weights[direction_index].input_weights_);
MS_ASSERT(layer_weights[direction_index].state_weights_);
// bias
nvinfer1::Weights slice_input_bias{params_.data_type_, bias_ptr + *bias_offset, GetDimsVolume(dim_bias)};
(*bias_offset) += slice_input_bias.count;
nvinfer1::Weights slice_state_bias{params_.data_type_, bias_ptr + *bias_offset, GetDimsVolume(dim_bias)};
(*bias_offset) += slice_state_bias.count;
layer_weights[direction_index].input_bias_ = network_->addConstant(dim_bias, slice_input_bias)->getOutput(0);
layer_weights[direction_index].state_bias_ = network_->addConstant(dim_bias, slice_state_bias)->getOutput(0);
MS_ASSERT(layer_weights[direction_index].input_bias_);
MS_ASSERT(layer_weights[direction_index].state_bias_);
}
if (params_.directional_cnt_ == BIDIRECTIONAL) {
layer_weights[1].max_seq_size_ = layer_weights[0].max_seq_size_;
}
return RET_OK;
}
nvinfer1::ITensor *LSTMTensorRT::Reshape(nvinfer1::ITensor *tensor, nvinfer1::Dims dims) {
nvinfer1::IShuffleLayer *shuffle = network_->addShuffle(*tensor);
shuffle->setReshapeDimensions(dims);
return shuffle->getOutput(0);
}
nvinfer1::ITensor *LSTMTensorRT::ConcateAll(std::vector<nvinfer1::ITensor *> all_tensor, int axis) {
if (all_tensor.size() == 1) {
return all_tensor[0];
}
nvinfer1::IConcatenationLayer *concat = network_->addConcatenation(all_tensor.data(), all_tensor.size());
if (concat == nullptr) {
MS_LOG(ERROR) << "addConcatenation failed for " << op_name_;
return nullptr;
}
if (axis >= all_tensor[0]->getDimensions().nbDims) {
MS_LOG(ERROR) << op_name_ << " concat axis is " << axis << ", larger than tensor dims "
<< all_tensor[0]->getDimensions().nbDims;
return nullptr;
}
concat->setAxis(axis);
return concat->getOutput(0);
}
nvinfer1::ITensor *LSTMTensorRT::AddLSTMCell(const LstmState *layer_input_states, const LstmWeights *layer_weights,
LstmState *next_state) {
nvinfer1::ITensor *backward_output = nullptr;
nvinfer1::ITensor *backward_hidden_out = nullptr;
nvinfer1::ITensor *backward_cell_out = nullptr;
nvinfer1::ITensor *forward_hidden_out = nullptr;
nvinfer1::ITensor *forward_cell_out = nullptr;
nvinfer1::ITensor *forward_output =
AddLSTMCalculation(layer_input_states[0], layer_weights[0], &forward_hidden_out, &forward_cell_out);
if (params_.directional_cnt_ == BIDIRECTIONAL) {
backward_output =
AddLSTMCalculation(layer_input_states[1], layer_weights[1], &backward_hidden_out, &backward_cell_out, true);
}
// concate forward and backward
nvinfer1::ITensor *output_tensor = forward_output;
nvinfer1::ITensor *cell_out = forward_cell_out;
nvinfer1::ITensor *hidden_out = forward_hidden_out;
if (backward_output != nullptr && backward_hidden_out != nullptr && backward_cell_out != nullptr) {
nvinfer1::ITensor *output_concat_input[BIDIRECTIONAL] = {forward_output, backward_output};
    auto output_out_layer = network_->addConcatenation(output_concat_input, BIDIRECTIONAL);
    this->layer_ = output_out_layer;
    if (output_out_layer == nullptr) {
      MS_LOG(ERROR) << "create one loop output concat failed for " << op_name_;
      return nullptr;
    }
    output_out_layer->setAxis(1);  // ms: 0 sequence size, 1 layer * direction, 2 batchsize, 3 hidden
    output_tensor = output_out_layer->getOutput(0);
nvinfer1::ITensor *hidden_concat_input[BIDIRECTIONAL] = {forward_hidden_out, backward_hidden_out};
auto hidden_out_layer = network_->addConcatenation(hidden_concat_input, BIDIRECTIONAL);
hidden_out_layer->setAxis(0);
hidden_out = hidden_out_layer->getOutput(0);
nvinfer1::ITensor *cell_concat_input[BIDIRECTIONAL] = {forward_cell_out, backward_cell_out};
auto cell_out_layer = network_->addConcatenation(cell_concat_input, BIDIRECTIONAL);
cell_out_layer->setAxis(0);
cell_out = cell_out_layer->getOutput(0);
}
if (hidden_out == nullptr || cell_out == nullptr) {
MS_LOG(ERROR) << "get one loop hidden_out and cell_out failed for " << op_name_;
return nullptr;
}
*next_state = LstmState{output_tensor, hidden_out, cell_out};
return output_tensor;
}
nvinfer1::ITensor *LSTMTensorRT::AddLSTMCalculation(const LstmState &input_state, const LstmWeights &lstm_weights,
nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out,
bool is_backward) {
std::vector<nvinfer1::ITensor *> all_batch_outputs;
std::vector<nvinfer1::ITensor *> all_batch_hidden;
std::vector<nvinfer1::ITensor *> all_batch_cell;
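  // unroll over the batch: gather one batch element, run a single LSTM loop on it, then concatenate the per-batch results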
for (int batch_index = 0; batch_index < params_.batch_size_; batch_index++) {
LstmState one_batch_input_state;
nvinfer1::ITensor *batch_index_tensor =
network_->addConstant(nvinfer1::Dims{}, nvinfer1::Weights{nvinfer1::DataType::kINT32, &INDICES[batch_index], 1})
->getOutput(0);
one_batch_input_state.data_ = network_->addGather(*input_state.data_, *batch_index_tensor, 0)->getOutput(0);
one_batch_input_state.hidden_ = network_->addGather(*input_state.hidden_, *batch_index_tensor, 0)->getOutput(0);
one_batch_input_state.cell_ = network_->addGather(*input_state.cell_, *batch_index_tensor, 0)->getOutput(0);
nvinfer1::ITensor *one_batch_hidden = nullptr;
nvinfer1::ITensor *one_batch_cell = nullptr;
nvinfer1::ITensor *one_batch_output =
AddLSTMOneLoop(one_batch_input_state, lstm_weights, &one_batch_hidden, &one_batch_cell, is_backward);
if (one_batch_output == nullptr || one_batch_cell == nullptr || one_batch_hidden == nullptr) {
MS_LOG(ERROR) << "AddLSTMOneLoop failed for " << op_name_ << " at batch index " << batch_index;
return nullptr;
}
all_batch_outputs.push_back(one_batch_output);
all_batch_hidden.push_back(one_batch_hidden);
all_batch_cell.push_back(one_batch_cell);
}
*hidden_out = ConcateAll(all_batch_hidden, 1);
*cell_out = ConcateAll(all_batch_cell, 1);
return ConcateAll(all_batch_outputs, BATCH_SIZE_INDEX);
}
nvinfer1::ITensor *LSTMTensorRT::AddLSTMOneLoop(const LstmState &input_state, const LstmWeights &lstm_weights,
nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out,
bool is_backward) {
#if TRT_VERSION_GE(7, 0)
nvinfer1::ILoop *sequence_loop = network_->addLoop();
if (sequence_loop == nullptr) {
MS_LOG(ERROR) << "add sequence_loop layer failed for " << op_name_;
return nullptr;
}
std::string loop_name = op_name_ + "_loop" + (is_backward ? "_backward" : "_forward");
sequence_loop->setName(loop_name.c_str());
sequence_loop->addTripLimit(*sequence_size_input_, nvinfer1::TripLimit::kCOUNT);
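  // the loop runs once per timestep; the iterator feeds one sequence element per trip and reverses the order when is_backward is set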
nvinfer1::ITensor *input = sequence_loop->addIterator(*input_state.data_, 0, is_backward)->getOutput(0);
nvinfer1::ILayer *hidden_mid = sequence_loop->addRecurrence(*input_state.hidden_);
if (hidden_mid == nullptr) {
MS_LOG(ERROR) << "add hidden layer failed for " << op_name_;
return nullptr;
}
nvinfer1::ILayer *cell_mid = sequence_loop->addRecurrence(*input_state.cell_);
if (cell_mid == nullptr) {
MS_LOG(ERROR) << "add cell layer failed for " << op_name_;
return nullptr;
}
nvinfer1::ITensor *input_matmul =
network_
->addMatrixMultiply(*input, nvinfer1::MatrixOperation::kVECTOR, *lstm_weights.input_weights_,
nvinfer1::MatrixOperation::kTRANSPOSE)
->getOutput(0);
nvinfer1::ITensor *hidden_matmul =
network_
->addMatrixMultiply(*hidden_mid->getOutput(0), nvinfer1::MatrixOperation::kVECTOR, *lstm_weights.state_weights_,
nvinfer1::MatrixOperation::kTRANSPOSE)
->getOutput(0);
nvinfer1::ITensor *weights_add =
network_->addElementWise(*input_matmul, *hidden_matmul, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0);
nvinfer1::ITensor *bias =
network_->addElementWise(*lstm_weights.input_bias_, *lstm_weights.state_bias_, nvinfer1::ElementWiseOperation::kSUM)
->getOutput(0);
nvinfer1::ITensor *gates_calculate =
network_->addElementWise(*weights_add, *bias, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0);
const auto isolateGate = [&](nvinfer1::ITensor &gates, int gateIndex) -> nvinfer1::ITensor * {
nvinfer1::ISliceLayer *slice =
network_->addSlice(gates, nvinfer1::Dims{1, {gateIndex * params_.hidden_size_}},
nvinfer1::Dims{1, {params_.hidden_size_}}, nvinfer1::Dims{1, {1}});
return Reshape(slice->getOutput(0), nvinfer1::Dims{1, {params_.hidden_size_}});
};
// weight order: input, output, forget, cell
nvinfer1::ITensor *i =
network_->addActivation(*isolateGate(*gates_calculate, 0), nvinfer1::ActivationType::kSIGMOID)->getOutput(0);
nvinfer1::ITensor *o =
network_->addActivation(*isolateGate(*gates_calculate, 1), nvinfer1::ActivationType::kSIGMOID)->getOutput(0);
nvinfer1::ITensor *f =
network_->addActivation(*isolateGate(*gates_calculate, FORGET_GATE), nvinfer1::ActivationType::kSIGMOID)
->getOutput(0);
nvinfer1::ITensor *c =
network_->addActivation(*isolateGate(*gates_calculate, CELL_GATE), nvinfer1::ActivationType::kTANH)->getOutput(0);
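  // cell state update: C_t = f * C_{t-1} + i * c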
nvinfer1::ITensor *C =
network_
->addElementWise(
*network_->addElementWise(*f, *cell_mid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD)->getOutput(0),
*network_->addElementWise(*i, *c, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0),
nvinfer1::ElementWiseOperation::kSUM)
->getOutput(0);
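  // hidden state: H_t = o * tanh(C_t)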
nvinfer1::ITensor *H =
network_
->addElementWise(*o, *network_->addActivation(*C, nvinfer1::ActivationType::kTANH)->getOutput(0),
nvinfer1::ElementWiseOperation::kPROD)
->getOutput(0);
// Recurrent backedge input for hidden and cell.
cell_mid->setInput(1, *C);
hidden_mid->setInput(1, *H);
// outputs
nvinfer1::LoopOutput output_mode = is_backward ? nvinfer1::LoopOutput::kREVERSE : nvinfer1::LoopOutput::kCONCATENATE;
nvinfer1::ILoopOutputLayer *output_layer = sequence_loop->addLoopOutput(*H, output_mode);
output_layer->setInput(1, *lstm_weights.max_seq_size_);
*hidden_out =
Reshape(sequence_loop->addLoopOutput(*hidden_mid->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0),
nvinfer1::Dims3(1, 1, params_.hidden_size_));
*cell_out =
Reshape(sequence_loop->addLoopOutput(*cell_mid->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0),
nvinfer1::Dims3(1, 1, params_.hidden_size_));
return Reshape(output_layer->getOutput(0), nvinfer1::Dims4(params_.sequence_size_, 1, 1, params_.hidden_size_));
#else
MS_LOG(ERROR) << "low TensorRT version don't support LSTM op, please upgrade TensorRT version to 7 or higher";
return nullptr;
#endif
}
int LSTMTensorRT::Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine) {
if (op_binding_tensor_.size() == 0) {
MS_LOG(DEBUG) << "unsing serialized engine, add input tensor for " << op_name_;
mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT];
mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT];
op_binding_tensor_.push_back(BindingHelper{hidden_init_name_, hidden_in_init.MutableData(),
nvinfer1::DataType::kFLOAT, hidden_in_init.DataSize()});
op_binding_tensor_.push_back(
BindingHelper{cell_init_name_, cell_in_init.MutableData(), nvinfer1::DataType::kFLOAT, cell_in_init.DataSize()});
params_.sequence_size_ = in_tensors_[0].Shape()[0];
op_binding_tensor_.push_back(
BindingHelper{(op_name_ + "_seq_input"), &params_.sequence_size_, nvinfer1::DataType::kINT32, sizeof(int)});
}
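  // for every extra binding: allocate device memory, register it in the engine's binding table, and sync the host data to device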
for (auto tensor : op_binding_tensor_) {
auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor.name_, tensor.size_, tensor.data_type_);
if (device_ptr == nullptr) {
MS_LOG(ERROR) << "malloc for inputs tensor device memory failed " << tensor.name_;
return RET_ERROR;
}
int index = engine->getBindingIndex(tensor.name_.c_str());
network_tensor_bindings[index] = device_ptr;
runtime_->GetAllocator()->SyncMemInHostAndDevice(tensor.data_, tensor.name_, tensor.size_, true);
runtime_->GetAllocator()->MarkMemValid(tensor.name_, true);
}
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LSTM, LSTMTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,115 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_
#include <string>
#include <vector>
#include <array>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
constexpr int INPUT_TENSOR_SIZE = 6;
constexpr int OUTPUT_TENSOR_SIZE = 3;
constexpr int INPUT_WEIGHT = 1;
constexpr int STATE_WEIGHT = 2;
constexpr int BIAS = 3;
constexpr int HIDDEN_IN_TENSOR_INIT = 4;
constexpr int CELL_IN_TENSOR_INIT = 5;
constexpr int LSTM_GATE_NUM = 4;
constexpr int BIDIRECTIONAL = 2;
constexpr int OUTPUT_HIDDEN_INDEX = 1;
constexpr int OUTPUT_CELL_INDEX = 2;
constexpr int INPUT_SIZE_INDEX = 2;
constexpr int FORGET_GATE = 2;
constexpr int CELL_GATE = 3;
constexpr int BATCH_SIZE_INDEX = 2;
static const std::array<int, 4> INDICES{0, 1, 2, 3};
struct LSTMParams {
int sequence_size_;
int input_data_size_;
int batch_size_;
int layer_count_;
int hidden_size_;
nvinfer1::DataType data_type_;
int directional_cnt_;
};
struct LstmState {
nvinfer1::ITensor *data_{nullptr};
nvinfer1::ITensor *hidden_{nullptr};
nvinfer1::ITensor *cell_{nullptr};
};
struct LstmWeights {
nvinfer1::ITensor *input_weights_{nullptr};
nvinfer1::ITensor *state_weights_{nullptr};
nvinfer1::ITensor *input_bias_{nullptr};
nvinfer1::ITensor *state_bias_{nullptr};
nvinfer1::ITensor *max_seq_size_{nullptr};
};
class LSTMTensorRT : public TensorRTOp {
public:
LSTMTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~LSTMTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
int Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine) override;
private:
int PreProcess();
int AddLSTMLayers();
nvinfer1::ITensor *AddLSTMCell(const LstmState *layer_input_states, const LstmWeights *layer_weights,
LstmState *next_state);
nvinfer1::ITensor *Reshape(nvinfer1::ITensor *tensor, nvinfer1::Dims dims);
  nvinfer1::ITensor *ConcateAll(std::vector<nvinfer1::ITensor *> all_tensor, int axis = 0);
nvinfer1::ITensor *AddLSTMCalculation(const LstmState &input_state, const LstmWeights &lstm_weights,
nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out,
bool is_backward = false);
nvinfer1::ITensor *AddLSTMOneLoop(const LstmState &input_state, const LstmWeights &lstm_weights,
nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out,
bool is_backward = false);
int ParseLSTMCellInputs(int layer_index, nvinfer1::ITensor *hidden_init, nvinfer1::ITensor *cell_init,
LstmState *input_state, int *input_weight_offset, int *state_weight_offset, int *bias_offset,
LstmWeights *lstm_weights, const LstmState &next_state);
nvinfer1::INetworkDefinition *network_{nullptr};
nvinfer1::ITensor *input_data_{nullptr};
nvinfer1::ITensor *sequence_size_input_{nullptr};
nvinfer1::ITensor *op_data_out_{nullptr};
nvinfer1::ITensor *op_hidden_out_{nullptr};
nvinfer1::ITensor *op_cell_out_{nullptr};
LSTMParams params_;
std::string hidden_init_name_;
std::string cell_init_name_;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_

View File

@ -0,0 +1,202 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h"
#include <cuda_runtime.h>
#include <numeric>
#include <memory>
#include <functional>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
#include "NvInferRuntimeCommon.h"
namespace mindspore::lite {
REGISTER_TENSORRT_PLUGIN(MatmulOptPluginCreater);
template class TensorRTPluginCreater<MatmulOptPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
// MatmulOptPlugin
int MatmulOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace,
cudaStream_t stream) noexcept {
CHECK_NULL_RETURN(cublas_handle_);
CUBLAS_CHECK(cublasSetStream(cublas_handle_, stream));
const nvinfer1::PluginTensorDesc desc_a = inputDesc[0];
const nvinfer1::PluginTensorDesc desc_b = inputDesc[1];
const nvinfer1::PluginTensorDesc desc_c = outputDesc[0];
if (desc_a.dims.nbDims == DIMENSION_2D) {
// a: m * k, b: k * n, c: m * n
int m = desc_c.dims.d[0];
int n = desc_c.dims.d[1];
int k = b_trans_ ? desc_b.dims.d[1] : desc_b.dims.d[0];
const int mm_params[]{m, n, k};
CublasMM1Batch(inputs[0], inputs[1], outputs[0], mm_params, operations_, data_types_, cublas_handle_);
} else if (desc_a.dims.nbDims == DIMENSION_3D) {
return RunBatchedMatmul(inputDesc, outputDesc, inputs, outputs, workspace, stream);
} else {
MS_LOG(ERROR) << layer_name_ << " input dims needs check a: " << desc_a.dims.nbDims;
return RET_ERROR;
}
return RET_OK;
}
int MatmulOptPlugin::RunBatchedMatmul(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
const nvinfer1::PluginTensorDesc desc_b = inputDesc[1];
const nvinfer1::PluginTensorDesc desc_c = outputDesc[0];
int batch = desc_c.dims.d[0];
int m = desc_c.dims.d[1];
int n = desc_c.dims.d[DIMENSION_2D];
int k = b_trans_ ? desc_b.dims.d[DIMENSION_2D] : desc_b.dims.d[1];
const int mm_params[]{m, n, k, batch};
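  // collect per-batch matrix addresses on the host; the batched GEMM below consumes arrays of device pointers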
for (int i = 0; i < batch; i++) {
    a_addrs_[i] = static_cast<const char *>(inputs[0]) + i * m * k * sizeof(float);
    b_addrs_[i] = static_cast<const char *>(inputs[1]) + i * k * n * sizeof(float);
    c_addrs_[i] = static_cast<char *>(outputs[0]) + i * m * n * sizeof(float);
}
int data_size = batch * sizeof(void *);
int max_batchsize = a_addrs_.size();
if (a_device_addrs_ == nullptr) {
CUDA_CHECK(cudaMalloc(&a_device_addrs_, sizeof(void *) * max_batchsize));
}
if (b_device_addrs_ == nullptr) {
CUDA_CHECK(cudaMalloc(&b_device_addrs_, sizeof(void *) * max_batchsize));
}
if (c_device_addrs_ == nullptr) {
CUDA_CHECK(cudaMalloc(&c_device_addrs_, sizeof(void *) * max_batchsize));
}
CUDA_CHECK(cudaMemcpy(a_device_addrs_, a_addrs_.data(), data_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(b_device_addrs_, b_addrs_.data(), data_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(c_device_addrs_, c_addrs_.data(), data_size, cudaMemcpyHostToDevice));
CublasMMBatched(a_device_addrs_, b_device_addrs_, c_device_addrs_, mm_params, operations_, data_types_,
cublas_handle_);
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *MatmulOptPlugin::clone() const noexcept {
auto *plugin = new MatmulOptPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
nvinfer1::DimsExprs MatmulOptPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept {
nvinfer1::DimsExprs out_dims{};
if (nbInputs != INPUT_SIZE2 && nbInputs != INPUT_SIZE3) {
MS_LOG(ERROR) << "invalid input size " << nbInputs << " of " << layer_name_;
return out_dims;
}
out_dims.nbDims = inputs[0].nbDims;
if (out_dims.nbDims == DIMENSION_2D) {
out_dims.d[0] = a_trans_ ? inputs[0].d[1] : inputs[0].d[0];
out_dims.d[1] = b_trans_ ? inputs[1].d[0] : inputs[1].d[1];
return out_dims;
} else if (out_dims.nbDims == DIMENSION_3D) {
out_dims.d[0] = inputs[0].d[0];
out_dims.d[1] = a_trans_ ? inputs[0].d[DIMENSION_2D] : inputs[0].d[1];
out_dims.d[DIMENSION_2D] = b_trans_ ? inputs[1].d[1] : inputs[1].d[DIMENSION_2D];
return out_dims;
}
MS_LOG(ERROR) << "invalid input dims " << out_dims.nbDims << " of " << layer_name_;
return out_dims;
}
void MatmulOptPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept {
operations_[0] = a_trans_ ? CUBLAS_OP_T : CUBLAS_OP_N;
operations_[1] = b_trans_ ? CUBLAS_OP_T : CUBLAS_OP_N;
data_types_[0] = ConvertDataType(in[0].desc.type); // input a
data_types_[1] = ConvertDataType(in[1].desc.type); // input b
data_types_[THIRD_INPUT] = ConvertDataType(out[0].desc.type); // output c
data_types_[FOURTH_INPUT] =
(in[0].desc.type == nvinfer1::DataType::kHALF || in[1].desc.type == nvinfer1::DataType::kHALF)
? CUDA_R_16F
: CUDA_R_32F; // compute type
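  // for 3D (batched) inputs, pre-allocate host pointer vectors and device pointer arrays sized for the maximum batch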
if (in[0].max.nbDims == DIMENSION_3D) {
int max_batchsize = in[0].max.d[0];
a_addrs_.resize(max_batchsize);
b_addrs_.resize(max_batchsize);
c_addrs_.resize(max_batchsize);
if (a_device_addrs_ == nullptr) {
CUDA_CHECK_VOID(cudaMalloc(&a_device_addrs_, sizeof(void *) * max_batchsize));
}
if (b_device_addrs_ == nullptr) {
CUDA_CHECK_VOID(cudaMalloc(&b_device_addrs_, sizeof(void *) * max_batchsize));
}
if (c_device_addrs_ == nullptr) {
CUDA_CHECK_VOID(cudaMalloc(&c_device_addrs_, sizeof(void *) * max_batchsize));
}
}
}
int MatmulOptPlugin::initialize() noexcept {
if (cublas_handle_ == nullptr) {
CUBLAS_CHECK(cublasCreate(&cublas_handle_));
}
for (int i = 0; i < DIMENSION_4D; i++) {
if (data_types_[i] != CUDA_R_32F) {
MS_LOG(ERROR) << layer_name_ << " only support fp32";
return RET_ERROR;
}
}
  return RET_OK;
}
void MatmulOptPlugin::terminate() noexcept {
if (cublas_handle_ != nullptr) {
auto cublas_ret = cublasDestroy(cublas_handle_);
if (cublas_ret != CUBLAS_STATUS_SUCCESS) {
MS_LOG(ERROR) << "cublasDestroy failed: " << cublas_ret;
} else {
cublas_handle_ = nullptr;
}
}
cudaError_t err;
if (a_device_addrs_ != nullptr) {
err = cudaFree(a_device_addrs_);
if (err != cudaSuccess) {
MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err;
}
a_device_addrs_ = nullptr;
}
if (b_device_addrs_ != nullptr) {
err = cudaFree(b_device_addrs_);
if (err != cudaSuccess) {
MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err;
}
b_device_addrs_ = nullptr;
}
if (c_device_addrs_ != nullptr) {
err = cudaFree(c_device_addrs_);
if (err != cudaSuccess) {
MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err;
}
c_device_addrs_ = nullptr;
}
}
size_t MatmulOptPlugin::getSerializationSize() const noexcept { return 2 * sizeof(bool); }
void MatmulOptPlugin::serialize(void *buffer) const noexcept {
SerializeValue(&buffer, &a_trans_, sizeof(bool));
SerializeValue(&buffer, &b_trans_, sizeof(bool));
}
} // namespace mindspore::lite

View File

@ -0,0 +1,80 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h"
namespace mindspore::lite {
constexpr const char *MATMUL_OPT_PLUGIN_NAME{"MatmulOptPlugin"};
class MatmulOptPlugin : public TensorRTPlugin {
public:
MatmulOptPlugin(const std::string name, bool a_trans, bool b_trans, uint32_t device_id)
: TensorRTPlugin(name, std::string(MATMUL_OPT_PLUGIN_NAME), device_id), a_trans_(a_trans), b_trans_(b_trans) {}
MatmulOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(MATMUL_OPT_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
a_trans_ = static_cast<const bool *>(fields[0].data)[0];
b_trans_ = static_cast<const bool *>(fields[1].data)[0];
}
MatmulOptPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(MATMUL_OPT_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &a_trans_, sizeof(bool));
DeserializeValue(&serialData, &serialLength, &b_trans_, sizeof(bool));
}
MatmulOptPlugin() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) noexcept override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
int initialize() noexcept override;
void terminate() noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void *buffer) const noexcept override;
private:
int RunBatchedMatmul(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream);
bool a_trans_{false};
bool b_trans_{false};
cublasHandle_t cublas_handle_{nullptr};
cublasOperation_t operations_[2]{CUBLAS_OP_N, CUBLAS_OP_N};
cudaDataType data_types_[4]{CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F};
std::vector<const void *> a_addrs_;
std::vector<const void *> b_addrs_;
std::vector<void *> c_addrs_;
void **a_device_addrs_{nullptr};
void **b_device_addrs_{nullptr};
void **c_device_addrs_{nullptr};
};
class MatmulOptPluginCreater : public TensorRTPluginCreater<MatmulOptPlugin> {
public:
MatmulOptPluginCreater() : TensorRTPluginCreater(std::string(MATMUL_OPT_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_

View File

@ -0,0 +1,310 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/matmul_tensorrt.h"
#include <memory>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
#include "src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h"
#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h"
namespace mindspore::lite {
MatMulTensorRT::~MatMulTensorRT() {
if (weight_ptr_ != nullptr) {
free(weight_ptr_);
weight_ptr_ = nullptr;
}
}
int MatMulTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
return RET_OK;
}
int MatMulTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (type_ == schema::PrimitiveType_MatMulFusion) {
auto primitive = this->GetPrimitive()->value_as_MatMulFusion();
if (primitive == nullptr) {
MS_LOG(ERROR) << "convert to primitive matmul failed for " << op_name_;
return RET_ERROR;
}
transpose_a_ = primitive->transpose_a();
transpose_b_ = primitive->transpose_b();
activation_ = primitive->activation_type();
}
nvinfer1::ITensor *out_tensor = nullptr;
if (RunOptPlugin()) {
out_tensor = AddAsOptPlugin(ctx);
} else if (RunFullConnect()) {
MS_LOG(DEBUG) << "use fully connected instead of matmul for " << op_name_;
out_tensor = AddAsFullConnect(ctx);
} else {
MS_LOG(DEBUG) << "use origin tensorrt matmul for " << op_name_;
out_tensor = AddAsMatmul(ctx);
}
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "add matmul failed for " << op_name_;
return RET_ERROR;
}
// add activation
if (activation_ != schema::ActivationType::ActivationType_NO_ACTIVATION) {
nvinfer1::ILayer *activation_layer =
ActivationTensorRT::AddActivation(ctx, activation_, 0, 0, 0, out_tensor, device_id_);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "addActivation for matmul failed";
return RET_ERROR;
}
activation_layer->setName((op_name_ + "_activation").c_str());
out_tensor = activation_layer->getOutput(0);
}
out_tensor->setName((op_name_ + "_output").c_str());
MS_LOG(DEBUG) << "output " << GetTensorFormat(out_tensor, out_format_, true);
this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format_});
return RET_OK;
}
int MatMulTensorRT::PreprocessMatMulInputs(TensorRTContext *ctx, ITensorHelper *matmul_a, ITensorHelper *matmul_b) {
if (tensorrt_in_tensors_.size() == INPUT_SIZE2) {
int a_index =
GetDimsVolume(tensorrt_in_tensors_[0].trt_tensor_->getDimensions()) == GetDimsVolume(in_tensors_[0].Shape()) ? 0
: 1;
int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[a_index], matmul_a);
ret += PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - a_index], matmul_b);
if (ret != RET_OK || matmul_a->trt_tensor_ == nullptr || matmul_b->trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim of matmul inputs failed for " << op_name_;
return ret;
}
out_format_ = matmul_a->format_;
if (matmul_a->format_ != matmul_b->format_) {
MS_LOG(WARNING) << "matmul input tensor has different format " << op_name_;
out_format_ = Format::NHWC;
}
} else if (tensorrt_in_tensors_.size() == 1) {
auto weight = ProcessWeightTensor(ctx);
if (weight == nullptr) {
MS_LOG(ERROR) << "create constant weight tensor failed for " << op_name_;
return RET_ERROR;
}
int weight_index = in_tensors_[1].Data() != nullptr ? 1 : 0;
ITensorHelper *weight_helper = (weight_index == 1) ? matmul_b : matmul_a;
ITensorHelper *var_helper = (weight_index == 1) ? matmul_a : matmul_b;
weight_helper->trt_tensor_ = weight;
int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - weight_index], var_helper);
if (ret != RET_OK || var_helper->trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim of matmul input var_helper failed for " << op_name_;
return ret;
}
out_format_ = var_helper->format_;
} else {
MS_LOG(ERROR) << op_name_ << " tensorrt in tensor size is invalid " << tensorrt_in_tensors_.size();
return RET_ERROR;
}
return RET_OK;
}
nvinfer1::ITensor *MatMulTensorRT::ProcessWeightTensor(TensorRTContext *ctx) {
nvinfer1::ITensor *weight = nullptr;
int weight_index = in_tensors_[1].Data() != nullptr ? 1 : 0;
if (in_tensors_[weight_index].Shape().size() <
static_cast<size_t>(tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims)) {
std::vector<int64_t> expect_shape(in_tensors_[1 - weight_index].Shape().size(), 1);
auto origin_shape = in_tensors_[weight_index].Shape();
for (int i = 0; i < origin_shape.size(); i++) {
expect_shape[expect_shape.size() - 1 - i] = origin_shape[origin_shape.size() - 1 - i];
}
weight = ConvertTensorWithExpandDims(ctx, in_tensors_[weight_index], expect_shape, op_name_);
} else if (in_tensors_[weight_index].Shape().size() ==
static_cast<size_t>(tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims)) {
weight = ConvertConstantTensor(ctx, in_tensors_[weight_index], op_name_);
} else {
MS_LOG(ERROR) << "input tensor shape is invalid for " << op_name_;
return nullptr;
}
return weight;
}
nvinfer1::ITensor *MatMulTensorRT::AddAsMatmul(TensorRTContext *ctx) {
ITensorHelper matmul_a;
ITensorHelper matmul_b;
int ret = PreprocessMatMulInputs(ctx, &matmul_a, &matmul_b);
if (ret != RET_OK || matmul_a.trt_tensor_ == nullptr || matmul_b.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessMatMulInputs matmul failed for " << op_name_;
return nullptr;
}
MS_LOG(DEBUG) << "matmul input a " << GetTensorFormat(matmul_a);
MS_LOG(DEBUG) << "matmul input b " << GetTensorFormat(matmul_b);
auto matmul_layer = ctx->network()->addMatrixMultiply(
*matmul_a.trt_tensor_, transpose_a_ ? nvinfer1::MatrixOperation::kTRANSPOSE : nvinfer1::MatrixOperation::kNONE,
*matmul_b.trt_tensor_, transpose_b_ ? nvinfer1::MatrixOperation::kTRANSPOSE : nvinfer1::MatrixOperation::kNONE);
if (matmul_layer == nullptr) {
MS_LOG(ERROR) << "addMatrixMultiply failed for " << op_name_;
return nullptr;
}
this->layer_ = matmul_layer;
matmul_layer->setName(op_name_.c_str());
return AddBias(ctx, matmul_layer->getOutput(0));
}
nvinfer1::ITensor *MatMulTensorRT::AddAsFullConnect(TensorRTContext *ctx) {
nvinfer1::Weights weight;
nvinfer1::Weights bias = ConvertWeight(in_tensors_[kBiasIndex]);
nvinfer1::ITensor *input_a = tensorrt_in_tensors_[0].trt_tensor_;
out_format_ = tensorrt_in_tensors_[0].format_;
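  // expand a lower-rank activation to 4D (trailing dims set to 1) so the fully connected layer sees a consistent layout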
if (input_a->getDimensions().nbDims != DIMENSION_4D) {
nvinfer1::Dims in_dims(input_a->getDimensions());
in_dims.nbDims = DIMENSION_4D;
for (int i = input_a->getDimensions().nbDims; i < DIMENSION_4D; i++) {
in_dims.d[i] = 1;
}
input_a = Reshape(ctx, input_a, in_dims);
if (input_a == nullptr) {
MS_LOG(ERROR) << "reshape input failed for " << op_name_;
return nullptr;
}
MS_LOG(DEBUG) << "full connect expand input a to " << GetTensorFormat(input_a);
} else {
ITensorHelper tmp_input;
int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &tmp_input);
if (ret != RET_OK || tmp_input.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "rPreprocessInputs2SameDim failed for " << op_name_;
return nullptr;
}
input_a = tmp_input.trt_tensor_;
out_format_ = tmp_input.format_;
MS_LOG(DEBUG) << "full connect preprocess input a to " << GetTensorFormat(tmp_input);
}
if (!transpose_b_) {
// transpose weight
weight = TransposeWeight2D(in_tensors_[1], &weight_ptr_);
if (weight.values == nullptr || weight_ptr_ == nullptr) {
MS_LOG(ERROR) << "TransposeWeight2D input weight failed for " << op_name_;
return nullptr;
}
} else {
weight = ConvertWeight(in_tensors_[1]);
}
int output_cnt = in_tensors_[kBiasIndex].Shape()[0];
auto fc_layer = ctx->network()->addFullyConnected(*input_a, output_cnt, weight, bias);
if (fc_layer == nullptr) {
MS_LOG(ERROR) << "add fully connected layer failed for " << op_name_;
return nullptr;
}
this->layer_ = fc_layer;
fc_layer->setName((op_name_ + "_fullyconnected").c_str());
nvinfer1::ITensor *out_tensor = fc_layer->getOutput(0);
if (out_tensor->getDimensions().nbDims != out_tensors_[0].Shape().size()) {
std::vector<int64_t> out_dims(out_tensors_[0].Shape());
out_dims[0] = out_tensor->getDimensions().d[0];
out_tensor = Reshape(ctx, out_tensor, out_dims);
}
return out_tensor;
}
nvinfer1::ITensor *MatMulTensorRT::AddAsOptPlugin(TensorRTContext *ctx) {
nvinfer1::ITensor *weight_tensor = nullptr;
if (tensorrt_in_tensors_.size() >= INPUT_SIZE2) {
weight_tensor = tensorrt_in_tensors_[1].trt_tensor_;
} else {
weight_tensor = ConvertConstantTensor(ctx, in_tensors_[1], op_name_);
}
auto plugin = std::make_shared<MatmulOptPlugin>(op_name_, transpose_a_, transpose_b_, device_id_);
if (plugin == nullptr) {
MS_LOG(ERROR) << "create MatmulOptPlugin failed for " << op_name_;
return nullptr;
}
nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, weight_tensor};
nvinfer1::IPluginV2Layer *matmul_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin);
if (matmul_layer == nullptr) {
MS_LOG(ERROR) << "add matmul opt plugin layer failed for " << op_name_;
return nullptr;
}
layer_ = matmul_layer;
return AddBias(ctx, matmul_layer->getOutput(0));
}
nvinfer1::ITensor *MatMulTensorRT::AddBias(TensorRTContext *ctx, nvinfer1::ITensor *input_tensor) {
nvinfer1::ITensor *out_tensor = input_tensor;
if (in_tensors_.size() == kBiasIndex + 1) {
nvinfer1::ITensor *bias = nullptr;
if (in_tensors_[kBiasIndex].Shape().size() < static_cast<size_t>(out_tensor->getDimensions().nbDims)) {
std::vector<int64_t> expect_dims(out_tensors_[0].Shape());
expect_dims[0] = out_tensor->getDimensions().d[0];
bias = ConvertTensorWithExpandDims(ctx, in_tensors_[kBiasIndex], expect_dims, op_name_);
} else if (in_tensors_[kBiasIndex].Shape().size() == static_cast<size_t>(out_tensor->getDimensions().nbDims)) {
bias = ConvertConstantTensor(ctx, in_tensors_[kBiasIndex], op_name_);
} else {
MS_LOG(ERROR) << "input tensor shape is invalid for " << op_name_;
return nullptr;
}
if (bias == nullptr) {
MS_LOG(ERROR) << "create constant bias tensor failed for " << op_name_;
return nullptr;
}
auto bias_layer = ctx->network()->addElementWise(*out_tensor, *bias, nvinfer1::ElementWiseOperation::kSUM);
if (bias_layer == nullptr) {
MS_LOG(ERROR) << "add bias add layer failed for " << op_name_;
return nullptr;
}
auto bias_layer_name = op_name_ + "_bias";
bias_layer->setName(bias_layer_name.c_str());
out_tensor = bias_layer->getOutput(0);
}
return out_tensor;
}
bool MatMulTensorRT::RunOptPlugin() {
if (quant_type_ == schema::QuantType_QUANT_NONE &&
runtime_->GetRuntimePrecisionMode() == RuntimePrecisionMode::RuntimePrecisionMode_FP32) {
if (in_tensors_[0].Shape().size() == DIMENSION_2D && in_tensors_[1].Shape().size() == DIMENSION_2D &&
in_tensors_[0].Shape()[0] > 1 && tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0] == -1) {
MS_LOG(INFO) << op_name_ << " uses optimize matmul plugin for 2D dynamic batchsize";
return true;
} else if (in_tensors_[0].Shape().size() == DIMENSION_3D && in_tensors_[1].Shape().size() == DIMENSION_3D) {
// batched matmul using opt
MS_LOG(INFO) << op_name_ << " uses optimize matmul plugin for 3D batchsized";
return true;
}
}
return false;
}
bool MatMulTensorRT::RunFullConnect() {
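  // the fully connected path needs constant 2D weight and bias data, no transpose on input a, and a 2D or 4D activation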
if (in_tensors_.size() == INPUT_SIZE3 && in_tensors_[1].Data() != nullptr &&
in_tensors_[kBiasIndex].Data() != nullptr && !transpose_a_ && in_tensors_[1].Shape().size() == DIMENSION_2D &&
(in_tensors_[0].Shape().size() == DIMENSION_2D || in_tensors_[0].Shape().size() == DIMENSION_4D)) {
return true;
}
return false;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MatMulFusion, MatMulTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,62 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_
#include <utility>
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class MatMulTensorRT : public TensorRTOp {
public:
MatMulTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~MatMulTensorRT() override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
int AddInnerOp(TensorRTContext *ctx) override;
private:
int PreprocessMatMulInputs(TensorRTContext *ctx, ITensorHelper *matmul_a, ITensorHelper *matmul_b);
nvinfer1::ITensor *ProcessWeightTensor(TensorRTContext *ctx);
nvinfer1::ITensor *AddAsMatmul(TensorRTContext *ctx);
nvinfer1::ITensor *AddAsFullConnect(TensorRTContext *ctx);
nvinfer1::ITensor *AddAsOptPlugin(TensorRTContext *ctx);
nvinfer1::ITensor *AddBias(TensorRTContext *ctx, nvinfer1::ITensor *input_tensor);
bool RunOptPlugin();
bool RunFullConnect();
bool transpose_a_{false};
bool transpose_b_{false};
Format out_format_{Format::NHWC};
schema::ActivationType activation_{schema::ActivationType::ActivationType_NO_ACTIVATION};
void *weight_ptr_{nullptr};
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_

View File

@ -0,0 +1,59 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h"
#include <cuda_runtime.h>
#include <numeric>
#include <memory>
#include <functional>
#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h"
#include "NvInferRuntimeCommon.h"
#include "src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh"
namespace mindspore::lite {
REGISTER_TENSORRT_PLUGIN(NormalizeOptPluginCreater);
template class TensorRTPluginCreater<NormalizeOptPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int NormalizeOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) noexcept {
auto input = static_cast<const float *>(inputs[0]);
auto gamma = static_cast<const float *>(inputs[1]);
auto beta = static_cast<const float *>(inputs[2]);
auto output = static_cast<float *>(outputs[0]);
auto input_dims = inputDesc[0].dims;
size_t dim_at_axis = input_dims.d[axis_];
int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies<int64_t>());
  Normalize(input, gamma, beta, output, dim_at_axis, epsilion_, element_cnt, stream);
  return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *NormalizeOptPlugin::clone() const noexcept {
auto *plugin = new NormalizeOptPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
size_t NormalizeOptPlugin::getSerializationSize() const noexcept { return sizeof(size_t) + sizeof(float); }
void NormalizeOptPlugin::serialize(void *buffer) const noexcept {
SerializeValue(&buffer, &axis_, sizeof(size_t));
SerializeValue(&buffer, &epsilion_, sizeof(float));
}
} // namespace mindspore::lite

View File

@ -0,0 +1,61 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_OPT_PLUGIN_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_OPT_PLUGIN_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
namespace mindspore::lite {
constexpr const char *NORMALIZE_OPT_PLUGIN_NAME{"NormalizeOptPlugin"};
class NormalizeOptPlugin : public TensorRTPlugin {
public:
NormalizeOptPlugin(const std::string name, size_t axis, float epsilion, uint32_t device_id)
: TensorRTPlugin(name, std::string(NORMALIZE_OPT_PLUGIN_NAME), device_id), axis_(axis), epsilion_(epsilion) {}
NormalizeOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(NORMALIZE_OPT_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
axis_ = static_cast<const size_t *>(fields[0].data)[0];
epsilion_ = static_cast<const float *>(fields[1].data)[0];
}
NormalizeOptPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(NORMALIZE_OPT_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &axis_, sizeof(size_t));
DeserializeValue(&serialData, &serialLength, &epsilion_, sizeof(float));
}
NormalizeOptPlugin() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void *buffer) const noexcept override;
private:
size_t axis_{0};
float epsilion_{0.0f};
};
class NormalizeOptPluginCreater : public TensorRTPluginCreater<NormalizeOptPlugin> {
public:
NormalizeOptPluginCreater() : TensorRTPluginCreater(std::string(NORMALIZE_OPT_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_OPT_PLUGIN_H_

View File

@ -0,0 +1,178 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/normalize_tensorrt.h"
#include <functional>
#include <memory>
#include <numeric>
#include "src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h"
namespace mindspore::lite {
int NormalizeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE3 && in_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != INPUT_SIZE3 && out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
auto norm_op = primitive->value_as_LayerNormFusion();
CHECK_NULL_RETURN(norm_op);
  int begin_norm_axis = norm_op->begin_norm_axis();
  begin_norm_axis = begin_norm_axis >= 0 ? begin_norm_axis : in_tensors[0].Shape().size() + begin_norm_axis;
  int begin_params_axis = norm_op->begin_params_axis();
  begin_params_axis = begin_params_axis >= 0 ? begin_params_axis : in_tensors[0].Shape().size() + begin_params_axis;
  if (begin_params_axis != begin_norm_axis || begin_params_axis != in_tensors[0].Shape().size() - 1) {
    MS_LOG(ERROR) << "only supports normalization on the last dim, begin_norm_axis is " << begin_norm_axis << " for "
                  << op_name_;
return RET_ERROR;
}
axis_ = begin_params_axis;
epsilon_ = norm_op->epsilon();
return RET_OK;
}
int NormalizeTensorRT::AddInnerOp(TensorRTContext *ctx) {
CHECK_NULL_RETURN(ctx->network());
int ret = PreprocessInputs(ctx);
if (ret != RET_OK) {
MS_LOG(ERROR) << "preprocess input failed for " << op_name_;
return ret;
}
return RunOptPlugin() ? RunAsOptPlugin(ctx) : RunAsTrtOps(ctx);
}
int NormalizeTensorRT::PreprocessInputs(TensorRTContext *ctx) {
int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &norm_input_);
if (ret != RET_OK || norm_input_.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim norm_input failed for " << op_name_;
return RET_ERROR;
}
if (in_tensors_.size() == BETA_INDEX + 1) {
gamma_ = ConvertTensorWithExpandDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_ + in_tensors_[1].Name());
CHECK_NULL_RETURN(gamma_);
beta_ = ConvertTensorWithExpandDims(ctx, in_tensors_[BETA_INDEX], in_tensors_[0].Shape(),
op_name_ + in_tensors_[BETA_INDEX].Name());
CHECK_NULL_RETURN(beta_);
}
return RET_OK;
}
int NormalizeTensorRT::RunAsOptPlugin(TensorRTContext *ctx) {
auto plugin = std::make_shared<NormalizeOptPlugin>(op_name_, axis_, epsilon_, device_id_);
if (plugin == nullptr) {
MS_LOG(ERROR) << "create NormalizeOptPlugin failed for " << op_name_;
return RET_ERROR;
}
nvinfer1::ITensor *inputTensors[] = {norm_input_.trt_tensor_, gamma_, beta_};
nvinfer1::IPluginV2Layer *norm_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE3, *plugin);
if (norm_layer == nullptr) {
MS_LOG(ERROR) << "add norm opt plugin layer failed for " << op_name_;
return RET_ERROR;
}
layer_ = norm_layer;
layer_->setName(op_name_.c_str());
AddInnerOutTensors(ITensorHelper{norm_layer->getOutput(0), norm_input_.format_, norm_input_.same_format_});
return RET_OK;
}
int NormalizeTensorRT::RunAsTrtOps(TensorRTContext *ctx) {
size_t axis = 1u << axis_;
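  // layer norm built from TensorRT primitives: y = gamma * (x - mean) / sqrt(var + epsilon) + beta, reduced over the chosen axis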
// first output, add later
AddInnerOutTensors(ITensorHelper{nullptr, norm_input_.format_, norm_input_.same_format_});
// mean
auto mean =
ctx->network()->addReduce(*(norm_input_.trt_tensor_), nvinfer1::ReduceOperation::kAVG, axis, true)->getOutput(0);
CHECK_NULL_RETURN(mean);
if (out_tensors_.size() == INPUT_SIZE3) {
AddInnerOutTensors(ITensorHelper{mean, norm_input_.format_, norm_input_.same_format_});
}
// x - mean
auto sub_mean = ctx->network()
->addElementWise(*(norm_input_.trt_tensor_), *mean, nvinfer1::ElementWiseOperation::kSUB)
->getOutput(0);
CHECK_NULL_RETURN(sub_mean);
// (x - mean)^2
auto const_two =
ConvertScalarToITensor(ctx, in_tensors_[0].Shape().size(), &two_, DataType::kNumberTypeFloat32, op_name_ + "_two");
CHECK_NULL_RETURN(const_two);
auto pow = ctx->network()->addElementWise(*sub_mean, *const_two, nvinfer1::ElementWiseOperation::kPOW)->getOutput(0);
CHECK_NULL_RETURN(pow);
// mean of (x - mean)^2
auto var = ctx->network()->addReduce(*pow, nvinfer1::ReduceOperation::kAVG, axis, true)->getOutput(0);
CHECK_NULL_RETURN(var);
if (out_tensors_.size() == INPUT_SIZE3) {
AddInnerOutTensors(ITensorHelper{var, norm_input_.format_, norm_input_.same_format_});
}
// var + min epsilon
auto const_epsilon = ConvertScalarToITensor(ctx, in_tensors_[0].Shape().size(), &epsilon_,
DataType::kNumberTypeFloat32, op_name_ + "_epsilion");
CHECK_NULL_RETURN(const_epsilon);
auto var_epsilon =
ctx->network()->addElementWise(*var, *const_epsilon, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0);
CHECK_NULL_RETURN(var_epsilon);
// standard deviation
auto std_dev = ctx->network()->addUnary(*var_epsilon, nvinfer1::UnaryOperation::kSQRT)->getOutput(0);
CHECK_NULL_RETURN(std_dev);
// sub_mean / std_dev
auto norm_layer = ctx->network()->addElementWise(*sub_mean, *std_dev, nvinfer1::ElementWiseOperation::kDIV);
CHECK_NULL_RETURN(norm_layer);
this->layer_ = norm_layer;
auto norm = norm_layer->getOutput(0);
CHECK_NULL_RETURN(norm);
// scale with gamma and beta
if (gamma_ != nullptr && beta_ != nullptr) {
auto gamma_out =
ctx->network()->addElementWise(*norm, *gamma_, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0);
CHECK_NULL_RETURN(gamma_out);
auto beta_out =
ctx->network()->addElementWise(*gamma_out, *beta_, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0);
CHECK_NULL_RETURN(beta_out);
tensorrt_out_tensors_[0].trt_tensor_ = beta_out;
} else {
tensorrt_out_tensors_[0].trt_tensor_ = norm;
}
return RET_OK;
}
bool NormalizeTensorRT::RunOptPlugin() {
if (out_tensors_.size() == 1 && in_tensors_.size() == INPUT_SIZE3 && axis_ == in_tensors_[0].Shape().size() - 1 &&
in_tensors_[0].Shape()[axis_] < GET_THREADS) {
// insufficient shared memory
int dim_sum = std::accumulate(in_tensors_[0].Shape().begin(), in_tensors_[0].Shape().begin() + axis_, 1,
std::multiplies<int>());
const int kSharedMemoryThreshold = 2048;
if (dim_sum > kSharedMemoryThreshold) {
return false;
}
MS_LOG(INFO) << op_name_ << " use opt plugin";
return true;
}
return false;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LayerNormFusion, NormalizeTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,56 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
constexpr int BETA_INDEX = 2;
class NormalizeTensorRT : public TensorRTOp {
public:
NormalizeTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~NormalizeTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
int PreprocessInputs(TensorRTContext *ctx);
int RunAsOptPlugin(TensorRTContext *ctx);
int RunAsTrtOps(TensorRTContext *ctx);
bool RunOptPlugin();
ITensorHelper norm_input_;
nvinfer1::ITensor *gamma_{nullptr};
nvinfer1::ITensor *beta_{nullptr};
size_t axis_{0};
const float two_{2.0f};
float epsilon_{0.0f};
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_

View File

@ -0,0 +1,140 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <numeric>
#include <functional>
#include "src/runtime/delegate/tensorrt/op/pad_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
int PadTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
if (in_tensors_[1].Data() == nullptr) {
MS_LOG(ERROR) << "invalid pad tensor for: " << op_name_;
return RET_ERROR;
}
auto pad_primitive = this->GetPrimitive()->value_as_PadFusion();
if (pad_primitive == nullptr) {
MS_LOG(ERROR) << "convert PadFusion failed: " << op_name_;
return RET_ERROR;
}
schema::PaddingMode padding_mode = pad_primitive->padding_mode();
if (padding_mode != schema::PaddingMode::PaddingMode_CONSTANT) {
MS_LOG(ERROR) << "Unsupported padding mode: " << schema::PaddingMode(padding_mode) << ", for op: " << op_name_;
return RET_ERROR;
}
if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) {
MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format();
return RET_ERROR;
}
constant_value_ = pad_primitive->constant_value();
return RET_OK;
}
int PadTensorRT::AddInnerOp(TensorRTContext *ctx) {
mindspore::MSTensor &pad_tensor = in_tensors_[1];
int element_cnt = std::accumulate(pad_tensor.Shape().begin(), pad_tensor.Shape().end(), 1, std::multiplies<int>());
if (element_cnt != tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims * INPUT_SIZE2) {
MS_LOG(ERROR) << "pad tensor cnt is invalid. cnt: " << element_cnt
<< ", input tensor dims cnt: " << tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims;
return RET_ERROR;
}
nvinfer1::ITensor *pad_input = tensorrt_in_tensors_[0].trt_tensor_;
MS_LOG(DEBUG) << "before transpose "
<< GetTensorFormat(pad_input, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_);
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
this->transpose_layer_ = transpose_layer_in;
pad_input = transpose_layer_in->getOutput(0);
MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(pad_input, Format::NCHW, false);
}
  // TensorRT 6 only supports 2D padding
const int *padding_data = reinterpret_cast<const int *>(in_tensors_[1].Data().get());
MS_ASSERT(padding_data);
nvinfer1::IPaddingLayer *padding_layer = nullptr;
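  // a 4D tensor carries one (pre, post) pad pair per dimension, i.e. 4 * 2 = 8 pad values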
if (element_cnt == index_NHWC_ * INPUT_SIZE2) {
// only support pad at HW index
int h_pre;
int h_post;
int w_pre;
int w_post;
if (SameDims(pad_input->getDimensions(), in_tensors_[0].Shape())) {
// NCHW: 0: N_pre, 1: N_post, 2: C_pre, 3: C_post, 4: H_pre, 5: H_post, 6: W_pre, 7: W_post
if (*padding_data != 0 || *(padding_data + 1) != 0 || *(padding_data + 2) != 0 || *(padding_data + 3) != 0) {
MS_LOG(WARNING) << "tensorrt padding only support pad at HW index, unsupported padding value of: " << op_name_;
}
h_pre = 4;
h_post = 5;
w_pre = 6;
w_post = 7;
} else {
// NHWC: 0: N_pre, 1: N_post, 2: H_pre, 3: H_post, 4: W_pre, 5: W_post, 6: C_pre, 7: C_post
if (*padding_data != 0 || *(padding_data + 1) != 0 || *(padding_data + 6) != 0 || *(padding_data + 7) != 0) {
MS_LOG(WARNING) << "tensorrt padding only support pad at HW index, unsupported padding value of: " << op_name_;
}
h_pre = 2;
h_post = 3;
w_pre = 4;
w_post = 5;
}
nvinfer1::DimsHW prePadding{*(padding_data + h_pre), *(padding_data + w_pre)};
nvinfer1::DimsHW postPadding{*(padding_data + h_post), *(padding_data + w_post)};
MS_LOG(DEBUG) << op_name_ << " prePadding: " << prePadding.d[0] << ", " << prePadding.d[1]
<< "; postPadding: " << postPadding.d[0] << ", " << postPadding.d[1];
padding_layer = ctx->network()->addPadding(*pad_input, prePadding, postPadding);
} else {
MS_LOG(ERROR) << "need check for pad_tensor dims: " << op_name_
<< ", pad_tensor ElementNum: " << pad_tensor.ElementNum();
return RET_ERROR;
}
if (padding_layer == nullptr) {
MS_LOG(ERROR) << "add padding layer failed for " << op_name_;
return RET_ERROR;
}
this->layer_ = padding_layer;
padding_layer->setName(op_name_.c_str());
padding_layer->getOutput(0)->setName((op_name_ + "_output").c_str());
bool same_format = SameDims(padding_layer->getOutput(0)->getDimensions(), out_tensors_[0].Shape()) &&
SameDims(tensorrt_in_tensors_[0].trt_tensor_->getDimensions(), in_tensors_[0].Shape());
this->AddInnerOutTensors(ITensorHelper{padding_layer->getOutput(0), Format::NCHW, same_format});
MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(tensorrt_out_tensors_[0]);
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PadFusion, PadTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,42 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class PadTensorRT : public TensorRTOp {
public:
PadTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~PadTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
const int index_NHWC_ = 4;
float constant_value_ = 0.0f;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_

View File

@ -0,0 +1,220 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/pool_tensorrt.h"
#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
int PoolTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) {
MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format();
return RET_ERROR;
}
return RET_OK;
}
int PoolTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (tensorrt_in_tensors_.size() != 1) {
MS_LOG(ERROR) << "invalid input tensor size: " << tensorrt_in_tensors_.size();
return RET_ERROR;
}
MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(tensorrt_in_tensors_[0]);
int ret = ParseParams();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ParseParams failed for : " << op_name_;
return RET_ERROR;
}
nvinfer1::ITensor *pool_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
this->transpose_layer_ = transpose_layer_in;
pool_input = transpose_layer_in->getOutput(0);
}
// pooling layer
nvinfer1::Dims windowSize = lite::ConvertCudaDims(kernel_size_);
if (windowSize.nbDims == -1) {
MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_;
return RET_ERROR;
}
nvinfer1::IPoolingLayer *pooling_layer = ctx->network()->addPoolingNd(*pool_input, pooling_type_, windowSize);
if (pooling_layer == nullptr) {
MS_LOG(ERROR) << "addPoolingNd failed for TensorRT.";
return RET_ERROR;
}
AddParams(pooling_layer);
pooling_layer->setName(op_name_.c_str());
this->layer_ = pooling_layer;
// add activation
nvinfer1::ILayer *activation_layer = nullptr;
if (activation_type_ == schema::ActivationType::ActivationType_NO_ACTIVATION) {
activation_layer = pooling_layer;
} else {
activation_layer =
ActivationTensorRT::AddActivation(ctx, activation_type_, 0, 0, 0, pooling_layer->getOutput(0), device_id_);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "addActivation for pool failed";
return RET_ERROR;
}
activation_layer->setName((op_name_ + "_activation").c_str());
}
nvinfer1::ITensor *out_trt_tensor = activation_layer->getOutput(0);
out_trt_tensor->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{out_trt_tensor, Format::NCHW, false});
MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]);
return RET_OK;
}
int PoolTensorRT::ParseParams() {
int in_h = in_tensors_[0].Shape()[kNHWC_H];
int in_w = in_tensors_[0].Shape()[kNHWC_W];
int out_h = out_tensors_[0].Shape()[kNHWC_H];
int out_w = out_tensors_[0].Shape()[kNHWC_W];
int kernel_h;
int kernel_w;
switch (type_) {
case (schema::PrimitiveType_AvgPoolFusion): {
const schema::AvgPoolFusion *pool_primitive = this->GetPrimitive()->value_as_AvgPoolFusion();
if (pool_primitive == nullptr) {
MS_LOG(ERROR) << "convert PoolFusion failed: " << op_name_;
return RET_ERROR;
}
pooling_type_ = nvinfer1::PoolingType::kAVERAGE;
auto stride = pool_primitive->strides();
if (stride == nullptr) {
MS_LOG(ERROR) << "get stride failed: " << op_name_;
return RET_ERROR;
}
stride_ = std::vector<int64_t>(stride->begin(), stride->end());
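      // back-compute the kernel size from the shapes: kernel = in - (out - 1) * stride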
kernel_h = in_h - (out_h - 1) * stride_[0];
kernel_w = in_w - (out_w - 1) * stride_[1];
auto kernel_size = pool_primitive->kernel_size();
if (kernel_size == nullptr) {
kernel_size_.push_back(kernel_h);
kernel_size_.push_back(kernel_w);
        MS_LOG(WARNING) << op_name_ << " has no kernel size, deriving it from tensor shapes: kernel_h is " << kernel_h
                        << ", kernel_w is " << kernel_w;
} else {
kernel_size_ = std::vector<int64_t>(kernel_size->begin(), kernel_size->end());
}
auto padding = pool_primitive->pad();
if (padding != nullptr && padding->size() != DIMENSION_4D) {
        MS_LOG(ERROR) << op_name_ << " has invalid pad dims: " << padding->size();
return RET_ERROR;
} else if (padding == nullptr || padding->size() == 0) {
padding_ = std::vector<int64_t>(DIMENSION_4D, 0);
} else {
padding_ = std::vector<int64_t>(padding->begin(), padding->end());
}
pad_mode_ = pool_primitive->pad_mode();
activation_type_ = pool_primitive->activation_type();
break;
}
case (schema::PrimitiveType_MaxPoolFusion): {
const schema::MaxPoolFusion *pool_primitive = this->GetPrimitive()->value_as_MaxPoolFusion();
if (pool_primitive == nullptr) {
MS_LOG(ERROR) << "convert PoolFusion failed: " << op_name_;
return RET_ERROR;
}
pooling_type_ = nvinfer1::PoolingType::kMAX;
auto kernel_size = pool_primitive->kernel_size();
if (kernel_size == nullptr) {
MS_LOG(ERROR) << "get kernel size failed: " << op_name_;
return RET_ERROR;
}
kernel_size_ = std::vector<int64_t>(kernel_size->begin(), kernel_size->end());
auto stride = pool_primitive->strides();
if (stride == nullptr) {
MS_LOG(ERROR) << "get stride failed: " << op_name_;
return RET_ERROR;
}
stride_ = std::vector<int64_t>(stride->begin(), stride->end());
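      // kernel_h/kernel_w are back-computed here only to clamp an oversized kernel_size_ below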
kernel_h = in_h - (out_h - 1) * stride_[0];
kernel_w = in_w - (out_w - 1) * stride_[1];
auto padding = pool_primitive->pad();
if (padding == nullptr) {
MS_LOG(INFO) << "get padding is null, set to default 0: " << op_name_;
padding_ = {0, 0, 0, 0};
} else {
padding_ = std::vector<int64_t>(padding->begin(), padding->end());
}
pad_mode_ = pool_primitive->pad_mode();
activation_type_ = pool_primitive->activation_type();
break;
}
default: {
MS_LOG(ERROR) << "unsupported primitive type of " << type_ << " for node: " << op_name_;
return RET_ERROR;
}
}
  // some models have a kernel size larger than the input H/W; clamp it
if (kernel_size_[0] > in_h || kernel_size_[1] > in_w) {
MS_LOG(WARNING) << op_name_ << " kernel size is larger than input size";
kernel_size_[0] = kernel_size_[0] > kernel_h ? kernel_h : kernel_size_[0];
kernel_size_[1] = kernel_size_[1] > kernel_w ? kernel_w : kernel_size_[1];
}
return RET_OK;
}
void PoolTensorRT::AddParams(nvinfer1::IPoolingLayer *pooling_layer) {
nvinfer1::Dims stride_dims = ConvertCudaDims(stride_);
if (stride_dims.nbDims == -1) {
MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_;
return;
}
pooling_layer->setStrideNd(stride_dims);
if (pad_mode_ == schema::PadMode::PadMode_SAME) {
pooling_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
} else {
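    // padding_ holds {up, down, left, right}; the up/left values are applied symmetrically via setPaddingNd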
nvinfer1::Dims dims{};
dims.nbDims = DIMENSION_2D;
dims.d[0] = padding_[0];
dims.d[1] = padding_[DIMENSION_2D];
pooling_layer->setPaddingNd(dims);
}
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AvgPoolFusion, PoolTensorRT)
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MaxPoolFusion, PoolTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,55 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class PoolTensorRT : public TensorRTOp {
public:
PoolTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~PoolTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
int ParseParams();
void AddParams(nvinfer1::IPoolingLayer *pooling_layer);
std::vector<int64_t> kernel_size_;
std::vector<int64_t> stride_;
std::vector<int64_t> padding_;
nvinfer1::PoolingType pooling_type_;
schema::PadMode pad_mode_;
schema::ActivationType activation_type_;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_

View File

@ -0,0 +1,79 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <numeric>
#include "src/runtime/delegate/tensorrt/op/prelu_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
int PReluTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size() << " : " << op_name_;
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size() << " : " << op_name_;
return RET_ERROR;
}
return RET_OK;
}
int PReluTensorRT::AddInnerOp(TensorRTContext *ctx) {
ITensorHelper prelu_input;
int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &prelu_input);
if (ret != RET_OK || prelu_input.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_;
return ret;
}
int input_nbdims = prelu_input.trt_tensor_->getDimensions().nbDims;
int slope_nbdims = in_tensors_[1].Shape().size();
auto slope = tensorrt_in_tensors_[1].trt_tensor_;
if (input_nbdims != slope_nbdims) {
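    // broadcast the slope to the input rank so that addParametricReLU sees matching dimensions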
slope = ConvertTensorWithExpandDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_ + "_slope");
tensorrt_in_tensors_[1].trt_tensor_ = slope;
}
if (slope == nullptr) {
MS_LOG(ERROR) << "add const input tensor failed for " << op_name_;
return RET_ERROR;
}
ITensorHelper slope_helper;
ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1], &slope_helper);
if (ret != RET_OK || slope_helper.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim slope tensor failed for " << op_name_;
return ret;
}
auto *prelu_layer = ctx->network()->addParametricReLU(*prelu_input.trt_tensor_, *slope_helper.trt_tensor_);
if (prelu_layer == nullptr) {
MS_LOG(ERROR) << "addParameticReLU failed for TensorRT : " << op_name_;
return RET_ERROR;
}
nvinfer1::ITensor *out_tensor = prelu_layer->getOutput(0);
out_tensor->setName((op_name_ + "_0").c_str());
this->AddInnerOutTensors(ITensorHelper{out_tensor, prelu_input.format_, prelu_input.same_format_});
this->layer_ = prelu_layer;
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PReLUFusion, PReluTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,39 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_
#include <string>
#include <vector>
#include <algorithm>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class PReluTensorRT : public TensorRTOp {
public:
PReluTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~PReluTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
};
} // namespace mindspore::lite
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_

View File

@ -0,0 +1,139 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <valarray>
#include "src/runtime/delegate/tensorrt/op/reduce_tensorrt.h"
namespace mindspore::lite {
int ReduceTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
  if (in_tensors.size() != INPUT_SIZE2) {
    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
    return RET_ERROR;
  }
  if (out_tensors.size() != 1) {
    MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
    return RET_ERROR;
  }
return RET_OK;
}
int ReduceTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
auto reduce_op = op_primitive_->value_as_ReduceFusion();
if (reduce_op == nullptr) {
MS_LOG(ERROR) << "convert failed";
return RET_ERROR;
}
bool keep_dims = reduce_op->keep_dims();
out_format_ = tensorrt_in_tensors_[0].format_;
nvinfer1::ITensor *reduce_input = tensorrt_in_tensors_[0].trt_tensor_;
MS_LOG(DEBUG) << "origin input " << GetTensorFormat(tensorrt_in_tensors_[0]);
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
!SameDims(tensorrt_in_tensors_[0].trt_tensor_->getDimensions(), in_tensors_[0].Shape())) {
if (tensorrt_in_tensors_[0].format_ == Format::NCHW) {
// NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer == nullptr) {
MS_LOG(ERROR) << "create transpose layer failed for " << op_name_;
return RET_ERROR;
}
transpose_layer->setName((op_name_ + "_transpose_in").c_str());
reduce_input = transpose_layer->getOutput(0);
out_format_ = Format::NHWC;
this->transpose_layer_ = transpose_layer;
} else if (tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer == nullptr) {
MS_LOG(ERROR) << "create transpose layer failed for " << op_name_;
return RET_ERROR;
}
transpose_layer->setName((op_name_ + "_transpose_in").c_str());
reduce_input = transpose_layer->getOutput(0);
out_format_ = Format::NCHW;
this->transpose_layer_ = transpose_layer;
} else {
MS_LOG(WARNING) << "input tensor format needs check: " << op_name_;
}
}
MS_LOG(DEBUG) << "after transpose input " << GetTensorFormat(reduce_input, out_format_, true);
if (reduce_op->mode() == schema::ReduceMode::ReduceMode_ReduceL2) {
// x^2
auto *pow2_layer =
ctx->network()->addElementWise(*reduce_input, *reduce_input, nvinfer1::ElementWiseOperation::kPROD);
CHECK_NULL_RETURN(pow2_layer);
pow2_layer->setName((op_name_ + "_pow2").c_str());
reduce_input = pow2_layer->getOutput(0);
CHECK_NULL_RETURN(reduce_input);
}
uint32_t reduceAxis = GetAxis();
auto reduce_operation_opt = TryConvertTRTReduceMode(reduce_op->mode());
if (!reduce_operation_opt) {
MS_LOG(WARNING) << "invalid reduce for TensorRT, need check: " << static_cast<int>(reduce_op->mode());
return RET_ERROR;
}
nvinfer1::IReduceLayer *layer =
ctx->network()->addReduce(*reduce_input, reduce_operation_opt.value(), reduceAxis, keep_dims);
CHECK_NULL_RETURN(layer);
layer->setName(op_name_.c_str());
this->layer_ = layer;
nvinfer1::ITensor *out_tensor = layer->getOutput(0);
CHECK_NULL_RETURN(out_tensor);
if (reduce_op->mode() == schema::ReduceMode::ReduceMode_ReduceL2) {
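    // sqrt(sum(x^2)) completes the L2 reduction started with the element-wise square above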
auto sqrt_layer = ctx->network()->addUnary(*out_tensor, nvinfer1::UnaryOperation::kSQRT);
CHECK_NULL_RETURN(sqrt_layer);
sqrt_layer->setName((op_name_ + "_sqrt").c_str());
out_tensor = sqrt_layer->getOutput(0);
}
out_tensor->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format_, true});
MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]);
return RET_OK;
}
uint32_t ReduceTensorRT::GetAxis() {
// axis
uint32_t reduceAxis = 0;
mindspore::MSTensor axis_tensor = this->in_tensors_[1];
if (axis_tensor.Data() == nullptr) {
MS_LOG(ERROR) << "invalid axis_tensor";
return reduceAxis;
}
if (axis_tensor.DataType() != DataType::kNumberTypeInt32) {
MS_LOG(WARNING) << "not int data type";
}
int *axis_data = reinterpret_cast<int *>(axis_tensor.MutableData());
CHECK_NULL_RETURN(axis_data);
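  // TensorRT encodes reduce axes as a bitmask: set one bit per axis index to be reduced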
for (int i = 0; i < axis_tensor.ElementNum(); i++) {
int format_axis_data = (*axis_data == -1) ? in_tensors_[0].Shape().size() - 1 : *axis_data;
MS_LOG(DEBUG) << op_name_ << " reduceAxis at index : " << *axis_data;
reduceAxis |= 1u << format_axis_data;
axis_data++;
}
return reduceAxis;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ReduceFusion, ReduceTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,44 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_
#include <string>
#include <vector>
#include <map>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class ReduceTensorRT : public TensorRTOp {
public:
ReduceTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~ReduceTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
uint32_t GetAxis();
Format out_format_;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_

View File

@ -0,0 +1,126 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h"
#include <numeric>
#include <thread>
#include "NvInferRuntimeCommon.h"
namespace mindspore::lite {
REGISTER_TENSORRT_PLUGIN(ReduceScatterPluginCreater);
template class TensorRTPluginCreater<ReduceScatterPlugin>;
template <class T>
nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
template <class T>
std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
int ReduceScatterTensorRT::IsSupport(const schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
#ifndef LITE_CUDA_DISTRIBUTION
MS_LOG(ERROR)
<< "Unsupported package for gpu distribution feature, please recompile with MS_ENABLE_CUDA_DISTRIBUTION set to on.";
return RET_ERROR;
#else
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != 1) {
MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size();
return RET_ERROR;
}
dynamic_shape_params_.support_hw_dynamic_ = false;
return RET_OK;
#endif
}
int ReduceScatterTensorRT::AddInnerOp(TensorRTContext *ctx) {
nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_};
auto reduce_op = op_primitive_->value_as_ReduceScatter();
if (reduce_op == nullptr) {
MS_LOG(ERROR) << "convert failed for " << op_name_;
return RET_ERROR;
}
auto reduce_mode = reduce_op->mode();
auto rank = GetGPUGroupSize();
auto plugin = std::make_shared<ReduceScatterPlugin>(op_name_, reduce_mode, rank, device_id_);
MS_LOG(INFO) << op_name_ << " group size: " << rank << ", rank id: " << GetRankID();
nvinfer1::IPluginV2Layer *reduce_scatter_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin);
if (reduce_scatter_layer == nullptr) {
MS_LOG(ERROR) << "create ReduceScatter layer failed for: " << op_name_;
return RET_ERROR;
}
nvinfer1::ITensor *reduce_scatter_out = reduce_scatter_layer->getOutput(0);
reduce_scatter_layer->setName(op_name_.c_str());
reduce_scatter_out->setName((op_name_ + "_output").c_str());
this->layer_ = reduce_scatter_layer;
this->AddInnerOutTensors(
ITensorHelper{reduce_scatter_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
return RET_OK;
}
// ReduceScatterPlugin
int ReduceScatterPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) noexcept {
MS_LOG(INFO) << "ReduceScatter run at rank id: " << GetRankID() << " stream: " << stream;
nvinfer1::Dims output_dims = outputDesc[0].dims;
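  // the output tensor holds only this rank's shard, so its element count is the per-rank receive count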
  int receive_element_cnt =
    std::accumulate(output_dims.d, output_dims.d + output_dims.nbDims, 1, std::multiplies<int64_t>());
const void *input = inputs[0];
void *output = outputs[0];
auto data_type = inputDesc->type;
  auto ret = DistributionCollective::instance().ReduceScatterWrapper(input, output, receive_element_cnt, data_type,
                                                                     red_mode_, stream, NCCL_WORLD_GROUP);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ReduceScatter nccl run failed for " << layer_name_;
return ret;
}
return RET_OK;
}
nvinfer1::IPluginV2DynamicExt *ReduceScatterPlugin::clone() const noexcept {
auto *plugin = new ReduceScatterPlugin(*this);
plugin->setPluginNamespace(name_space_.c_str());
return plugin;
}
nvinfer1::DimsExprs ReduceScatterPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) noexcept {
nvinfer1::DimsExprs out_dims{};
out_dims.nbDims = inputs->nbDims;
auto rank_dim = exprBuilder.constant(rank_);
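  // ReduceScatter splits the first dimension evenly across the group, so divide dim 0 by the group size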
out_dims.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kCEIL_DIV, *inputs->d[0], *rank_dim);
for (int i = 1; i < inputs->nbDims; i++) {
out_dims.d[i] = inputs->d[i];
}
return out_dims;
}
size_t ReduceScatterPlugin::getSerializationSize() const noexcept { return sizeof(schema::ReduceMode); }
void ReduceScatterPlugin::serialize(void *buffer) const noexcept {
SerializeValue(&buffer, &red_mode_, sizeof(schema::ReduceMode));
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ReduceScatter, ReduceScatterTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,83 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_
#include <string>
#include <vector>
#include <memory>
#include <functional>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h"
namespace mindspore::lite {
constexpr const char *REDUCESCATTER_PLUGIN_NAME{"ReduceScatterPlugin"};
class ReduceScatterTensorRT : public TensorRTOp {
public:
ReduceScatterTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~ReduceScatterTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
};
class ReduceScatterPlugin : public TensorRTPlugin {
public:
ReduceScatterPlugin(const std::string name, schema::ReduceMode red_mode, int rank, uint32_t device_id)
: TensorRTPlugin(name, std::string(REDUCESCATTER_PLUGIN_NAME), device_id), red_mode_(red_mode), rank_(rank) {}
ReduceScatterPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
: TensorRTPlugin(std::string(name), std::string(REDUCESCATTER_PLUGIN_NAME)) {
const nvinfer1::PluginField *fields = fc->fields;
red_mode_ = static_cast<const schema::ReduceMode *>(fields[0].data)[0];
rank_ = static_cast<const int *>(fields[1].data)[0];
}
ReduceScatterPlugin(const char *name, const void *serialData, size_t serialLength)
: TensorRTPlugin(std::string(name), std::string(REDUCESCATTER_PLUGIN_NAME)) {
DeserializeValue(&serialData, &serialLength, &red_mode_, sizeof(schema::ReduceMode));
DeserializeValue(&serialData, &serialLength, &rank_, sizeof(int));
}
ReduceScatterPlugin() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const noexcept override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) noexcept override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void *buffer) const noexcept override;
private:
int rank_{0};
schema::ReduceMode red_mode_;
};
class ReduceScatterPluginCreater : public TensorRTPluginCreater<ReduceScatterPlugin> {
public:
ReduceScatterPluginCreater() : TensorRTPluginCreater(std::string(REDUCESCATTER_PLUGIN_NAME)) {}
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_

View File

@ -0,0 +1,230 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include "src/runtime/delegate/tensorrt/op/resize_tensorrt.h"
#include "nnacl/nnacl_common.h"
namespace mindspore::lite {
int ResizeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
  if (in_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) {
    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
    return RET_ERROR;
  }
  if (out_tensors.size() != 1) {
    MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
    return RET_ERROR;
  }
resize_op_ = op_primitive_->value_as_Resize();
if (resize_op_ == nullptr) {
MS_LOG(ERROR) << "convert failed " << op_name_;
return RET_ERROR;
}
if (resize_op_->method() == schema::ResizeMethod_LINEAR) {
MS_LOG(WARNING) << "TensorRT linear resize has precision issue, using cpu instead for " << op_name_;
return RET_ERROR;
}
  // a resize with constant new height/width cannot support dynamic HW input
  dynamic_shape_params_.support_hw_dynamic_ = !(resize_op_->new_height() > 0 && resize_op_->new_width() > 0);
return RET_OK;
}
int ResizeTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
nvinfer1::ITensor *resize_in_tensor = tensorrt_in_tensors_[0].trt_tensor_;
MS_LOG(DEBUG) << "origin input " << GetTensorFormat(tensorrt_in_tensors_[0]);
if (resize_in_tensor->getDimensions().nbDims == DIMENSION_4D && tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer == nullptr) {
MS_LOG(ERROR) << "create transpose layer failed for " << op_name_;
return RET_ERROR;
}
transpose_layer->setName((op_name_ + "_transpose_in").c_str());
resize_in_tensor = transpose_layer->getOutput(0);
this->transpose_layer_ = transpose_layer;
}
MS_LOG(DEBUG) << "after transpose input " << GetTensorFormat(resize_in_tensor, Format::NCHW, false);
nvinfer1::IResizeLayer *resize_layer = ctx->network()->addResize(*resize_in_tensor);
if (resize_layer == nullptr) {
MS_LOG(ERROR) << "create resize layer failed for " << op_name_;
return RET_ERROR;
}
int ret = SetOutputDims(resize_in_tensor, resize_layer);
if (ret != RET_OK) {
MS_LOG(ERROR) << "SetOutputDims failed for " << op_name_;
return RET_ERROR;
}
ret = SetParams(resize_layer);
if (ret != RET_OK) {
MS_LOG(ERROR) << "SetParams failed for " << op_name_;
return RET_ERROR;
}
resize_layer->getOutput(0)->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{resize_layer->getOutput(0), Format::NCHW, false});
MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]);
this->layer_ = resize_layer;
return RET_OK;
}
int ResizeTensorRT::SetOutputDims(nvinfer1::ITensor *resize_in_tensor, nvinfer1::IResizeLayer *resize_layer) {
nvinfer1::Dims in_dims = resize_in_tensor->getDimensions();
if (in_tensors_.size() == 1 && !dynamic_shape_params_.support_dynamic_ && in_dims.nbDims == DIMENSION_4D) {
nvinfer1::Dims4 new_dims(in_dims.d[0], in_dims.d[1], resize_op_->new_height(), resize_op_->new_width()); // nchw
resize_layer->setOutputDimensions(new_dims); // static shape
} else if (in_tensors_.size() == 1 && !dynamic_shape_params_.support_hw_dynamic_ &&
dynamic_shape_params_.support_dynamic_ && in_dims.nbDims == DIMENSION_4D) {
// hw is static, but has dynamic batch size
float scales[DIMENSION_4D]{1, 1, 1, 1};
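    // scales follow the NCHW layout of the transposed input: keep N and C at 1, scale only H and W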
scales[kNCHW_H] = static_cast<float>(resize_op_->new_height()) / static_cast<float>(in_dims.d[kNCHW_H]);
scales[kNCHW_W] = static_cast<float>(resize_op_->new_width()) / static_cast<float>(in_dims.d[kNCHW_W]);
resize_layer->setScales(scales, DIMENSION_4D);
} else {
auto shape_value_tensor = in_tensors_[1];
if (shape_value_tensor.Data() == nullptr && tensorrt_in_tensors_.size() >= INPUT_SIZE2) {
// dynamic output shape
resize_layer->setInput(1, *tensorrt_in_tensors_[1].trt_tensor_);
} else {
std::vector<float> out_shape;
ParseValueFromShapeTensor(shape_value_tensor, &out_shape);
if (SameDims(out_shape, out_tensors_[0].Shape())) {
// static dims
if (out_shape.size() == DIMENSION_4D) {
// convert nhwc to nchw
auto channel = out_shape[out_shape.size() - 1];
out_shape.insert(out_shape.begin() + 1, channel);
out_shape.erase(out_shape.begin() + out_shape.size() - 1);
}
resize_layer->setOutputDimensions(ConvertCudaDims(out_shape));
} else if (IsScaleOutputDim(in_tensors_[0].Shape(), out_tensors_[0].Shape(), out_shape)) {
// scale dims
float scales[DIMENSION_4D]{1, 1, 1, 1};
scales[kNCHW_H] =
static_cast<float>(out_tensors_[0].Shape()[kNHWC_H]) / static_cast<float>(in_tensors_[0].Shape()[kNHWC_H]);
scales[kNCHW_W] =
static_cast<float>(out_tensors_[0].Shape()[kNHWC_W]) / static_cast<float>(in_tensors_[0].Shape()[kNHWC_W]);
resize_layer->setScales(scales, DIMENSION_4D);
} else if (out_tensors_[0].Shape().size() == DIMENSION_4D) {
MS_LOG(DEBUG) << op_name_ << " output shape tensor value is const, but set to scales for dynamic input shape.";
        float scales[DIMENSION_4D];  // the branch condition guarantees a 4D shape, so avoid a variable-length array
for (size_t i = 0; i < out_tensors_[0].Shape().size(); i++) {
scales[i] = static_cast<float>(out_tensors_[0].Shape()[i]) / static_cast<float>(in_tensors_[0].Shape()[i]);
}
// change to nchw
scales[kNCHW_W] = scales[kNHWC_W];
scales[kNCHW_H] = scales[kNHWC_H];
scales[kNCHW_C] = 1;
        MS_LOG(DEBUG) << op_name_ << " scale at H " << kNCHW_H << ": " << scales[kNCHW_H] << ", W " << kNCHW_W << ": "
<< scales[kNCHW_W];
resize_layer->setScales(scales, out_tensors_[0].Shape().size());
} else {
MS_LOG(ERROR) << "resize dims needs check for " << op_name_;
return RET_ERROR;
}
}
}
return RET_OK;
}
void ResizeTensorRT::ParseValueFromShapeTensor(const mindspore::MSTensor &shape_value_tensor,
std::vector<float> *out_shape) {
switch (shape_value_tensor.DataType()) {
case DataType::kNumberTypeFloat32: {
const float *shape_data_fp32 = static_cast<const float *>(shape_value_tensor.Data().get());
for (int i = 0; i < shape_value_tensor.ElementNum(); i++) {
out_shape->push_back(*(shape_data_fp32 + i));
}
break;
}
case DataType::kNumberTypeFloat16: {
const uint16_t *shape_data_fp16 = static_cast<const uint16_t *>(shape_value_tensor.Data().get());
for (int i = 0; i < shape_value_tensor.ElementNum(); i++) {
out_shape->push_back(ShortToFloat32(*(shape_data_fp16 + i)));
}
break;
}
case DataType::kNumberTypeInt32: {
      const int *shape_data_int32 = static_cast<const int *>(shape_value_tensor.Data().get());
      for (int i = 0; i < shape_value_tensor.ElementNum(); i++) {
        out_shape->push_back(*(shape_data_int32 + i));
}
break;
}
default:
MS_LOG(WARNING) << op_name_
<< " more datatype need to check: " << static_cast<int>(shape_value_tensor.DataType());
break;
}
if (out_shape->size() == DIMENSION_2D &&
tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D) {
// out_shape: origin_n, out_shape[0], out_shape[1], origin_c
out_shape->insert(out_shape->begin(),
tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0]); // batch size is dynamic
out_shape->push_back(in_tensors_[0].Shape()[kNHWC_C]); // channel is const
}
}
bool ResizeTensorRT::IsScaleOutputDim(const std::vector<int64_t> &in_shape, const std::vector<int64_t> &out_shape,
const std::vector<float> &shape_tensor_val) {
if (out_shape.size() != DIMENSION_4D) {
MS_LOG(WARNING) << "dims count needs check for " << op_name_;
return false;
}
if (in_shape.size() != out_shape.size() || shape_tensor_val.size() != in_shape.size()) {
MS_LOG(WARNING) << "tensor shape is not same for " << op_name_;
return false;
}
for (size_t i = 0; i < in_shape.size(); i++) {
if (std::abs(in_shape[i] * shape_tensor_val[i] - out_shape[i]) > 1e-6) {
return false;
}
}
return true;
}
int ResizeTensorRT::SetParams(nvinfer1::IResizeLayer *resize_layer) {
auto method = resize_op_->method();
std::map<schema::ResizeMethod, nvinfer1::ResizeMode> method_map = {
{schema::ResizeMethod_LINEAR, nvinfer1::ResizeMode::kLINEAR},
{schema::ResizeMethod_NEAREST, nvinfer1::ResizeMode::kNEAREST}};
if (method_map.find(method) == method_map.end()) {
MS_LOG(ERROR) << op_name_ << " unsupported resize mode " << EnumNameResizeMethod(method);
return RET_ERROR;
}
resize_layer->setResizeMode(method_map.at(method));
  // not supported in TensorRT 6; TensorRT 8 supports setCoordinateTransformation()
auto coordinate_transform_mode = resize_op_->coordinate_transform_mode();
if (coordinate_transform_mode != schema::CoordinateTransformMode_ASYMMETRIC) {
MS_LOG(WARNING) << op_name_ << " has coordinate_transform_mode may not supported: "
<< EnumNameCoordinateTransformMode(coordinate_transform_mode);
}
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Resize, ResizeTensorRT)
} // namespace mindspore::lite

View File

@ -0,0 +1,52 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_
#include <string>
#include <vector>
#include <map>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class ResizeTensorRT : public TensorRTOp {
public:
ResizeTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~ResizeTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
int SetOutputDims(nvinfer1::ITensor *resize_in_tensor, nvinfer1::IResizeLayer *resize_layer);
void ParseValueFromShapeTensor(const mindspore::MSTensor &shape_value_tensor, std::vector<float> *out_shape);
bool IsScaleOutputDim(const std::vector<int64_t> &in_shape, const std::vector<int64_t> &out_shape,
const std::vector<float> &shape_tensor_val);
int SetParams(nvinfer1::IResizeLayer *resize_layer);
const schema::Resize *resize_op_{nullptr};
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_

View File

@ -0,0 +1,227 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <numeric>
#include <functional>
#include "src/runtime/delegate/tensorrt/op/scale_tensorrt.h"
#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
constexpr int SCALE_INDEX = 1;
constexpr int SHIFT_INDEX = 2;
constexpr int POWER_INDEX = 3;
int ScaleTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3 && in_tensors.size() != INPUT_SIZE4) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is: " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is: " << out_tensors.size();
return RET_ERROR;
}
return RET_OK;
}
int ScaleTensorRT::AddInnerOp(TensorRTContext *ctx) {
CHECK_NULL_RETURN(ctx);
auto scale_op = op_primitive_->value_as_ScaleFusion();
CHECK_NULL_RETURN(scale_op);
schema::ActivationType activation_type = scale_op->activation_type();
// mode of scale
axis_ = scale_op->axis();
axis_ = axis_ < 0 ? static_cast<int64_t>(in_tensors_[0].Shape().size() + axis_) : axis_;
out_format_ = tensorrt_in_tensors_[0].format_;
out_same_format_ = tensorrt_in_tensors_[0].same_format_;
mode_ = GetScaleMode(axis_);
MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(tensorrt_in_tensors_[0]);
nvinfer1::ITensor *scale_in_tensor = PreProcessInputTensor(ctx);
if (scale_in_tensor == nullptr) {
MS_LOG(ERROR) << "PreProcessInputTensor failed: " << op_name_;
return RET_ERROR;
}
MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(scale_in_tensor, out_format_, out_same_format_);
nvinfer1::ITensor *op_out_tensor{nullptr};
if (scale_in_tensor->getDimensions().nbDims == DIMENSION_4D) {
op_out_tensor = RunAs4DimsScale(ctx, scale_in_tensor);
} else {
op_out_tensor = RunAsMutiDimsScale(ctx, scale_in_tensor);
}
CHECK_NULL_RETURN(op_out_tensor);
// add activation
if (activation_type != schema::ActivationType::ActivationType_NO_ACTIVATION) {
auto activation_layer = ActivationTensorRT::AddActivation(ctx, activation_type, 0, 0, 0, op_out_tensor, device_id_);
CHECK_NULL_RETURN(activation_layer);
activation_layer->setName((op_name_ + "_activation").c_str());
op_out_tensor = activation_layer->getOutput(0);
}
op_out_tensor->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{op_out_tensor, out_format_, out_same_format_});
MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]);
return RET_OK;
}
nvinfer1::ITensor *ScaleTensorRT::PreProcessInputTensor(TensorRTContext *ctx) {
nvinfer1::ITensor *scale_in_tensor = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
mode_ == nvinfer1::ScaleMode::kCHANNEL) {
// per channel input format should be nchw, otherwise should be same with scale nhwc
// transpose: NHWC->NCHW
if ((tensorrt_in_tensors_[0].format_ == Format::NHWC && axis_ == kNHWC_C) ||
(tensorrt_in_tensors_[0].same_format_ == true && axis_ == kNHWC_C)) {
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return nullptr;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
scale_in_tensor = transpose_layer_in->getOutput(0);
out_format_ = Format::NCHW;
out_same_format_ = !out_same_format_;
} else if (out_format_ != Format::NCHW && axis_ != kNCHW_C) {
MS_LOG(WARNING) << op_name_ << " out format (NHWC:1, NCHW:0) infer as " << out_format_ << ", and axis is "
<< axis_;
}
} else if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NCHW && mode_ == nvinfer1::ScaleMode::kELEMENTWISE) {
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return nullptr;
}
transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str());
scale_in_tensor = transpose_layer_in->getOutput(0);
out_format_ = Format::NHWC;
out_same_format_ = true;
}
return scale_in_tensor;
}
nvinfer1::ScaleMode ScaleTensorRT::GetScaleMode(int64_t axis) {
nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kUNIFORM;
auto input_data_shape = in_tensors_[0].Shape();
auto input_weight_shape = in_tensors_[1].Shape();
int total = std::accumulate(input_data_shape.begin(), input_data_shape.end(), 1, std::multiplies<int>());
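  // UNIFORM: scalar weight; CHANNEL: weight length equals the size of the scale axis;
  // ELEMENTWISE: weight length equals the total element count of the input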
if (input_weight_shape.size() == 0 || (input_weight_shape.size() == 1 && input_weight_shape[0] == 1)) {
mode = nvinfer1::ScaleMode::kUNIFORM;
} else if ((axis < static_cast<int64_t>(input_data_shape.size()) && input_weight_shape.size() == 1 &&
input_data_shape[axis] == input_weight_shape[0]) ||
(input_data_shape.size() == DIMENSION_4D && axis == DIMENSION_3D)) {
mode = nvinfer1::ScaleMode::kCHANNEL;
} else if (input_weight_shape.size() == 1 && input_weight_shape[0] == total) {
mode = nvinfer1::ScaleMode::kELEMENTWISE;
} else {
MS_LOG(ERROR) << "ScaleMode create failed: " << op_name_;
return mode;
}
MS_LOG(DEBUG) << op_name_ << " ScaleMode(UNIFORM 0, CHANNEL 1, ELEMENTWISE 2): " << static_cast<int>(mode);
return mode;
}
nvinfer1::ITensor *ScaleTensorRT::RunAs4DimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor) {
bool nd = false;
// (input * scale + shift) ^ power
nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, nullptr, 0};
if (in_tensors_.size() > SCALE_INDEX) {
scale.values = in_tensors_[SCALE_INDEX].MutableData();
MS_ASSERT(scale.values);
scale.count = in_tensors_[SCALE_INDEX].ElementNum();
scale.type = ConvertDataType(in_tensors_[SCALE_INDEX].DataType());
shift.type = scale.type;
power.type = scale.type;
nd = in_tensors_[1].Shape().size() == 1 ? false : true;
}
if (in_tensors_.size() > SHIFT_INDEX) {
shift.values = in_tensors_[SHIFT_INDEX].MutableData();
MS_ASSERT(shift.values);
shift.count = in_tensors_[SHIFT_INDEX].ElementNum();
}
if (in_tensors_.size() > POWER_INDEX) {
power.values = in_tensors_[POWER_INDEX].MutableData();
MS_ASSERT(power.values);
power.count = in_tensors_[POWER_INDEX].ElementNum();
}
nvinfer1::IScaleLayer *cal_layer = nullptr;
if (nd) {
MS_LOG(WARNING) << "multi dims ScaleMode enter";
cal_layer = ctx->network()->addScaleNd(*scale_in_tensor, mode_, shift, scale, power, axis_);
} else {
cal_layer = ctx->network()->addScale(*scale_in_tensor, mode_, shift, scale, power);
}
if (cal_layer == nullptr) {
MS_LOG(ERROR) << "addScaleNd failed for: " << op_name_;
return nullptr;
}
cal_layer->setName(op_name_.c_str());
this->layer_ = cal_layer;
return cal_layer->getOutput(0);
}
nvinfer1::ITensor *ScaleTensorRT::RunAsMutiDimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor) {
auto scale_tensor = ConvertConstantTensorWithDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_);
if (scale_tensor == nullptr) {
MS_LOG(ERROR) << "ConvertConstantTensorWithDims failed for " << op_name_;
return nullptr;
}
auto mul_layer =
ctx->network()->addElementWise(*scale_in_tensor, *scale_tensor, nvinfer1::ElementWiseOperation::kPROD);
if (mul_layer == nullptr) {
MS_LOG(ERROR) << "add mul failed for " << op_name_;
return nullptr;
}
mul_layer->setName((op_name_ + "_scale").c_str());
layer_ = mul_layer;
nvinfer1::ITensor *out_tensor = mul_layer->getOutput(0);
// add shift
if (in_tensors_.size() >= INPUT_SIZE3) {
auto shift_tensor = ConvertConstantTensorWithDims(ctx, in_tensors_[SHIFT_INDEX], in_tensors_[0].Shape(), op_name_);
if (shift_tensor == nullptr) {
MS_LOG(ERROR) << "ConvertConstantTensorWithDims failed for " << op_name_;
return nullptr;
}
auto shift_layer = ctx->network()->addElementWise(*out_tensor, *shift_tensor, nvinfer1::ElementWiseOperation::kSUM);
if (shift_layer == nullptr) {
MS_LOG(ERROR) << "add bias failed for " << op_name_;
return nullptr;
}
shift_layer->setName((op_name_ + "_shift").c_str());
out_tensor = shift_layer->getOutput(0);
}
if (in_tensors_.size() == INPUT_SIZE4) {
    MS_LOG(WARNING) << op_name_ << " has a power input, which is not supported in multi-dims scale";
return nullptr;
}
return out_tensor;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ScaleFusion, ScaleTensorRT)
} // namespace mindspore::lite
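The mode selection in GetScaleMode above maps the weight tensor's shape onto TensorRT's three scale modes. As a reference, here is a minimal standalone sketch of that decision, written with a plain enum so it compiles without TensorRT; ShapeToScaleMode and the SketchScaleMode names are hypothetical and exist only for illustration, they are not part of the delegate.
// Sketch only: reproduces the shape-to-mode decision of GetScaleMode with plain types.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

enum class SketchScaleMode { kUniform, kChannel, kElementwise, kInvalid };

SketchScaleMode ShapeToScaleMode(const std::vector<int64_t> &data_shape,
                                 const std::vector<int64_t> &weight_shape, int64_t axis) {
  int64_t total = std::accumulate(data_shape.begin(), data_shape.end(), static_cast<int64_t>(1),
                                  std::multiplies<int64_t>());
  if (weight_shape.empty() || (weight_shape.size() == 1 && weight_shape[0] == 1)) {
    return SketchScaleMode::kUniform;  // single scalar applied to every element
  }
  if ((axis < static_cast<int64_t>(data_shape.size()) && weight_shape.size() == 1 &&
       data_shape[axis] == weight_shape[0]) ||
      (data_shape.size() == 4 && axis == 3)) {
    return SketchScaleMode::kChannel;  // one value per channel along `axis`
  }
  if (weight_shape.size() == 1 && weight_shape[0] == total) {
    return SketchScaleMode::kElementwise;  // one value per element
  }
  return SketchScaleMode::kInvalid;
}

int main() {
  // NHWC data of shape {1, 32, 32, 16} with a 16-element weight on axis 3 -> channel mode.
  auto mode = ShapeToScaleMode({1, 32, 32, 16}, {16}, 3);
  std::cout << static_cast<int>(mode) << std::endl;  // prints 1 (kChannel)
  return 0;
}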

View File

@ -0,0 +1,57 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_
#include <string>
#include <vector>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
namespace mindspore::lite {
class ScaleTensorRT : public TensorRTOp {
public:
ScaleTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~ScaleTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
nvinfer1::ScaleMode GetScaleMode(int64_t axis);
nvinfer1::ITensor *PreProcessInputTensor(TensorRTContext *ctx);
nvinfer1::ITensor *RunAs4DimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor);
nvinfer1::ITensor *RunAsMutiDimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor);
Format out_format_;
bool out_same_format_{false};
nvinfer1::ScaleMode mode_;
int64_t axis_{0};
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_
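The REGISTER_TENSORRT_CREATOR line at the bottom of each op source file registers a factory keyed on the schema primitive type, so the subgraph builder can construct the op by type at runtime. Below is a minimal sketch of that kind of static-registration pattern; every name in it (OpBase, Registry, Registrar, ScaleOpSketch, the key 42) is hypothetical and only illustrates the idea, it is not the actual macro expansion in this codebase.
// Sketch of a static-registration factory pattern similar in spirit to REGISTER_TENSORRT_CREATOR.
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct OpBase {
  virtual ~OpBase() = default;
  virtual std::string Name() const = 0;
};

using Creator = std::function<std::unique_ptr<OpBase>()>;

std::map<int, Creator> &Registry() {
  static std::map<int, Creator> registry;  // keyed by primitive type id in the real delegate
  return registry;
}

struct Registrar {
  Registrar(int type, Creator creator) { Registry()[type] = std::move(creator); }
};

struct ScaleOpSketch : OpBase {
  std::string Name() const override { return "ScaleFusion"; }
};

// Analogous in spirit to REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ScaleFusion, ScaleTensorRT).
static Registrar g_scale_registrar(42, [] { return std::unique_ptr<OpBase>(new ScaleOpSketch()); });

int main() {
  auto it = Registry().find(42);
  if (it != Registry().end()) {
    std::cout << it->second()->Name() << std::endl;  // prints "ScaleFusion"
  }
  return 0;
}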

View File

@ -0,0 +1,99 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <numeric>
#include "src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h"
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
int ScatterNdTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
#if TRT_VERSION_GE(8, 2)
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size() << " : " << op_name_;
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size() << " : " << op_name_;
return RET_ERROR;
}
return RET_OK;
#else
MS_LOG(WARNING) << "low TensorRT version don't support Scatter op, please upgrade TensorRT version to 8.2 or higher";
return RET_ERROR;
#endif
}
int ScatterNdTensorRT::AddInnerOp(TensorRTContext *ctx) {
#if TRT_VERSION_GE(8, 2)
ITensorHelper scatter_input;
int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &scatter_input);
if (ret != RET_OK || scatter_input.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_;
return ret;
}
if (tensorrt_in_tensors_.size() < INPUT_SIZE3) {
auto indices = ConvertConstantTensor(ctx, in_tensors_[1], op_name_ + "_indice");
if (indices == nullptr) {
MS_LOG(ERROR) << "add const input tensor failed for " << op_name_;
return RET_ERROR;
}
tensorrt_in_tensors_.push_back(ITensorHelper{indices});
auto updates = ConvertConstantTensor(ctx, in_tensors_[INPUT_SIZE2], op_name_ + "_update");
if (updates == nullptr) {
MS_LOG(ERROR) << "add const input tensor failed for " << op_name_;
return RET_ERROR;
}
tensorrt_in_tensors_.push_back(ITensorHelper{updates});
}
ITensorHelper indices_helper;
ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1], &indices_helper);
if (ret != RET_OK || indices_helper.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim indices tensor failed for " << op_name_;
return ret;
}
ITensorHelper updates_helper;
ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[INPUT_SIZE2], &updates_helper);
if (ret != RET_OK || updates_helper.trt_tensor_ == nullptr) {
MS_LOG(ERROR) << "PreprocessInputs2SameDim update tensor failed for " << op_name_;
return ret;
}
nvinfer1::IScatterLayer *scatter_layer = ctx->network()->addScatter(
*scatter_input.trt_tensor_, *indices_helper.trt_tensor_, *updates_helper.trt_tensor_, nvinfer1::ScatterMode::kND);
if (scatter_layer == nullptr) {
MS_LOG(ERROR) << "addScatter failed for TensorRT.";
return RET_ERROR;
}
nvinfer1::ITensor *out_tensor = scatter_layer->getOutput(0);
out_tensor->setName((op_name_ + "_0").c_str());
this->AddInnerOutTensors(ITensorHelper{out_tensor, scatter_input.format_, scatter_input.same_format_});
this->layer_ = scatter_layer;
return RET_OK;
#else
MS_LOG(WARNING) << "low TensorRT version don't support Scatter op, please upgrade TensorRT version to 8.2 or higher";
return RET_ERROR;
#endif
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ScatterNdUpdate, ScatterNdTensorRT)
} // namespace mindspore::lite
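addScatter with nvinfer1::ScatterMode::kND applies indexed updates: each row of the indices tensor selects a location in the data tensor, which is overwritten with the corresponding entry of the updates tensor. The following self-contained sketch shows that semantics for full-index rows on a 2-D tensor using plain std::vector; ScatterNdUpdate2D is a hypothetical helper written only to illustrate the data flow, not the nvinfer1 call.
// Sketch of ScatterND-update semantics for full-index rows on a 2-D tensor.
#include <array>
#include <iostream>
#include <vector>

void ScatterNdUpdate2D(std::vector<float> *data, int cols,
                       const std::vector<std::array<int, 2>> &indices,
                       const std::vector<float> &updates) {
  // indices[i] = {row, col} picks one element of data; updates[i] replaces it.
  for (size_t i = 0; i < indices.size(); ++i) {
    (*data)[indices[i][0] * cols + indices[i][1]] = updates[i];
  }
}

int main() {
  std::vector<float> data(2 * 3, 0.0f);                        // 2x3 tensor of zeros
  std::vector<std::array<int, 2>> indices = {{0, 1}, {1, 2}};  // two element positions
  std::vector<float> updates = {5.0f, 7.0f};
  ScatterNdUpdate2D(&data, 3, indices, updates);
  for (float v : data) std::cout << v << " ";  // prints: 0 5 0 0 0 7
  std::cout << std::endl;
  return 0;
}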

View File

@ -0,0 +1,39 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_
#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_
#include <string>
#include <vector>
#include <algorithm>
#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h"
namespace mindspore::lite {
class ScatterNdTensorRT : public TensorRTOp {
public:
ScatterNdTensorRT(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors, const std::string &name,
const schema::QuantType &quant_type)
: TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {}
~ScatterNdTensorRT() override = default;
int AddInnerOp(TensorRTContext *ctx) override;
int IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) override;
};
} // namespace mindspore::lite
#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_

View File

@ -0,0 +1,69 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/delegate/tensorrt/op/shape_tensorrt.h"
namespace mindspore::lite {
int ShapeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
const std::vector<mindspore::MSTensor> &out_tensors) {
if (!IsShapeKnown()) {
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
if (out_tensors.size() != 1) {
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
dynamic_shape_params_.support_dynamic_ = false;
dynamic_shape_params_.support_hw_dynamic_ = false;
return RET_OK;
}
int ShapeTensorRT::AddInnerOp(TensorRTContext *ctx) {
if (ctx == nullptr || ctx->network() == nullptr) {
MS_LOG(ERROR) << "context or network is invalid";
return RET_ERROR;
}
nvinfer1::ITensor *shape_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NCHW) {
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NCHW->NHWC failed for " << op_name_;
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str());
shape_input = transpose_layer_in->getOutput(0);
this->transpose_layer_ = transpose_layer_in;
}
nvinfer1::IShapeLayer *shape_layer = ctx->network()->addShape(*shape_input);
if (shape_layer == nullptr) {
MS_LOG(ERROR) << "add shape op failed for TensorRT.";
return RET_ERROR;
}
shape_layer->setName(op_name_.c_str());
shape_layer->getOutput(0)->setName((op_name_ + "_output").c_str());
this->AddInnerOutTensors(ITensorHelper{shape_layer->getOutput(0), Format::NHWC, true});
this->layer_ = shape_layer;
return RET_OK;
}
REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Shape, ShapeTensorRT)
} // namespace mindspore::lite
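Because the delegate may keep 4-D activations in NCHW internally, the Shape op transposes back to NHWC before adding the shape layer, so the reported dimensions match the framework-level layout. A small sketch of the effect on the reported shape, with no TensorRT dependency; PermuteNchwToNhwc is a hypothetical helper used only for illustration.
// Sketch: the shape reported for an NCHW-held tensor after the NCHW->NHWC transpose above.
#include <array>
#include <cstdint>
#include <iostream>

std::array<int64_t, 4> PermuteNchwToNhwc(const std::array<int64_t, 4> &nchw) {
  // {N, C, H, W} -> {N, H, W, C}
  return {nchw[0], nchw[2], nchw[3], nchw[1]};
}

int main() {
  std::array<int64_t, 4> nchw = {1, 16, 32, 32};
  auto nhwc = PermuteNchwToNhwc(nchw);
  for (auto d : nhwc) std::cout << d << " ";  // prints: 1 32 32 16
  std::cout << std::endl;
  return 0;
}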

Some files were not shown because too many files have changed in this diff