!22945 [MSLITE] optimize Execute time for TensorRT delegate by removing unnecessary transpose ops

Merge pull request !22945 from Liu_Xuu/trt_0903_transpose
i-robot 2021-09-08 07:58:01 +00:00 committed by Gitee
commit 277cf0d892
26 changed files with 463 additions and 250 deletions
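The change replaces the fixed NHWC->NCHW / NCHW->NHWC shuffle pair that used to wrap every NCHW-only layer with a per-tensor layout record: each op now publishes its output as an ITensorHelper (the TensorRT tensor plus its format), and the consuming op inserts a transpose only when the incoming layout differs from the one it needs; the subgraph output is converted back to NHWC once, in MarkOutputs. A minimal, self-contained sketch of that idea (the names below are simplified stand-ins, not the delegate API):

#include <cstdio>

enum class Format { NHWC, NCHW };

// analogous to the ITensorHelper{trt_tensor_, format_} added in tensorrt_op.h
struct TensorHelper {
  int tensor_id;   // stands in for nvinfer1::ITensor*
  Format format;
};

// stands in for adding an nvinfer1::IShuffleLayer that permutes the layout
TensorHelper Transpose(const TensorHelper &in, Format to) {
  std::printf("insert transpose for tensor %d\n", in.tensor_id);
  return TensorHelper{in.tensor_id + 100, to};
}

// a layer that must run on NCHW data (e.g. convolution or pooling)
TensorHelper NchwOnlyLayer(const TensorHelper &in) {
  TensorHelper input = in;
  if (input.format != Format::NCHW) {
    input = Transpose(input, Format::NCHW);  // transpose only when really needed
  }
  // ... add the real TensorRT layer here ...
  // record that the output stays NCHW; the consumer decides what to do next
  return TensorHelper{input.tensor_id + 1, Format::NCHW};
}

int main() {
  TensorHelper graph_input{0, Format::NHWC};
  TensorHelper a = NchwOnlyLayer(graph_input);  // one NHWC->NCHW transpose
  TensorHelper b = NchwOnlyLayer(a);            // no transpose: already NCHW
  if (b.format != Format::NHWC) {
    b = Transpose(b, Format::NHWC);  // only the subgraph output goes back to NHWC
  }
  return 0;
}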

View File

@ -15,7 +15,6 @@
*/
#include "src/delegate/tensorrt/op/activation_tensorrt.h"
#include "src/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
int ActivationTensorRT::IsSupport(const schema::Primitive *primitive,
@ -58,8 +57,8 @@ int ActivationTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
}
float alpha = activation_op->alpha();
nvinfer1::IActivationLayer *activation_layer =
ActivationTensorRT::AddActivation(network, activation_op->activation_type(), alpha, tensorrt_in_tensors_[0]);
nvinfer1::IActivationLayer *activation_layer = ActivationTensorRT::AddActivation(
network, activation_op->activation_type(), alpha, tensorrt_in_tensors_[0].trt_tensor_);
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "add activation op failed for TensorRT.";
return RET_ERROR;
@ -67,7 +66,7 @@ int ActivationTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
activation_layer->setName(op_name_.c_str());
activation_layer->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(activation_layer->getOutput(0));
this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), tensorrt_in_tensors_[0].format_});
return RET_OK;
}

View File

@ -24,7 +24,7 @@ int ConcateTensorRT::IsSupport(const schema::Primitive *primitive, const std::ve
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() < 1) {
if (in_tensors.size() != INPUT_SIZE2) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
@ -50,11 +50,46 @@ int ConcateTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
return RET_ERROR;
}
nvinfer1::ITensor *trt_input_tensors[tensorrt_in_tensors_.size()];
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims !=
tensorrt_in_tensors_[1].trt_tensor_->getDimensions().nbDims) {
MS_LOG(ERROR) << "dims of inputs is invalid for " << op_name_;
return RET_ERROR;
}
// make sure two inputs have same format
Format out_format = tensorrt_in_tensors_[0].format_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D) {
if (tensorrt_in_tensors_[0].format_ == tensorrt_in_tensors_[1].format_) {
for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) {
trt_input_tensors[i] = tensorrt_in_tensors_[i].trt_tensor_;
}
} else {
// when inputs format are different, change to NHWC
out_format = Format::NHWC;
int transpose_tensor_index = tensorrt_in_tensors_[0].format_ == Format::NCHW ? 0 : 1;
trt_input_tensors[1 - transpose_tensor_index] = tensorrt_in_tensors_[1 - transpose_tensor_index].trt_tensor_;
nvinfer1::IShuffleLayer *transpose_layer =
NCHW2NHWC(network, *tensorrt_in_tensors_[transpose_tensor_index].trt_tensor_);
if (transpose_layer == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
trt_input_tensors[transpose_tensor_index] = transpose_layer->getOutput(0);
}
} else {
for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) {
trt_input_tensors[i] = tensorrt_in_tensors_[i].trt_tensor_;
}
}
int axis = RET_INVALID_OP_ATTR;
axis = concate_op->axis();
nvinfer1::ITensor *trt_input_tensors[tensorrt_in_tensors_.size()];
std::copy(tensorrt_in_tensors_.begin(), tensorrt_in_tensors_.end(), trt_input_tensors);
if (out_format == Format::NCHW) {
// when inputs all NCHW, change axis
axis = ConvertAxisFromNHWC2NCHW(axis);
MS_LOG(INFO) << "concate axis change to " << axis << " when using NCHW format.";
}
nvinfer1::IConcatenationLayer *concate_layer =
network->addConcatenation(trt_input_tensors, static_cast<int>(tensorrt_in_tensors_.size()));
@ -68,8 +103,7 @@ int ConcateTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
}
concate_layer->setName(op_name_.c_str());
concate_layer->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(concate_layer->getOutput(0));
this->AddInnerOutTensors(ITensorHelper{concate_layer->getOutput(0), out_format});
return RET_OK;
}
} // namespace mindspore::lite
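For Concat the two 4-D inputs may arrive in different layouts. The hunk above keeps them untouched when the layouts already match (only remapping the concat axis if both are NCHW) and transposes the NCHW side to NHWC only when they differ. A hedged, self-contained sketch of that decision (simplified stand-ins, not the delegate API):

enum class Fmt { NHWC, NCHW };
struct In { int id; Fmt fmt; };

// stand-in for inserting an NCHW2NHWC shuffle layer
In ToNhwc(In t) { return In{t.id + 100, Fmt::NHWC}; }

Fmt PrepareConcatInputs(In *a, In *b, int *axis) {
  if (a->fmt == b->fmt) {
    // same layout on both sides: no transpose; if that layout is NCHW the
    // NHWC-relative concat axis is remapped (ConvertAxisFromNHWC2NCHW later in
    // this diff handles the general case; channel axis NHWC 3 -> NCHW 1)
    if (a->fmt == Fmt::NCHW && *axis == 3) *axis = 1;
    return a->fmt;
  }
  // mixed layouts: bring only the NCHW side to NHWC so both inputs match
  if (a->fmt == Fmt::NCHW) *a = ToNhwc(*a); else *b = ToNhwc(*b);
  return Fmt::NHWC;
}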

View File

@ -16,7 +16,6 @@
#include "src/delegate/tensorrt/op/convolution_tensorrt.h"
#include "src/delegate/tensorrt/op/activation_tensorrt.h"
#include "src/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
constexpr int BIAS_INDEX = 2;
@ -28,7 +27,7 @@ int ConvolutionTensorRT::IsSupport(const schema::Primitive *primitive,
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (in_tensors.size() != 2 && in_tensors.size() != 3) {
if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
return RET_ERROR;
}
@ -36,6 +35,10 @@ int ConvolutionTensorRT::IsSupport(const schema::Primitive *primitive,
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) {
MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format();
return RET_ERROR;
}
return RET_OK;
}
@ -49,13 +52,19 @@ int ConvolutionTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(network, *tensorrt_in_tensors_[0]);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
nvinfer1::ITensor *conv_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
conv_input = transpose_layer_in->getOutput(0);
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
// transpose weight
const mindspore::MSTensor &weight_tensor = in_tensors_[1];
@ -86,7 +95,7 @@ int ConvolutionTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
}
nvinfer1::IConvolutionLayer *conv_layer =
network->addConvolutionNd(*transpose_layer_in->getOutput(0), nbOutputMaps, kernelSize, kernelWeights, biasWeights);
network->addConvolutionNd(*conv_input, nbOutputMaps, kernelSize, kernelWeights, biasWeights);
if (conv_layer == nullptr) {
MS_LOG(ERROR) << "ConvolutionLayer failed";
@ -111,15 +120,8 @@ int ConvolutionTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
activation_layer->setName((op_name_ + "_activation").c_str());
}
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_out = NCHW2NHWC(network, *activation_layer->getOutput(0));
if (transpose_layer_out == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_out->setName((op_name_ + "_transpose2NHWC").c_str());
transpose_layer_out->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(transpose_layer_out->getOutput(0));
activation_layer->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW});
return RET_OK;
}

View File

@ -16,7 +16,6 @@
#include "src/delegate/tensorrt/op/deconvolution_tensorrt.h"
#include "src/delegate/tensorrt/op/activation_tensorrt.h"
#include "src/delegate/tensorrt/tensorrt_utils.h"
#include "nnacl/pack.h"
namespace mindspore::lite {
@ -35,6 +34,10 @@ int DeconvolutionTensorRT::IsSupport(const schema::Primitive *primitive,
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) {
MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format();
return RET_ERROR;
}
return RET_OK;
}
int DeconvolutionTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
@ -47,13 +50,18 @@ int DeconvolutionTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(network, *tensorrt_in_tensors_[0]);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
nvinfer1::ITensor *deconv_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
deconv_input = transpose_layer_in->getOutput(0);
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
// transpose weight
const mindspore::MSTensor &weight_tensor = in_tensors_[1];
@ -83,8 +91,8 @@ int DeconvolutionTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
biasWeights.values = nullptr;
}
nvinfer1::IDeconvolutionLayer *deconv_layer = network->addDeconvolutionNd(
*transpose_layer_in->getOutput(0), nbOutputMaps, kernelSize, kernelWeights, biasWeights);
nvinfer1::IDeconvolutionLayer *deconv_layer =
network->addDeconvolutionNd(*deconv_input, nbOutputMaps, kernelSize, kernelWeights, biasWeights);
if (deconv_layer == nullptr) {
MS_LOG(ERROR) << "DeconvolutionLayer failed";
@ -109,15 +117,8 @@ int DeconvolutionTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
activation_layer->setName((op_name_ + "_activation").c_str());
}
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_out = NCHW2NHWC(network, *activation_layer->getOutput(0));
if (transpose_layer_out == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_out->setName((op_name_ + "_transpose2NHWC").c_str());
transpose_layer_out->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(transpose_layer_out->getOutput(0));
activation_layer->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW});
return RET_OK;
}

View File

@ -80,26 +80,38 @@ int ElementWiseTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "network or input tensor size is invalid";
return RET_ERROR;
}
first_in_tensor_index_ = strcmp(tensorrt_in_tensors_[0]->getName(), in_tensors_[0].Name().c_str()) == 0 ? 0 : 1;
// add elementwise
first_in_tensor_index_ =
strcmp(tensorrt_in_tensors_[0].trt_tensor_->getName(), in_tensors_[0].Name().c_str()) == 0 ? 0 : 1;
if (this->tensorrt_in_tensors_.size() != INPUT_SIZE2) {
// create ITensor from MS constant tensor of index 1 - first_in_tensor_index_
nvinfer1::ITensor *constant_input = nullptr;
if (this->in_tensors_[1 - first_in_tensor_index_].Shape().size() == 0) {
constant_input = lite::ConvertScalarToITensor(network, this->in_tensors_[first_in_tensor_index_].Shape().size(),
in_tensors_[1 - first_in_tensor_index_].Data().get());
} else {
constant_input = lite::ConvertConstantTensor(network, in_tensors_[1 - first_in_tensor_index_]);
int ret = AddConstTensor(network);
if (ret != RET_OK) {
MS_LOG(ERROR) << "AddConstTensor failed for " << op_name_;
return ret;
}
if (constant_input == nullptr) {
MS_LOG(ERROR) << "create Itensor from constant tensor failed: " << op_name_;
return RET_ERROR;
}
this->AddInnerInTensors(constant_input);
}
nvinfer1::IElementWiseLayer *cal_layer = network->addElementWise(
*tensorrt_in_tensors_[first_in_tensor_index_], *tensorrt_in_tensors_[1 - first_in_tensor_index_], element_wise_op_);
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ != tensorrt_in_tensors_[1].format_) {
// when inputs format are different, change to NHWC
int transpose_input_tensor = tensorrt_in_tensors_[0].format_ == Format::NCHW ? 0 : 1;
nvinfer1::IShuffleLayer *transpose_layer =
NCHW2NHWC(network, *tensorrt_in_tensors_[transpose_input_tensor].trt_tensor_);
if (transpose_layer == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer->setName((op_name_ + "_input_transpose2NHWC").c_str());
tensorrt_in_tensors_[transpose_input_tensor].trt_tensor_ = transpose_layer->getOutput(0);
tensorrt_in_tensors_[transpose_input_tensor].format_ = Format::NHWC;
} else if (tensorrt_in_tensors_[0].format_ != tensorrt_in_tensors_[1].format_) {
MS_LOG(ERROR) << "elementwise op inputs are in different format: " << op_name_;
return RET_ERROR;
}
nvinfer1::IElementWiseLayer *cal_layer =
network->addElementWise(*tensorrt_in_tensors_[first_in_tensor_index_].trt_tensor_,
*tensorrt_in_tensors_[1 - first_in_tensor_index_].trt_tensor_, element_wise_op_);
if (cal_layer == nullptr) {
MS_LOG(ERROR) << "addElementWise failed for TensorRT.";
@ -129,9 +141,8 @@ int ElementWiseTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(WARNING) << "deal with scale and shift for pow op";
}
}
op_out_tensor->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(op_out_tensor);
this->AddInnerOutTensors(ITensorHelper{op_out_tensor, tensorrt_in_tensors_[1].format_});
return RET_OK;
}
@ -184,4 +195,26 @@ nvinfer1::ITensor *ElementWiseTensorRT::AddActivation(nvinfer1::INetworkDefiniti
}
return activation_out_tensor;
}
int ElementWiseTensorRT::AddConstTensor(nvinfer1::INetworkDefinition *network) {
// create ITensor from MS constant tensor of index 1 - first_in_tensor_index_
nvinfer1::ITensor *constant_input = nullptr;
if (this->in_tensors_[1 - first_in_tensor_index_].Shape().size() == 0) {
constant_input = lite::ConvertScalarToITensor(network, this->in_tensors_[first_in_tensor_index_].Shape().size(),
in_tensors_[1 - first_in_tensor_index_].Data().get());
if (constant_input == nullptr) {
MS_LOG(ERROR) << "create Itensor from constant tensor failed: " << op_name_;
return RET_ERROR;
}
this->AddInnerInTensors(ITensorHelper{constant_input, tensorrt_in_tensors_[0].format_});
} else {
constant_input = lite::ConvertConstantTensor(network, in_tensors_[1 - first_in_tensor_index_]);
if (constant_input == nullptr) {
MS_LOG(ERROR) << "create Itensor from constant tensor failed: " << op_name_;
return RET_ERROR;
}
this->AddInnerInTensors(ITensorHelper{constant_input, Format::NHWC});
}
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -37,6 +37,8 @@ class ElementWiseTensorRT : public TensorRTOp {
private:
nvinfer1::ITensor *AddActivation(nvinfer1::INetworkDefinition *network, nvinfer1::ITensor *in_tensor);
int AddConstTensor(nvinfer1::INetworkDefinition *network);
nvinfer1::ElementWiseOperation element_wise_op_;
// index of first input MSTensor in the trt input tensor vector

View File

@ -59,15 +59,30 @@ int GatherTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "add a new tensor failed for TensorRT GatherTensorRTOp.";
return RET_ERROR;
}
nvinfer1::IGatherLayer *gather_layer =
network->addGather(*tensorrt_in_tensors_[0], *add_tensor /* indices */, axis_ /* axis */);
nvinfer1::ITensor *gather_input = tensorrt_in_tensors_[0].trt_tensor_;
Format out_format = tensorrt_in_tensors_[0].format_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NCHW) {
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str());
gather_input = transpose_layer_in->getOutput(0);
out_format = Format::NHWC;
}
nvinfer1::IGatherLayer *gather_layer = network->addGather(*gather_input, *add_tensor /* indices */, axis_ /* axis */);
if (gather_layer == nullptr) {
MS_LOG(ERROR) << "addGather failed for TensorRT.";
return RET_ERROR;
}
gather_layer->setName(op_name_.c_str());
gather_layer->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(gather_layer->getOutput(0));
this->AddInnerOutTensors(ITensorHelper{gather_layer->getOutput(0), out_format});
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -43,7 +43,22 @@ int MatMulTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
transpose_b_ = primitive->transpose_b() ? nvinfer1::MatrixOperation::kTRANSPOSE : nvinfer1::MatrixOperation::kNONE;
auto weight = ConvertTensorWithExpandDims(network, in_tensors_[1], in_tensors_[0].Shape().size());
auto matmul_layer = network->addMatrixMultiply(*tensorrt_in_tensors_[0], transpose_a_, *weight, transpose_b_);
nvinfer1::ITensor *matmul_input = tensorrt_in_tensors_[0].trt_tensor_;
Format out_format = tensorrt_in_tensors_[0].format_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NCHW) {
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str());
matmul_input = transpose_layer_in->getOutput(0);
out_format = Format::NHWC;
}
auto matmul_layer = network->addMatrixMultiply(*matmul_input, transpose_a_, *weight, transpose_b_);
matmul_layer->setName(op_name_.c_str());
nvinfer1::ITensor *out_tensor = matmul_layer->getOutput(0);
@ -56,7 +71,7 @@ int MatMulTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
}
out_tensor->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(out_tensor);
this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format});
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -49,6 +49,10 @@ int PadTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
MS_LOG(ERROR) << "Unsupported padding mode: " << pad_primitive << ", for op: " << op_name_;
return RET_ERROR;
}
if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) {
MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format();
return RET_ERROR;
}
constant_value_ = pad_primitive->constant_value();
return RET_OK;
}
@ -56,18 +60,24 @@ int PadTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
int PadTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
mindspore::MSTensor &pad_tensor = in_tensors_[1];
int element_cnt = std::accumulate(pad_tensor.Shape().begin(), pad_tensor.Shape().end(), 1, std::multiplies<int>());
if (element_cnt != tensorrt_in_tensors_[0]->getDimensions().nbDims * 2) {
if (element_cnt != tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims * 2) {
MS_LOG(ERROR) << "pad tensor cnt is invalid. cnt: " << element_cnt
<< ", input tensor dims cnt: " << tensorrt_in_tensors_[0]->getDimensions().nbDims;
<< ", input tensor dims cnt: " << tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims;
return RET_ERROR;
}
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(network, *tensorrt_in_tensors_[0]);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
nvinfer1::ITensor *pad_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
pad_input = transpose_layer_in->getOutput(0);
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
// trt 6 only support 2D padding
const int *padding_data = reinterpret_cast<const int *>(in_tensors_[1].Data().get());
@ -84,7 +94,7 @@ int PadTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(INFO) << "prePadding: " << *(padding_data + 2) << ", " << *(padding_data + 4);
MS_LOG(INFO) << "postPadding: " << *(padding_data + 3) << ", " << *(padding_data + 5);
padding_layer = network->addPadding(*transpose_layer_in->getOutput(0), prePadding, postPadding);
padding_layer = network->addPadding(*pad_input, prePadding, postPadding);
} else {
MS_LOG(ERROR) << "need check for pad_tensor dims: " << op_name_
<< ", pad_tensor ElementNum: " << pad_tensor.ElementNum();
@ -95,17 +105,8 @@ int PadTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
return RET_ERROR;
}
padding_layer->setName(op_name_.c_str());
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_out = NCHW2NHWC(network, *padding_layer->getOutput(0));
if (transpose_layer_out == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_out->setName((op_name_ + "_transpose2NHWC").c_str());
transpose_layer_out->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(transpose_layer_out->getOutput(0));
padding_layer->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(ITensorHelper{padding_layer->getOutput(0), Format::NCHW});
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -34,6 +34,10 @@ int PoolTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
return RET_ERROR;
}
if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) {
MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format();
return RET_ERROR;
}
return RET_OK;
}
@ -47,13 +51,18 @@ int PoolTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "invalid input tensor size: " << tensorrt_in_tensors_.size();
return RET_ERROR;
}
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(network, *tensorrt_in_tensors_[0]);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
nvinfer1::ITensor *pool_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NHWC) {
// transpose: NHWC->NCHW
nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "transpose: NHWC->NCHW failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
pool_input = transpose_layer_in->getOutput(0);
}
transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str());
// pooling layer
nvinfer1::PoolingType pooling_type = nvinfer1::PoolingType::kAVERAGE;
@ -64,8 +73,7 @@ int PoolTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
}
std::vector<int64_t> kernel_size_val = std::vector<int64_t>(kernel_size->begin(), kernel_size->end());
nvinfer1::Dims windowSize = lite::ConvertCudaDims(kernel_size_val);
nvinfer1::IPoolingLayer *pooling_layer =
network->addPoolingNd(*transpose_layer_in->getOutput(0), pooling_type, windowSize);
nvinfer1::IPoolingLayer *pooling_layer = network->addPoolingNd(*pool_input, pooling_type, windowSize);
if (pooling_layer == nullptr) {
MS_LOG(ERROR) << "addPoolingNd failed for TensorRT.";
return RET_ERROR;
@ -86,15 +94,8 @@ int PoolTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
}
activation_layer->setName((op_name_ + "_activation").c_str());
}
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_out = NCHW2NHWC(network, *activation_layer->getOutput(0));
if (transpose_layer_out == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_out->setName((op_name_ + "_transpose2NHWC").c_str());
transpose_layer_out->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(transpose_layer_out->getOutput(0));
activation_layer->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW});
return RET_OK;
}

View File

@ -28,7 +28,7 @@ int ReduceTensorRT::IsSupport(const schema::Primitive *primitive, const std::vec
MS_LOG(ERROR) << "convert failed";
return RET_ERROR;
}
if (in_tensors.size() != 2) {
if (in_tensors.size() != INPUT_SIZE2) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
}
if (out_tensors.size() != 1) {
@ -55,23 +55,17 @@ int ReduceTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
return RET_ERROR;
}
bool keep_dims = reduce_op->keep_dims();
// axis
uint32_t reduceAxes = 0;
mindspore::MSTensor axis_tensor = this->in_tensors_[1];
if (axis_tensor.Data() == nullptr) {
MS_LOG(ERROR) << "invalid axis_tensor";
return RET_ERROR;
nvinfer1::ITensor *reduce_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D &&
tensorrt_in_tensors_[0].format_ == Format::NCHW) {
out_format_ = Format::NHWC;
} else {
out_format_ = tensorrt_in_tensors_[0].format_;
}
if (axis_tensor.DataType() != DataType::kNumberTypeInt32) {
MS_LOG(WARNING) << "not int data type";
}
int *axis_data = reinterpret_cast<int *>(axis_tensor.MutableData());
for (int i = 0; i < axis_tensor.ElementNum(); i++) {
reduceAxes |= (16 - (1u << *axis_data));
axis_data++;
}
MS_LOG(INFO) << "reduceAxes: " << reduceAxes;
nvinfer1::IReduceLayer *layer = network->addReduce(*tensorrt_in_tensors_[0], reduce_op_, reduceAxes, keep_dims);
uint32_t reduceAxis = GetAxis();
nvinfer1::IReduceLayer *layer = network->addReduce(*reduce_input, reduce_op_, reduceAxis, keep_dims);
if (layer == nullptr) {
MS_LOG(ERROR) << "addReduce failed for TensorRT.";
return RET_ERROR;
@ -84,7 +78,29 @@ int ReduceTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
return RET_ERROR;
}
out_tensor->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(out_tensor);
this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format_});
return RET_OK;
}
uint32_t ReduceTensorRT::GetAxis() {
// axis
uint32_t reduceAxis = 0;
mindspore::MSTensor axis_tensor = this->in_tensors_[1];
if (axis_tensor.Data() == nullptr) {
MS_LOG(ERROR) << "invalid axis_tensor";
return reduceAxis;
}
if (axis_tensor.DataType() != DataType::kNumberTypeInt32) {
MS_LOG(WARNING) << "not int data type";
}
int *axis_data = reinterpret_cast<int *>(axis_tensor.MutableData());
bool need_transpose_axis =
(out_format_ == Format::NCHW) && (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D);
for (int i = 0; i < axis_tensor.ElementNum(); i++) {
int format_axis_data = need_transpose_axis ? ConvertAxisFromNHWC2NCHW(*axis_data) : *axis_data;
reduceAxis |= (16 - (1u << format_axis_data));
axis_data++;
}
MS_LOG(INFO) << "reduceAxis: " << reduceAxis;
return reduceAxis;
}
} // namespace mindspore::lite

View File

@ -36,6 +36,7 @@ class ReduceTensorRT : public TensorRTOp {
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
uint32_t GetAxis();
std::map<schema::ReduceMode, nvinfer1::ReduceOperation> reduce_ops_ = {
{schema::ReduceMode::ReduceMode_ReduceMean, nvinfer1::ReduceOperation::kAVG},
{schema::ReduceMode::ReduceMode_ReduceMax, nvinfer1::ReduceOperation::kMAX},
@ -44,6 +45,7 @@ class ReduceTensorRT : public TensorRTOp {
{schema::ReduceMode::ReduceMode_ReduceSum, nvinfer1::ReduceOperation::kSUM},
};
nvinfer1::ReduceOperation reduce_op_;
Format out_format_;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_

View File

@ -17,6 +17,7 @@
#include <numeric>
#include <functional>
#include "src/delegate/tensorrt/op/scale_tensorrt.h"
#include "src/delegate/tensorrt/op/activation_tensorrt.h"
#include "src/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
@ -53,14 +54,26 @@ int ScaleTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
}
schema::ActivationType activation_type = scale_op->activation_type();
nvinfer1::ITensor *scale_in_tensor = tensorrt_in_tensors_[0];
// unsqueeze input Itensor to 4 dims
nvinfer1::ITensor *scale_in_tensor = tensorrt_in_tensors_[0].trt_tensor_;
Format out_format = in_tensors_[0].format();
if (in_tensors_[0].Shape().size() < INPUT_SIZE4) {
// unsqueeze input Itensor to 4 dims
scale_in_tensor = AddUnsqueezeOp(network);
if (scale_in_tensor == nullptr) {
MS_LOG(ERROR) << "AddUnsqueezeOp failed";
return RET_ERROR;
}
} else if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == 4 &&
tensorrt_in_tensors_[0].format_ == Format::NCHW) {
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str());
scale_in_tensor = transpose_layer_in->getOutput(0);
out_format = Format::NHWC;
}
// mode of scale
size_t axis = scale_op->axis();
@ -100,18 +113,27 @@ int ScaleTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
return RET_ERROR;
}
cal_layer->setName(op_name_.c_str());
nvinfer1::ITensor *op_out_tensor = cal_layer->getOutput(0);
if (op_out_tensor == nullptr) {
MS_LOG(ERROR) << "addScaleNd output tensor is invalid for: " << op_name_;
return RET_ERROR;
}
// add activation
nvinfer1::ITensor *activation_tensor = cal_layer->getOutput(0);
if (activation_type != schema::ActivationType::ActivationType_NO_ACTIVATION) {
MS_LOG(WARNING) << "need activation for: " << op_name_;
auto activation_layer = ActivationTensorRT::AddActivation(network, activation_type, 0, cal_layer->getOutput(0));
if (activation_layer == nullptr) {
MS_LOG(ERROR) << "addActivation for scale failed";
return RET_ERROR;
}
activation_layer->setName((op_name_ + "_activation").c_str());
activation_tensor = activation_layer->getOutput(0);
}
// squeeze to origin dim
nvinfer1::ITensor *op_out_tensor = activation_tensor;
if (activation_tensor->getDimensions().nbDims > static_cast<int>(out_tensors_[0].Shape().size())) {
op_out_tensor = AddSqueezeOp(activation_tensor, network);
}
op_out_tensor->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(op_out_tensor);
this->AddInnerOutTensors(ITensorHelper{op_out_tensor, out_format});
return RET_OK;
}
@ -136,7 +158,7 @@ nvinfer1::ScaleMode ScaleTensorRT::GetScaleMode(size_t axis) {
}
nvinfer1::ITensor *ScaleTensorRT::AddUnsqueezeOp(nvinfer1::INetworkDefinition *network) {
nvinfer1::IShuffleLayer *unsqueeze_layer = network->addShuffle(*this->tensorrt_in_tensors_[0]);
nvinfer1::IShuffleLayer *unsqueeze_layer = network->addShuffle(*this->tensorrt_in_tensors_[0].trt_tensor_);
if (unsqueeze_layer == nullptr) {
MS_LOG(ERROR) << "addShuffle failed for: " << op_name_;
return nullptr;
@ -150,4 +172,17 @@ nvinfer1::ITensor *ScaleTensorRT::AddUnsqueezeOp(nvinfer1::INetworkDefinition *n
unsqueeze_layer->setReshapeDimensions(unsqueeze_dims);
return unsqueeze_layer->getOutput(0);
}
nvinfer1::ITensor *ScaleTensorRT::AddSqueezeOp(nvinfer1::ITensor *in_tensor, nvinfer1::INetworkDefinition *network) {
nvinfer1::IShuffleLayer *squeeze_layer = network->addShuffle(*in_tensor);
if (squeeze_layer == nullptr) {
MS_LOG(ERROR) << "addShuffle failed for: " << op_name_;
return nullptr;
}
squeeze_layer->setName((op_name_ + "_squeeze").c_str());
nvinfer1::Dims squeeze_dims = lite::ConvertCudaDims(out_tensors_[0].Shape());
MS_LOG(INFO) << "squeeze_dims cnt for scale: " << squeeze_dims.nbDims;
squeeze_layer->setReshapeDimensions(squeeze_dims);
return squeeze_layer->getOutput(0);
}
} // namespace mindspore::lite

View File

@ -38,6 +38,8 @@ class ScaleTensorRT : public TensorRTOp {
private:
nvinfer1::ITensor *AddUnsqueezeOp(nvinfer1::INetworkDefinition *network);
nvinfer1::ITensor *AddSqueezeOp(nvinfer1::ITensor *in_tensor, nvinfer1::INetworkDefinition *network);
nvinfer1::ScaleMode GetScaleMode(size_t axis);
};
} // namespace mindspore::lite

View File

@ -38,7 +38,7 @@ int ShapeTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "network is invalid";
return RET_ERROR;
}
nvinfer1::IShapeLayer *shape_layer = network->addShape(*tensorrt_in_tensors_[0]);
nvinfer1::IShapeLayer *shape_layer = network->addShape(*tensorrt_in_tensors_[0].trt_tensor_);
if (shape_layer == nullptr) {
MS_LOG(DEBUG) << "add shape op failed for TensorRT.";
@ -46,7 +46,7 @@ int ShapeTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
}
shape_layer->setName(op_name_.c_str());
shape_layer->getOutput(0)->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(shape_layer->getOutput(0));
this->AddInnerOutTensors(ITensorHelper{shape_layer->getOutput(0), tensorrt_in_tensors_[0].format_});
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -72,58 +72,56 @@ int ShuffleTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "network is invalid";
return RET_ERROR;
}
nvinfer1::IShuffleLayer *shuffle_layer = network->addShuffle(*tensorrt_in_tensors_[0]);
nvinfer1::ITensor *shuffler_input = tensorrt_in_tensors_[0].trt_tensor_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == 4 &&
tensorrt_in_tensors_[0].format_ == Format::NCHW && !tensorrt_in_tensors_[0].trt_tensor_->isNetworkInput()) {
// network input tensor format can be NCHW
nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer == nullptr) {
MS_LOG(ERROR) << "create transpose layer failed for " << op_name_;
}
transpose_layer->setName((op_name_ + "_transpose_in").c_str());
shuffler_input = transpose_layer->getOutput(0);
}
nvinfer1::IShuffleLayer *shuffle_layer = network->addShuffle(*shuffler_input);
if (shuffle_layer == nullptr) {
MS_LOG(ERROR) << "add Shuffle op failed for TensorRT.";
return RET_ERROR;
}
shuffle_layer->setName(op_name_.c_str());
int ret = RET_OK;
switch (type_) {
case schema::PrimitiveType_Unsqueeze: {
int ret = AddUnsqueezeOp(shuffle_layer);
if (ret != RET_OK) {
MS_LOG(ERROR) << "AddUnSqueezeOp failed.";
return ret;
}
ret = AddUnsqueezeOp(shuffle_layer);
break;
}
case schema::PrimitiveType_Squeeze: {
int ret = AddSqueezeOp(shuffle_layer);
if (ret != RET_OK) {
MS_LOG(ERROR) << "AddSqueezeOp failed.";
return ret;
}
ret = AddSqueezeOp(shuffle_layer);
break;
}
case schema::PrimitiveType_Transpose: {
int ret = AddTransposeOp(shuffle_layer);
if (ret != RET_OK) {
MS_LOG(ERROR) << "AddTransposeOpss failed.";
return ret;
}
ret = AddTransposeOp(shuffle_layer);
break;
}
case schema::PrimitiveType_Reshape: {
int ret = AddReshapeOp(shuffle_layer);
if (ret != RET_OK) {
MS_LOG(ERROR) << "AddReshapeOp failed.";
return ret;
}
ret = AddReshapeOp(shuffle_layer);
break;
}
case schema::PrimitiveType_Flatten: {
int ret = AddFlattenOp(shuffle_layer);
if (ret != RET_OK) {
MS_LOG(ERROR) << "AddFlattenOp failed.";
return ret;
}
ret = AddFlattenOp(shuffle_layer);
break;
}
default:
MS_LOG(ERROR) << "Unsupported op type.";
MS_LOG(ERROR) << "Unsupported op type for " << op_name_;
return RET_ERROR;
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "AddOp failed for " << op_name_;
return ret;
}
nvinfer1::ITensor *out_tensor = shuffle_layer->getOutput(0);
if (out_tensor == nullptr) {
@ -131,7 +129,7 @@ int ShuffleTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
return RET_ERROR;
}
out_tensor->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(out_tensor);
this->AddInnerOutTensors(ITensorHelper{out_tensor, Format::NHWC});
return RET_OK;
}
@ -177,7 +175,7 @@ int ShuffleTensorRT::AddUnsqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer) {
MS_LOG(WARNING) << "AddUnsqueezeOp size of in tensort needs check: " << in_tensors_.size();
}
// axis
auto unsqueeze_shape = tensorrt_in_tensors_[0]->getDimensions();
auto unsqueeze_shape = tensorrt_in_tensors_[0].trt_tensor_->getDimensions();
std::vector<int64_t> new_shape(unsqueeze_shape.d, unsqueeze_shape.d + unsqueeze_shape.nbDims);
auto axis = unsqueeze_op->axis();
@ -229,7 +227,7 @@ int ShuffleTensorRT::AddReshapeOp(nvinfer1::IShuffleLayer *shuffle_layer) {
MS_LOG(ERROR) << "invalid shape tensor for reshape " << op_name_;
return RET_ERROR;
}
shuffle_layer->setInput(1, *tensorrt_in_tensors_[1]);
shuffle_layer->setInput(1, *tensorrt_in_tensors_[1].trt_tensor_);
}
return RET_OK;
}

View File

@ -53,7 +53,22 @@ int SliceTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
nvinfer1::Dims size_dims = lite::ConvertCudaDims(out_tensors_[0].Shape());
nvinfer1::Dims stride_dims = lite::ConvertCudaDims(stride.Data().get(), stride.ElementNum());
nvinfer1::ISliceLayer *slice_layer = network->addSlice(*tensorrt_in_tensors_[0], start_dims, size_dims, stride_dims);
nvinfer1::ITensor *slice_input = tensorrt_in_tensors_[0].trt_tensor_;
Format out_format = tensorrt_in_tensors_[0].format_;
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == 4 &&
tensorrt_in_tensors_[0].format_ == Format::NCHW) {
// transpose: NCHW->NHWC
nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(network, *tensorrt_in_tensors_[0].trt_tensor_);
if (transpose_layer_in == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str());
slice_input = transpose_layer_in->getOutput(0);
out_format = Format::NHWC;
}
nvinfer1::ISliceLayer *slice_layer = network->addSlice(*slice_input, start_dims, size_dims, stride_dims);
if (slice_layer == nullptr) {
MS_LOG(ERROR) << "add Slice op failed for TensorRT: " << op_name_;
return RET_ERROR;
@ -65,7 +80,7 @@ int SliceTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
return RET_ERROR;
}
out_tensor->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(out_tensor);
this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format});
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -23,19 +23,10 @@ int SoftMaxTensorRT::IsSupport(const schema::Primitive *primitive, const std::ve
MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
return RET_ERROR;
}
if (primitive->value_type() == schema::PrimitiveType::PrimitiveType_LogSoftmax) {
with_log_ = true;
auto softmax_op = primitive->value_as_LogSoftmax();
if (softmax_op == nullptr) {
MS_LOG(ERROR) << "LogSoftmax convert failed";
return RET_ERROR;
}
} else {
auto softmax_op = primitive->value_as_Softmax();
if (softmax_op == nullptr) {
MS_LOG(ERROR) << "convert failed";
return RET_ERROR;
}
softmax_op_ = primitive->value_as_Softmax();
if (softmax_op_ == nullptr) {
MS_LOG(ERROR) << "convert failed";
return RET_ERROR;
}
if (in_tensors.size() != 1) {
@ -48,7 +39,6 @@ int SoftMaxTensorRT::IsSupport(const schema::Primitive *primitive, const std::ve
}
return RET_OK;
}
int SoftMaxTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
if (network == nullptr) {
MS_LOG(ERROR) << "network is invalid";
@ -66,58 +56,36 @@ int SoftMaxTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "softmax output tensor create failed for TensorRT.";
return RET_ERROR;
}
if (with_log_) {
nvinfer1::IUnaryLayer *log_layer = network->addUnary(*out_tensor, nvinfer1::UnaryOperation::kLOG);
if (log_layer == nullptr) {
MS_LOG(ERROR) << "add log op failed for TensorRT.";
return RET_ERROR;
}
log_layer->setName((op_name_ + "_log").c_str());
out_tensor = log_layer->getOutput(0);
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "softmax log output tensor create failed for TensorRT.";
return RET_ERROR;
}
}
out_tensor->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(out_tensor);
this->AddInnerOutTensors(ITensorHelper{out_tensor, tensorrt_in_tensors_[0].format_});
return RET_OK;
}
nvinfer1::ISoftMaxLayer *SoftMaxTensorRT::AddSoftMaxOp(nvinfer1::INetworkDefinition *network) {
nvinfer1::ISoftMaxLayer *current_layer_ = network->addSoftMax(*this->GetInnerInTensors()[0]);
nvinfer1::ISoftMaxLayer *current_layer_ = network->addSoftMax(*tensorrt_in_tensors_[0].trt_tensor_);
if (current_layer_ == nullptr) {
MS_LOG(ERROR) << "add softmax op failed for TensorRT.";
return nullptr;
}
std::vector<int64_t> axis_val;
if (with_log_) {
auto softmax_op = this->GetPrimitive()->value_as_LogSoftmax();
if (softmax_op == nullptr) {
MS_LOG(ERROR) << "LogSoftmax convert failed";
return nullptr;
}
int64_t axis = softmax_op->axis();
axis_val.push_back(axis);
} else {
auto softmax_op = this->GetPrimitive()->value_as_Softmax();
if (softmax_op == nullptr) {
MS_LOG(ERROR) << "Softmax convert failed";
return nullptr;
}
auto axis = softmax_op->axis();
axis_val = std::vector<int64_t>(axis->begin(), axis->end());
}
auto axis = softmax_op_->axis();
axis_val = std::vector<int64_t>(axis->begin(), axis->end());
if (axis_val.size() != 1) {
MS_LOG(WARNING) << "axis needs check";
}
if (axis_val[0] >= this->tensorrt_in_tensors_[0]->getDimensions().nbDims) {
if (axis_val[0] >= this->tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims) {
MS_LOG(ERROR) << "axis is larger than input tensor dims.";
return nullptr;
}
current_layer_->setAxes(axis_val[0]);
int64_t axis_format_value = axis_val[0];
if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == 4 &&
tensorrt_in_tensors_[0].format_ == Format::NCHW) {
// transpose axis to NCHW
axis_format_value = ConvertAxisFromNHWC2NCHW(axis_val[0]);
}
current_layer_->setAxes(axis_format_value);
return current_layer_;
}
} // namespace mindspore::lite

View File

@ -34,8 +34,9 @@ class SoftMaxTensorRT : public TensorRTOp {
const std::vector<mindspore::MSTensor> &out_tensors) override;
private:
bool with_log_ = false;
nvinfer1::ISoftMaxLayer *AddSoftMaxOp(nvinfer1::INetworkDefinition *network);
const schema::Softmax *softmax_op_;
};
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_SOFTMAX_TENSORRT_H_

View File

@ -19,13 +19,13 @@
namespace mindspore::lite {
const schema::Primitive *TensorRTOp::GetPrimitive() { return this->op_primitive_; }
void TensorRTOp::AddInnerInTensors(nvinfer1::ITensor *tensor) { this->tensorrt_in_tensors_.push_back(tensor); }
void TensorRTOp::AddInnerInTensors(ITensorHelper tensor) { this->tensorrt_in_tensors_.push_back(tensor); }
void TensorRTOp::AddInnerOutTensors(nvinfer1::ITensor *tensor) { this->tensorrt_out_tensors_.push_back(tensor); }
void TensorRTOp::AddInnerOutTensors(ITensorHelper tensor) { this->tensorrt_out_tensors_.push_back(tensor); }
std::vector<nvinfer1::ITensor *> &TensorRTOp::GetInnerOutTensor() { return this->tensorrt_out_tensors_; }
std::vector<ITensorHelper> &TensorRTOp::GetInnerOutTensor() { return this->tensorrt_out_tensors_; }
std::vector<nvinfer1::ITensor *> &TensorRTOp::GetInnerInTensors() { return this->tensorrt_in_tensors_; }
std::vector<ITensorHelper> &TensorRTOp::GetInnerInTensors() { return this->tensorrt_in_tensors_; }
std::string TensorRTOp::GetOpName() { return this->op_name_; }

View File

@ -23,12 +23,18 @@
#include "include/api/kernel.h"
#include "src/common/log_adapter.h"
#include "include/errorcode.h"
#include "src/delegate/tensorrt/tensorrt_utils.h"
namespace mindspore::lite {
constexpr int INPUT_SIZE2 = 2;
constexpr int INPUT_SIZE3 = 3;
constexpr int INPUT_SIZE4 = 4;
struct ITensorHelper {
nvinfer1::ITensor *trt_tensor_{nullptr};
mindspore::Format format_;
};
class TensorRTOp {
public:
explicit TensorRTOp(const schema::Primitive *primitive, std::vector<mindspore::MSTensor> in_tensors,
@ -51,13 +57,13 @@ class TensorRTOp {
const schema::Primitive *GetPrimitive();
void AddInnerInTensors(nvinfer1::ITensor *tensor);
void AddInnerInTensors(ITensorHelper tensor);
void AddInnerOutTensors(nvinfer1::ITensor *tensor);
void AddInnerOutTensors(ITensorHelper tensor);
std::vector<nvinfer1::ITensor *> &GetInnerOutTensor();
std::vector<ITensorHelper> &GetInnerOutTensor();
std::vector<nvinfer1::ITensor *> &GetInnerInTensors();
std::vector<ITensorHelper> &GetInnerInTensors();
std::string GetOpName();
@ -86,9 +92,9 @@ class TensorRTOp {
std::vector<mindspore::MSTensor> out_tensors_;
std::vector<nvinfer1::ITensor *> tensorrt_in_tensors_;
std::vector<ITensorHelper> tensorrt_in_tensors_;
std::vector<nvinfer1::ITensor *> tensorrt_out_tensors_;
std::vector<ITensorHelper> tensorrt_out_tensors_;
std::vector<TensorRTOp *> in_ops_;

View File

@ -44,7 +44,7 @@ int UnaryTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
MS_LOG(ERROR) << "network or input tensor is invalid";
return RET_ERROR;
}
nvinfer1::IUnaryLayer *cal_layer = network->addUnary(*tensorrt_in_tensors_[0], unary_op_);
nvinfer1::IUnaryLayer *cal_layer = network->addUnary(*tensorrt_in_tensors_[0].trt_tensor_, unary_op_);
if (cal_layer == nullptr) {
MS_LOG(ERROR) << "addUnary failed for: " << op_name_;
return RET_ERROR;
@ -53,7 +53,7 @@ int UnaryTensorRT::AddInnerOp(nvinfer1::INetworkDefinition *network) {
nvinfer1::ITensor *op_out_tensor = cal_layer->getOutput(0);
op_out_tensor->setName(out_tensors_[0].Name().c_str());
this->AddInnerOutTensors(op_out_tensor);
this->AddInnerOutTensors(ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_});
return RET_OK;
}
} // namespace mindspore::lite

View File

@ -187,6 +187,7 @@ nvinfer1::ITensor *TensorRTSubGraph::SetTensorRTNetworkInput(const mindspore::MS
// only support NHWC HW dim resize
if (input_hw_index_ != -1) {
MS_LOG(INFO) << "input tensor format: " << in_tensor.format();
input_hw_index_ = in_tensor.format() == Format::NHWC ? 1 : /* NCHW*/ 2;
input_dims.d[input_hw_index_] = -1;
input_dims.d[input_hw_index_ + 1] = -1;
@ -208,19 +209,20 @@ int TensorRTSubGraph::BuildTensorRTGraph() {
MS_LOG(ERROR) << "SetTensorRTNetworkInput failed for " << in_tensor.Name();
return RET_ERROR;
}
cur_op->AddInnerInTensors(trt_tensor);
cur_op->AddInnerInTensors(ITensorHelper{trt_tensor, in_tensor.format()});
continue;
}
auto trt_tensor = FindTensorRTInputs(cur_op, in_tensor);
// weight tensor
if (trt_tensor == nullptr) {
ITensorHelper trt_tensor = FindTensorRTInputs(cur_op, in_tensor);
if (trt_tensor.trt_tensor_ == nullptr) {
// weight tensor
if (trt_specific_weight_nodes_.find(cur_op->type()) == trt_specific_weight_nodes_.end()) {
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "Weight Tensor is nullptr.";
return RET_ERROR;
}
trt_tensor = lite::ConvertConstantTensor(this->network_, in_tensor);
trt_tensor.trt_tensor_ = lite::ConvertConstantTensor(this->network_, in_tensor);
trt_tensor.format_ = Format::NHWC;
MS_LOG(INFO) << "auto convert constant tensor for: " << cur_op->GetOpName();
cur_op->AddInnerInTensors(trt_tensor);
}
@ -236,16 +238,44 @@ int TensorRTSubGraph::BuildTensorRTGraph() {
}
}
ret = MarkOutputs();
if (ret != RET_OK) {
MS_LOG(ERROR) << "MarkOutputs failed in TensorRT network";
return ret;
}
ret = BuildEngine();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Create engine failed in TensorRT network";
return ret;
}
return RET_OK;
}
int TensorRTSubGraph::MarkOutputs() {
// Mark NetWork Output Tensor.
for (auto out_tensor : outputs_) {
for (auto out_op : this->out_ops_) {
for (size_t index = 0; index < out_op->outputs().size(); index++) {
if (out_op->outputs()[index] == out_tensor) {
out_op->GetInnerOutTensor()[index]->setName(out_tensor.Name().c_str());
nvinfer1::ITensor *out_trt_tensor = out_op->GetInnerOutTensor()[index].trt_tensor_;
if (out_op->GetInnerOutTensor()[index].trt_tensor_->getDimensions().nbDims == 4 &&
out_op->GetInnerOutTensor()[index].format_ == Format::NCHW) {
// transpose subgraph output from nchw to nhwc
nvinfer1::IShuffleLayer *transpose_layer_out =
NCHW2NHWC(network_, *out_op->GetInnerOutTensor()[index].trt_tensor_);
if (transpose_layer_out == nullptr) {
MS_LOG(ERROR) << "op action convert failed";
return RET_ERROR;
}
transpose_layer_out->setName((out_tensor.Name() + "_transpose2NHWC").c_str());
}
out_trt_tensor->setName(out_tensor.Name().c_str());
MS_LOG(INFO) << "markOutput for: " << out_tensor.Name();
this->network_->markOutput(*out_op->GetInnerOutTensor()[index]);
for (int n = 0; n < out_op->GetInnerOutTensor()[index]->getDimensions().nbDims; n++) {
if (out_op->GetInnerOutTensor()[index]->getDimensions().d[n] == -1) {
this->network_->markOutput(*out_trt_tensor);
for (int n = 0; n < out_trt_tensor->getDimensions().nbDims; n++) {
if (out_trt_tensor->getDimensions().d[n] == -1) {
output_batchsize_index_ = n;
break;
}
@ -254,12 +284,6 @@ int TensorRTSubGraph::BuildTensorRTGraph() {
}
}
}
ret = BuildEngine();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Create engine failed in TensorRT network";
return ret;
}
return RET_OK;
}
@ -292,7 +316,7 @@ int TensorRTSubGraph::Prepare() {
trt_in_tensor_name_.push_back(tensor.Name());
nvinfer1::Dims input_dims = ConvertCudaDims(tensor.Shape());
for (int od = 0; od < input_dims.nbDims; od++) {
MS_LOG(INFO) << "in tensor " << tensor.Name() << " dims at " << od << " is " << input_dims.d[od];
MS_LOG(DEBUG) << "in tensor " << tensor.Name() << " dims at " << od << " is " << input_dims.d[od];
}
if (!this->trt_context_->setBindingDimensions(index, input_dims)) {
@ -363,7 +387,7 @@ int TensorRTSubGraph::ReSize() {
// Set actual input size
nvinfer1::Dims input_dims = ConvertCudaDims(inputs_[i].Shape());
for (int od = 0; od < input_dims.nbDims; od++) {
MS_LOG(INFO) << "in tensor " << trt_in_tensor_name_[i] << " dims at " << od << " is " << input_dims.d[od];
MS_LOG(DEBUG) << "in tensor " << trt_in_tensor_name_[i] << " dims at " << od << " is " << input_dims.d[od];
}
if (!this->trt_context_->setBindingDimensions(index, input_dims)) {
@ -420,7 +444,7 @@ int TensorRTSubGraph::Execute() {
new_shape[output_batchsize_index_] = runtime_->GetBatchSize();
}
for (int od = 0; od < out_dims.nbDims; od++) {
MS_LOG(INFO) << "out tensor " << trt_out_tensor_name_[i] << " dims at " << od << " is " << new_shape[od];
MS_LOG(DEBUG) << "out tensor " << trt_out_tensor_name_[i] << " dims at " << od << " is " << new_shape[od];
}
outputs_[i].SetShape(new_shape);
@ -438,7 +462,7 @@ int TensorRTSubGraph::Execute() {
return RET_OK;
}
nvinfer1::ITensor *TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor) {
ITensorHelper TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor) {
for (auto input_op : cur_op->in_ops()) {
for (size_t i = 0; i < input_op->outputs().size(); i++) {
auto out_tensor = input_op->outputs().at(i);
@ -447,6 +471,6 @@ nvinfer1::ITensor *TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, cons
}
}
}
return nullptr;
return ITensorHelper{};
}
} // namespace mindspore::lite

View File

@ -67,7 +67,9 @@ class TensorRTSubGraph : public kernel::Kernel {
nvinfer1::ITensor *SetTensorRTNetworkInput(const mindspore::MSTensor &in_tensor);
static nvinfer1::ITensor *FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor);
ITensorHelper FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor);
int MarkOutputs();
std::vector<TensorRTOp *> all_ops_{};
// subgraph input nodes.

View File

@ -254,4 +254,37 @@ void SetCudaDevice(std::shared_ptr<GPUDeviceInfo> device_info_) {
}
MS_LOG(INFO) << "cuda is running on device: " << device;
}
Format GetOutputFormat(Format input_format, nvinfer1::Permutation perm) {
if (input_format == Format::NHWC) {
if (perm.order[0] == 0 && perm.order[1] == 3 && perm.order[2] == 2 && perm.order[3] == 1) {
return Format::NCHW;
}
} else if (input_format == Format::NCHW) {
if (perm.order[0] == 0 && perm.order[1] == 2 && perm.order[2] == 3 && perm.order[3] == 1) {
return Format::NHWC;
}
}
MS_LOG(WARNING) << "transpose out format needs to check for " << input_format;
return input_format;
}
int ConvertAxisFromNHWC2NCHW(int nhwc_axis) {
// N0H1W2C3->N0C1H2W3
if (nhwc_axis > kNHWC_C) {
return nhwc_axis;
}
switch (nhwc_axis) {
case kNHWC_N:
return kNCHW_N;
case kNHWC_H:
return kNCHW_H;
case kNHWC_W:
return kNCHW_W;
case kNHWC_C:
return kNCHW_C;
default:
MS_LOG(ERROR) << "invalid input axis for nhwc: " << nhwc_axis;
}
return nhwc_axis;
}
} // namespace mindspore::lite
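A worked example of the axis remapping added above, assuming a 4-D tensor; the mapping follows the kNHWC_*/kNCHW_* macros declared in tensorrt_utils.h below (N0 H1 W2 C3 -> N0 C1 H2 W3). This is an illustration, not delegate code:

#include <cassert>

int AxisNhwcToNchw(int nhwc_axis) {
  // same table ConvertAxisFromNHWC2NCHW encodes with a switch
  static const int kMap[4] = {0 /*N->N*/, 2 /*H->H*/, 3 /*W->W*/, 1 /*C->C*/};
  return (nhwc_axis >= 0 && nhwc_axis <= 3) ? kMap[nhwc_axis] : nhwc_axis;
}

int main() {
  // e.g. an axis of 3 (channels in NHWC) becomes 1 in NCHW, which is the
  // adjustment SoftMaxTensorRT and ConcateTensorRT apply when their input is
  // kept in NCHW.
  assert(AxisNhwcToNchw(0) == 0);  // N
  assert(AxisNhwcToNchw(1) == 2);  // H
  assert(AxisNhwcToNchw(2) == 3);  // W
  assert(AxisNhwcToNchw(3) == 1);  // C
  return 0;
}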

View File

@ -23,6 +23,10 @@
#include "schema/ops_generated.h"
#include "nnacl/pack.h"
#define kNCHW_N 0
#define kNCHW_C 1
#define kNCHW_H 2
#define kNCHW_W 3
namespace mindspore::lite {
struct ActivationParams {
nvinfer1::ActivationType activation_type;
@ -61,5 +65,9 @@ nvinfer1::Weights TransposeWeight(const mindspore::MSTensor &ms_tensor, float **
nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor);
void SetCudaDevice(std::shared_ptr<GPUDeviceInfo> device_info_);
Format GetOutputFormat(Format input_format, nvinfer1::Permutation perm);
int ConvertAxisFromNHWC2NCHW(int nhwc_axis);
} // namespace mindspore::lite
#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_