From ed8e877f02ab3e810963bfc18b58b4940057045f Mon Sep 17 00:00:00 2001 From: jpc_chenjianping Date: Mon, 4 Jul 2022 20:24:26 +0800 Subject: [PATCH] copy tensor code to runtime --- mindspore/lite/src/CMakeLists.txt | 4 +- .../delegate/tensorrt/tensorrt_subgraph.cc | 4 + .../parameter_cache/cache_algorithm.h | 43 ++ .../delegate/parameter_cache/cache_mem_base.h | 41 + .../parameter_cache/embedding_cache.cc | 237 ++++++ .../parameter_cache/embedding_cache.h | 89 +++ .../embedding_cache_manager.cc | 194 +++++ .../parameter_cache/embedding_cache_manager.h | 60 ++ .../parameter_cache/factory_mgr_base.h | 81 ++ .../parameter_cache/gpu/gpu_cache_mem.cc | 158 ++++ .../parameter_cache/gpu/gpu_cache_mem.h | 48 ++ .../delegate/parameter_cache/lfu_cache.cc | 243 ++++++ .../delegate/parameter_cache/lfu_cache.h | 55 ++ .../parameter_cache/load_host_cache_model.cc | 148 ++++ .../parameter_cache/load_host_cache_model.h | 48 ++ .../runtime/delegate/tensorrt/CMakeLists.txt | 95 +++ .../delegate/tensorrt/cuda_impl/activation.cu | 56 ++ .../tensorrt/cuda_impl/activation.cuh | 26 + .../delegate/tensorrt/cuda_impl/cast.cu | 49 ++ .../delegate/tensorrt/cuda_impl/cast.cuh | 23 + .../tensorrt/cuda_impl/cublas_utils.cc | 70 ++ .../tensorrt/cuda_impl/cublas_utils.h | 62 ++ .../tensorrt/cuda_impl/cuda_helper.cc | 48 ++ .../delegate/tensorrt/cuda_impl/cuda_helper.h | 63 ++ .../tensorrt/cuda_impl/cudnn_utils.cc | 41 + .../delegate/tensorrt/cuda_impl/cudnn_utils.h | 48 ++ .../delegate/tensorrt/cuda_impl/equal.cu | 35 + .../delegate/tensorrt/cuda_impl/equal.cuh | 23 + .../delegate/tensorrt/cuda_impl/hash.cu | 64 ++ .../delegate/tensorrt/cuda_impl/hash.cuh | 27 + .../delegate/tensorrt/cuda_impl/logical.cu | 63 ++ .../delegate/tensorrt/cuda_impl/logical.cuh | 29 + .../delegate/tensorrt/cuda_impl/normalize.cu | 98 +++ .../delegate/tensorrt/cuda_impl/normalize.cuh | 24 + .../delegate/tensorrt/cuda_impl/utils.cuh | 41 + .../distribution/distribution_base.cc | 23 + .../tensorrt/distribution/distribution_base.h | 31 + .../distribution/distribution_base_impl.cc | 28 + .../distribution/distribution_collective.cc | 38 + .../distribution/distribution_collective.h | 45 ++ .../distribution_collective_impl.cc | 72 ++ .../distribution/distribution_utils.cc | 58 ++ .../distribution/distribution_utils.h | 32 + .../tensorrt/op/activation_opt_plugin.cc | 116 +++ .../tensorrt/op/activation_opt_plugin.h | 72 ++ .../tensorrt/op/activation_tensorrt.cc | 153 ++++ .../tensorrt/op/activation_tensorrt.h | 43 ++ .../tensorrt/op/allgather_tensorrt.cc | 113 +++ .../delegate/tensorrt/op/allgather_tensorrt.h | 75 ++ .../delegate/tensorrt/op/cast_plugin.cc | 83 ++ .../delegate/tensorrt/op/cast_plugin.h | 67 ++ .../delegate/tensorrt/op/cast_tensorrt.cc | 79 ++ .../delegate/tensorrt/op/cast_tensorrt.h | 43 ++ .../delegate/tensorrt/op/concate_tensorrt.cc | 158 ++++ .../delegate/tensorrt/op/concate_tensorrt.h | 50 ++ .../tensorrt/op/convolution_tensorrt.cc | 187 +++++ .../tensorrt/op/convolution_tensorrt.h | 43 ++ .../tensorrt/op/deconvolution_tensorrt.cc | 199 +++++ .../tensorrt/op/deconvolution_tensorrt.h | 43 ++ .../tensorrt/op/elementwise_tensorrt.cc | 312 ++++++++ .../tensorrt/op/elementwise_tensorrt.h | 50 ++ .../delegate/tensorrt/op/equal_tensorrt.cc | 96 +++ .../delegate/tensorrt/op/equal_tensorrt.h | 63 ++ .../tensorrt/op/fullyconnected_tensorrt.cc | 106 +++ .../tensorrt/op/fullyconnected_tensorrt.h | 45 ++ .../delegate/tensorrt/op/gather_d_tensorrt.cc | 139 ++++ .../delegate/tensorrt/op/gather_d_tensorrt.h | 80 ++ 
.../delegate/tensorrt/op/gather_tensorrt.cc | 108 +++ .../delegate/tensorrt/op/gather_tensorrt.h | 42 + .../tensorrt/op/logical_not_tensorrt.cc | 119 +++ .../tensorrt/op/logical_not_tensorrt.h | 78 ++ .../delegate/tensorrt/op/logical_tensorrt.cc | 129 ++++ .../delegate/tensorrt/op/logical_tensorrt.h | 78 ++ .../delegate/tensorrt/op/lstm_tensorrt.cc | 493 ++++++++++++ .../delegate/tensorrt/op/lstm_tensorrt.h | 115 +++ .../delegate/tensorrt/op/matmul_opt_plugin.cc | 202 +++++ .../delegate/tensorrt/op/matmul_opt_plugin.h | 80 ++ .../delegate/tensorrt/op/matmul_tensorrt.cc | 310 ++++++++ .../delegate/tensorrt/op/matmul_tensorrt.h | 62 ++ .../tensorrt/op/normalize_opt_plugin.cc | 59 ++ .../tensorrt/op/normalize_opt_plugin.h | 61 ++ .../tensorrt/op/normalize_tensorrt.cc | 178 +++++ .../delegate/tensorrt/op/normalize_tensorrt.h | 56 ++ .../delegate/tensorrt/op/pad_tensorrt.cc | 140 ++++ .../delegate/tensorrt/op/pad_tensorrt.h | 42 + .../delegate/tensorrt/op/pool_tensorrt.cc | 220 ++++++ .../delegate/tensorrt/op/pool_tensorrt.h | 55 ++ .../delegate/tensorrt/op/prelu_tensorrt.cc | 79 ++ .../delegate/tensorrt/op/prelu_tensorrt.h | 39 + .../delegate/tensorrt/op/reduce_tensorrt.cc | 139 ++++ .../delegate/tensorrt/op/reduce_tensorrt.h | 44 ++ .../tensorrt/op/reducescatter_tensorrt.cc | 126 +++ .../tensorrt/op/reducescatter_tensorrt.h | 83 ++ .../delegate/tensorrt/op/resize_tensorrt.cc | 230 ++++++ .../delegate/tensorrt/op/resize_tensorrt.h | 52 ++ .../delegate/tensorrt/op/scale_tensorrt.cc | 227 ++++++ .../delegate/tensorrt/op/scale_tensorrt.h | 57 ++ .../tensorrt/op/scatternd_tensorrt.cc | 99 +++ .../delegate/tensorrt/op/scatternd_tensorrt.h | 39 + .../delegate/tensorrt/op/shape_tensorrt.cc | 69 ++ .../delegate/tensorrt/op/shape_tensorrt.h | 38 + .../delegate/tensorrt/op/shuffle_tensorrt.cc | 437 +++++++++++ .../delegate/tensorrt/op/shuffle_tensorrt.h | 58 ++ .../delegate/tensorrt/op/slice_tensorrt.cc | 281 +++++++ .../delegate/tensorrt/op/slice_tensorrt.h | 66 ++ .../delegate/tensorrt/op/softmax_tensorrt.cc | 95 +++ .../delegate/tensorrt/op/softmax_tensorrt.h | 43 ++ .../delegate/tensorrt/op/split_tensorrt.cc | 160 ++++ .../delegate/tensorrt/op/split_tensorrt.h | 45 ++ .../delegate/tensorrt/op/tensorrt_op.cc | 132 ++++ .../delegate/tensorrt/op/tensorrt_op.h | 175 +++++ .../delegate/tensorrt/op/tensorrt_plugin.cc | 81 ++ .../delegate/tensorrt/op/tensorrt_plugin.h | 106 +++ .../delegate/tensorrt/op/tile_tensorrt.cc | 183 +++++ .../delegate/tensorrt/op/tile_tensorrt.h | 94 +++ .../delegate/tensorrt/op/topk_tensorrt.cc | 160 ++++ .../delegate/tensorrt/op/topk_tensorrt.h | 49 ++ .../delegate/tensorrt/op/unary_tensorrt.cc | 84 ++ .../delegate/tensorrt/op/unary_tensorrt.h | 56 ++ .../delegate/tensorrt/tensorrt_allocator.cc | 150 ++++ .../delegate/tensorrt/tensorrt_allocator.h | 64 ++ .../delegate/tensorrt/tensorrt_context.cc | 56 ++ .../delegate/tensorrt/tensorrt_context.h | 40 + .../delegate/tensorrt/tensorrt_delegate.cc | 243 ++++++ .../delegate/tensorrt/tensorrt_delegate.h | 70 ++ .../delegate/tensorrt/tensorrt_runtime.cc | 52 ++ .../delegate/tensorrt/tensorrt_runtime.h | 82 ++ .../delegate/tensorrt/tensorrt_serializer.cc | 63 ++ .../delegate/tensorrt/tensorrt_serializer.h | 45 ++ .../delegate/tensorrt/tensorrt_subgraph.cc | 681 +++++++++++++++++ .../delegate/tensorrt/tensorrt_subgraph.h | 159 ++++ .../delegate/tensorrt/tensorrt_utils.cc | 721 ++++++++++++++++++ .../delegate/tensorrt/tensorrt_utils.h | 184 +++++ 133 files changed, 14041 insertions(+), 2 deletions(-) create mode 100644 
mindspore/lite/src/runtime/delegate/parameter_cache/cache_algorithm.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/cache_mem_base.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/factory_mgr_base.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.h create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.cc create mode 100644 mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/CMakeLists.txt create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cuh create mode 100755 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cu create mode 100755 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cu create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/utils.cuh create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base_impl.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective_impl.cc create mode 100644 
mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h create mode 100644 
mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.cc create mode 100644 
mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.h create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.cc create mode 100644 mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.h diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index 679383b93bf..bb6f1942f2c 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -498,14 +498,14 @@ if(SUPPORT_TENSORRT) set(CUDA_LIB_PATH ${CUDA_PATH}/lib64) include_directories(${TENSORRT_PATH}/include) include_directories(${CUDA_PATH}/include) - add_subdirectory(extendrt/delegate/tensorrt) + add_subdirectory(runtime/delegate/tensorrt) endif() target_link_libraries(mindspore-lite tensorrt_kernel_mid cuda_kernel_mid gpu_distribution_collective) target_link_libraries(mindspore-lite_static tensorrt_kernel_mid cuda_kernel_mid gpu_distribution_collective) else() if(NOT MSLITE_ENABLE_CLOUD_FUSION_INFERENCE) set(TENSORRT_STUB - ${CMAKE_CURRENT_SOURCE_DIR}/extendrt/delegate/tensorrt/distribution/distribution_base.cc + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/delegate/tensorrt/distribution/distribution_base.cc ) add_library(tensorrt_stub OBJECT ${TENSORRT_STUB}) endif() diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc index dc78b17383b..a085955c6dc 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_subgraph.cc @@ -381,6 +381,10 @@ int TensorRTSubGraph::Prepare() { return RET_ERROR; } int binding_num = this->engine_->getNbBindings(); + if (binding_num < 0) { + MS_LOG(ERROR) << "invalid binding_num " << binding_num; + return RET_ERROR; + } tensor_bindings_ = new (std::nothrow) void *[binding_num]; if (tensor_bindings_ == nullptr) { MS_LOG(ERROR) << "malloc tensor binding array failed."; diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/cache_algorithm.h b/mindspore/lite/src/runtime/delegate/parameter_cache/cache_algorithm.h new file mode 100644 index 00000000000..c496b76b947 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/cache_algorithm.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_
+
+#include <vector>
+#include "include/api/status.h"
+
+namespace mindspore {
+namespace cache {
+struct CacheNoe {
+  CacheNoe(int _index, int _frequency, int _value) : key(_index), frequency(_frequency), value(_value) {}
+  int key;  // host input index
+  int frequency;
+  int value;  // cache index
+};
+
+class CacheAlgorithm {
+ public:
+  virtual ~CacheAlgorithm() {}
+  virtual int Get(int key) = 0;
+  virtual void Put(int key, int value) = 0;
+  virtual Status Init(size_t cache_size, int min_host_index, int max_host_index) = 0;
+  virtual Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index,
+                               std::vector<int> *need_swap_indies, std::vector<int> *need_swap_indies_cache_index) = 0;
+};
+}  // namespace cache
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_ALGORITHM_H_
diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/cache_mem_base.h b/mindspore/lite/src/runtime/delegate/parameter_cache/cache_mem_base.h
new file mode 100644
index 00000000000..8844e787404
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/parameter_cache/cache_mem_base.h
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_
+#include <cstddef>
+#include <cstdint>
+
+namespace mindspore {
+namespace cache {
+class CacheMemBase {
+ public:
+  CacheMemBase() = default;
+  virtual ~CacheMemBase() = default;
+  virtual bool InitDevice(uint32_t device_id, const void *context) = 0;
+  virtual void *MallocMemory(size_t size) = 0;
+  virtual void FreeMemory(void *buf) = 0;
+  virtual bool SynchronizeStream() = 0;
+  virtual bool CopyHostMemToDevice(void *dst, const void *src, size_t size) = 0;
+  virtual bool CopyDeviceMemToHost(void *dst, const void *src, size_t size) = 0;
+  virtual bool HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr,
+                           size_t cache_vocab_size, size_t embedding_size, size_t swap_out_size) = 0;
+  virtual bool HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr,
+                          size_t cache_vocab_size, size_t embedding_size, size_t swap_in_size) = 0;
+};
+}  // namespace cache
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_CACHE_MEM_BASE_H_
diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.cc
new file mode 100644
index 00000000000..10222514736
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.cc
@@ -0,0 +1,237 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "src/runtime/delegate/parameter_cache/embedding_cache.h" +#include +#include +#include +#include +#include +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" +#include "src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h" +#include "src/runtime/delegate/parameter_cache/lfu_cache.h" +#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h" + +namespace { +constexpr size_t kEmbeddingTensorShapeSize = 2; +} +namespace mindspore { +namespace cache { +void LookUpTableTask(size_t indices_lens, size_t first_dim_size, const char *input_addr, const int *indices_addr, + char *output_addr, size_t embedding_len, int min_host_index) { + for (size_t i = 0; i < indices_lens; ++i) { + int index = indices_addr[i] - min_host_index; + if (index >= 0 && index < static_cast(first_dim_size)) { + size_t pos = index * embedding_len; + std::memcpy(output_addr, input_addr + pos, embedding_len); + } else { + memset(output_addr, 0, embedding_len); + } + output_addr += embedding_len; + } +} + +EmbeddingCache::~EmbeddingCache() { + if (hash_swap_value_device_addr_ != nullptr) { + device_cache_->FreeMemory(hash_swap_value_device_addr_); + hash_swap_value_device_addr_ = nullptr; + } + if (hash_swap_value_addr_ != nullptr) { + free(hash_swap_value_addr_); + hash_swap_value_addr_ = nullptr; + } + if (hash_swap_index_addr_ != nullptr) { + device_cache_->FreeMemory(hash_swap_index_addr_); + hash_swap_index_addr_ = nullptr; + } +} + +Status EmbeddingCache::Init(mindspore::MSTensor host_cache_tensor, mindspore::MSTensor device_tensor) { + MS_ASSERT(device_tensor.Shape().size() == kEmbeddingTensorShapeSize); + MS_ASSERT(host_cache_tensor.Shape().size() == kEmbeddingTensorShapeSize); + MS_ASSERT(device_tensor.DataType() == host_cache_tensor.DataType()); + MS_ASSERT(host_cache_tensor.Data() != nullptr); + + if (device_tensor.Shape()[1] != host_cache_tensor.Shape()[1]) { + MS_LOG(ERROR) << device_tensor.Name() << " embedding_size is invalid, device size is " << device_tensor.Shape()[1] + << ", host size is " << host_cache_tensor.Shape()[1]; + return kLiteError; + } + if (host_cache_size_ != host_cache_tensor.Shape()[0]) { + MS_LOG(ERROR) << device_tensor.Name() << " host_cache_size is invalid, host_cache_size" + << host_cache_tensor.Shape()[0] << ", index begin:" << min_host_index_ + << ", index end:" << max_host_index_ << "rank_group_size_ num:" << rank_group_size_ + << ", rank id:" << rank_id_ << ", vocab_size_:" << vocab_size_; + return kLiteError; + } + + data_type_ = device_tensor.DataType(); + switch (data_type_) { + case DataType::kNumberTypeFloat32: + sizeof_data_type_ = sizeof(float); + break; + default: + MS_LOG(ERROR) << device_tensor.Name() << " unsupported data type " << static_cast(data_type_); + return kLiteError; + } + host_addr_ = host_cache_tensor.MutableData(); + embedding_size_ = device_tensor.Shape()[1]; + device_start_index_ = device_cache_size_ * rank_id_; + // host cache tensor is device tensor + if (device_tensor.Shape()[0] == host_cache_tensor.Shape()[0]) { + device_start_index_ = min_host_index_; + } + return kSuccess; +} + +Status EmbeddingCache::MallocCacheMemory() { + auto hash_swap_value_size = embedding_size_ * batch_elements_ * sizeof_data_type_; + hash_swap_value_device_addr_ = device_cache_->MallocMemory(hash_swap_value_size); + if (hash_swap_value_device_addr_ == nullptr) { + MS_LOG(ERROR) << "malloc hash_swap_value_device failed, malloc size " << hash_swap_value_size; + return kLiteMemoryFailed; + } + + hash_swap_value_addr_ = 
malloc(hash_swap_value_size); + if (hash_swap_value_addr_ == nullptr) { + MS_LOG(ERROR) << "malloc hash_swap_value failed, malloc size " << hash_swap_value_size; + return kLiteMemoryFailed; + } + + // data type of index + hash_swap_index_addr_ = static_cast(device_cache_->MallocMemory(batch_elements_ * sizeof(int))); + if (hash_swap_index_addr_ == nullptr) { + MS_LOG(ERROR) << "malloc hash_swap_index failed, malloc size " << batch_elements_ * sizeof(int); + return kLiteMemoryFailed; + } + return kSuccess; +} + +Status EmbeddingCache::Init(uint32_t device_id, const void *context, mindspore::MSTensor host_cache_tensor, + mindspore::MSTensor device_tensor) { + auto ret = Init(host_cache_tensor, device_tensor); + if (ret != kSuccess) { + return ret; + } + cache_ = lite::FactoryManagerBase::Instance().GetProduct("lfu"); + if (cache_ == nullptr) { + MS_LOG(ERROR) << "malloc LFUCacheAlgorithm failed"; + return kLiteMemoryFailed; + } + ret = cache_->Init(device_cache_size_, min_host_index_, max_host_index_); + if (ret != kSuccess) { + MS_LOG(ERROR) << "init cache failed," << ret.CodeAsString; + return kLiteError; + } + + device_cache_ = lite::FactoryManagerBase::Instance().GetProduct("gpu"); + if (device_cache_ == nullptr) { + MS_LOG(ERROR) << "get cache failed"; + return kLiteMemoryFailed; + } + if (!device_cache_->InitDevice(device_id, context)) { + MS_LOG(ERROR) << "init device failed"; + return kLiteError; + } + ret = MallocCacheMemory(); + if (ret != kSuccess) { + return ret; + } + + MS_LOG(INFO) << "init succ, rank_group_size_ num:" << rank_group_size_ << ", rank id:" << rank_id_ + << ", vocab_size_:" << vocab_size_ << ", host_cache_size_:" << host_cache_size_ + << ", device_cache_size_:" << device_cache_size_ << ", embedding_size_:" << embedding_size_ + << ", batch_elements_:" << batch_elements_ << ", index begin:" << min_host_index_ + << ", index end:" << max_host_index_; + return kSuccess; +} + +Status EmbeddingCache::SetHostCacheAddr(void *addr, size_t size) { + if (sizeof_data_type_ * host_cache_size_ * embedding_size_ != size) { + return kLiteParamInvalid; + } + host_addr_ = addr; + + // copy part of host mem to device + auto ret = + device_cache_->CopyHostMemToDevice(device_addr_, addr, sizeof_data_type_ * device_cache_size_ * embedding_size_); + if (!ret) { + MS_LOG(ERROR) << "CopyHostMemToDevice failed, copy size " + << sizeof_data_type_ * device_cache_size_ * embedding_size_; + return kLiteMemoryFailed; + } + + // init cache + auto index_num = device_cache_size_; + for (size_t i = 0; i < index_num; i++) { + cache_->Put(min_host_index_ + i, i); + } + + return kSuccess; +} + +Status EmbeddingCache::SetDeviceCacheAddr(void *device_mem_addr, size_t size) { + if (sizeof_data_type_ * device_cache_size_ * embedding_size_ != size) { + return kLiteParamInvalid; + } + + device_addr_ = device_mem_addr; + SetHostCacheAddr(host_addr_, sizeof_data_type_ * host_cache_size_ * embedding_size_); + + return kSuccess; +} + +Status EmbeddingCache::CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index) { + std::vector need_swap_indies; + std::vector need_swap_indies_cache_index; + auto ret = + cache_->CheckCacheHit(batch_ids, batch_ids_len, cache_index, &need_swap_indies, &need_swap_indies_cache_index); + if (ret != kSuccess) { + MS_LOG(ERROR) << "CheckCacheHit failed"; + return ret; + } + auto swap_indices_size = need_swap_indies.size(); + if (swap_indices_size > 0) { + LookUpTableTask(swap_indices_size, host_cache_size_, static_cast(host_addr_), need_swap_indies.data(), + 
static_cast(hash_swap_value_addr_), embedding_size_ * sizeof_data_type_, min_host_index_); + + auto device_cache_ret = device_cache_->CopyHostMemToDevice(hash_swap_value_device_addr_, hash_swap_value_addr_, + swap_indices_size * embedding_size_ * sizeof_data_type_); + if (!device_cache_ret) { + MS_LOG(ERROR) << "copy swap value to device failed"; + return kLiteMemoryFailed; + } + + device_cache_ret = device_cache_->CopyHostMemToDevice(hash_swap_index_addr_, need_swap_indies_cache_index.data(), + swap_indices_size * sizeof(int)); + if (!device_cache_ret) { + MS_LOG(ERROR) << "copy swap indies to device failed"; + return kLiteMemoryFailed; + } + + device_cache_ret = device_cache_->HashSwapIn(device_addr_, hash_swap_value_device_addr_, hash_swap_index_addr_, + device_cache_size_, embedding_size_, swap_indices_size); + if (!device_cache_ret) { + MS_LOG(ERROR) << "HashSwapIn failed"; + return kLiteMemoryFailed; + } + } + + return kSuccess; +} +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.h b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.h new file mode 100644 index 00000000000..4dab859cd52 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache.h @@ -0,0 +1,89 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
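Note for reviewers: the CheckCacheHit path above gathers the rows for cache misses on the host (LookUpTableTask), copies them to the staging buffer on the device, and only then calls HashSwapIn. The following is a minimal standalone sketch of that host-side gather step; the function and parameter names are illustrative (not from this patch) and sizes are in elements rather than the byte-based embedding_len used by LookUpTableTask.

#include <cstring>
#include <vector>

// Gather embedding rows for ids that missed the device cache into a staging
// buffer; ids outside [min_index, max_index) are zero-filled, mirroring the
// range check in LookUpTableTask.
void GatherRows(const std::vector<float> &host_table, size_t embedding_dim,
                const std::vector<int> &miss_ids, int min_index, int max_index,
                std::vector<float> *staging) {
  staging->resize(miss_ids.size() * embedding_dim);
  for (size_t i = 0; i < miss_ids.size(); ++i) {
    float *dst = staging->data() + i * embedding_dim;
    if (miss_ids[i] >= min_index && miss_ids[i] < max_index) {
      size_t local = static_cast<size_t>(miss_ids[i] - min_index);
      std::memcpy(dst, host_table.data() + local * embedding_dim, embedding_dim * sizeof(float));
    } else {
      std::memset(dst, 0, embedding_dim * sizeof(float));
    }
  }
}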
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_ +#include +#include +#include +#include "include/api/status.h" +#include "include/api/data_type.h" +#include "src/common/log_adapter.h" +#include "src/runtime/delegate/parameter_cache/cache_algorithm.h" +#include "src/runtime/delegate/parameter_cache/cache_mem_base.h" + +namespace mindspore { +namespace cache { +class EmbeddingCache { + public: + EmbeddingCache(size_t vocab_size, size_t device_cache_size, size_t batch_elements, int rank_id, int rank_group_size) + : vocab_size_(vocab_size), + device_cache_size_(device_cache_size), + batch_elements_(batch_elements), + rank_id_(rank_id), + rank_group_size_(rank_group_size) { + MS_ASSERT(rank_group_size_ != 0); + auto local_shard_size = static_cast(std::ceil(static_cast(vocab_size_) / rank_group_size_)); + min_host_index_ = local_shard_size * rank_id_; + max_host_index_ = std::min(min_host_index_ + local_shard_size, static_cast(vocab_size_)); + host_cache_size_ = max_host_index_ - min_host_index_; + + MS_LOG(INFO) << "rank_group_size_ num:" << rank_group_size_ << ", rank id:" << rank_id_ + << ", vocab_size_:" << vocab_size_ << ", host_cache_size_:" << host_cache_size_ + << ", index begin:" << min_host_index_ << ", index end:" << max_host_index_; + } + + ~EmbeddingCache(); + Status Init(uint32_t device_id, const void *context, mindspore::MSTensor host_cache_tensor, + mindspore::MSTensor device_tensor); + Status SetHostCacheAddr(void *addr, size_t size); + Status SetDeviceCacheAddr(void *host_mem_addr, size_t size); + Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *hash_index); + size_t GetDeviceStartIndex() { return device_start_index_; } + + private: + Status Init(mindspore::MSTensor host_cache_tensor, mindspore::MSTensor device_tensor); + Status MallocCacheMemory(); + + private: + std::shared_ptr device_cache_{nullptr}; + std::shared_ptr cache_{nullptr}; + + size_t vocab_size_{0}; // total size + size_t host_cache_size_{0}; // local host size + size_t device_cache_size_{0}; // local device cache size + size_t device_start_index_{0}; + size_t embedding_size_{0}; + size_t batch_elements_{0}; + + DataType data_type_{DataType::kNumberTypeFloat32}; + size_t sizeof_data_type_{0}; + + void *device_addr_{nullptr}; // hash_info.device_address.addr + void *host_addr_{nullptr}; + + int *hash_swap_index_addr_; // embedding_device_cache_->hash_swap_index_addr_ + void *hash_swap_value_addr_; + void *hash_swap_value_device_addr_; + + int rank_id_; + int rank_group_size_; + int min_host_index_{0}; + int max_host_index_{0}; +}; +} // namespace cache +} // namespace mindspore +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_H_ diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.cc new file mode 100644 index 00000000000..4d48521c917 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.cc @@ -0,0 +1,194 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "src/runtime/delegate/parameter_cache/embedding_cache_manager.h" +#include +#include +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" + +namespace { +constexpr size_t kGatherInputsSize = 3; +} +namespace mindspore { +namespace cache { +Status EmbeddingCacheManager::Init(const std::string &cache_model_path, size_t vocab_size, size_t device_cache_size) { + if (cache_model_path.empty() || vocab_size == 0 || device_cache_size >= vocab_size) { + MS_LOG(INFO) << "no cache model , vocab_size " << vocab_size << ", device_cache_size " << device_cache_size; + return kSuccess; + } + + host_cache_model_ = std::make_shared(); + if (host_cache_model_ == nullptr) { + MS_LOG(ERROR) << "HostCacheModel malloc failed"; + return kLiteMemoryFailed; + } + auto ret = host_cache_model_->LoadCache(cache_model_path); + if (ret != kSuccess) { + MS_LOG(ERROR) << "load cache failed"; + return ret; + } + vocab_size_ = vocab_size; + device_cache_size_ = device_cache_size; + + MS_LOG(INFO) << "cache manager init succ, cache model" << cache_model_path << " , vocab_size " << vocab_size + << ", device_cache_size " << device_cache_size; + return ret; +} + +Status EmbeddingCacheManager::Init(DelegateModel *model, size_t vocab_size, + size_t device_cache_size) { + if (model == nullptr || vocab_size == 0 || device_cache_size >= vocab_size) { + MS_LOG(INFO) << "no cache model , vocab_size " << vocab_size << ", device_cache_size " << device_cache_size; + return kSuccess; + } + + host_cache_model_ = std::make_shared(); + if (host_cache_model_ == nullptr) { + MS_LOG(ERROR) << "HostCacheModel malloc failed"; + return kLiteMemoryFailed; + } + auto ret = host_cache_model_->LoadCache(model); + if (ret != kSuccess) { + MS_LOG(ERROR) << "load cache failed"; + return ret; + } + vocab_size_ = vocab_size; + device_cache_size_ = device_cache_size; + + MS_LOG(INFO) << "cache manager init succ, vocab_size " << vocab_size << ", device_cache_size " << device_cache_size; + return ret; +} + +bool EmbeddingCacheManager::CheckIsCacheKernel(kernel::Kernel *kernel) { + if (host_cache_model_ == nullptr) { + return false; + } + return host_cache_model_->CheckIsCacheKernel(kernel); +} + +Status EmbeddingCacheManager::InitCacheKernel(kernel::Kernel *kernel, uint32_t device_id, const void *context) { + if (host_cache_model_ == nullptr) { + MS_LOG(ERROR) << "cache model is nullptr, kernel " << kernel->name() << " init cache failed"; + return kLiteError; + } + auto host_cache_tensor = host_cache_model_->GetHostCacheTensor(kernel); + if (host_cache_tensor == nullptr) { + MS_LOG(ERROR) << kernel->name() << ": invalid cache kernel"; + return kLiteError; + } + + // only support embedding cache + if (kernel->type() != schema::PrimitiveType_Gather) { + MS_LOG(ERROR) << kernel->name() << " is not embedding kernel"; + return kLiteError; + } + MS_ASSERT(kernel->inputs().size() == kGatherInputsSize); + auto device_tensor = kernel->inputs()[0]; + size_t batch_elements = kernel->inputs()[1].ElementNum(); + auto cache = + std::make_shared(vocab_size_, device_cache_size_, batch_elements, rank_id_, 
rank_group_size_); + if (cache == nullptr) { + MS_LOG(ERROR) << kernel->name() << ": malloc EmbeddingCache failed"; + return kLiteError; + } + + auto ret = cache->Init(device_id, context, host_cache_tensor, device_tensor); + if (ret != kSuccess) { + MS_LOG(ERROR) << kernel->name() << ": EmbeddingCache init failed"; + return kLiteError; + } + + caches_[device_tensor.Name()] = cache; + MS_LOG(INFO) << kernel->name() << " is cache kernel, input tensor " << kernel->inputs()[1].Name() << ", cache tensor " + << device_tensor.Name(); + + return kSuccess; +} + +bool EmbeddingCacheManager::IsCacheTensor(mindspore::MSTensor tensor) { + if (host_cache_model_ == nullptr) { + return false; + } + auto cache = caches_.find(tensor.Name()); + if (cache != caches_.end()) { + return true; + } + return false; +} + +std::vector EmbeddingCacheManager::GetCacheShape(mindspore::MSTensor tensor) { + std::vector shape = tensor.Shape(); + if (shape.size() > 0 && IsCacheTensor(tensor)) { + shape[0] = device_cache_size_; + } + return shape; +} + +size_t EmbeddingCacheManager::GetCacheDataSize(mindspore::MSTensor tensor) { + auto data_size = tensor.DataSize(); + auto &shape = tensor.Shape(); + if (shape.size() > 0 && IsCacheTensor(tensor) && shape[0] > 0) { + data_size = data_size * device_cache_size_ / shape[0]; + } + return data_size; +} + +Status EmbeddingCacheManager::SetDeviceCacheAddr(const std::string &tensor_name, void *device_mem_addr, size_t size) { + auto cache_iter = caches_.find(tensor_name); + if (cache_iter == caches_.end() || cache_iter->second == nullptr) { + MS_LOG(ERROR) << "not find cache, " << tensor_name; + return kLiteError; + } + auto cache = cache_iter->second; + return cache->SetDeviceCacheAddr(device_mem_addr, size); +} + +// device_addr is model input device addr +int EmbeddingCacheManager::CacheHandle(const std::string &tensor_name, mindspore::MSTensor model_input_tensor, + void *model_input_device_addr) { + auto cache_iter = caches_.find(tensor_name); + if (cache_iter == caches_.end()) { + MS_LOG(ERROR) << "not find cache, " << tensor_name; + return lite::RET_ERROR; + } + auto cache = cache_iter->second; + hash_indices_.resize(model_input_tensor.ElementNum()); + auto ret = cache->CheckCacheHit(static_cast(model_input_tensor.MutableData()), hash_indices_.size(), + hash_indices_.data()); + if (ret != kSuccess) { + MS_LOG(ERROR) << "CheckCacheHit failed, " << model_input_tensor.Name(); + return lite::RET_ERROR; + } + + for (size_t i = 0; i < hash_indices_.size(); i++) { + if (hash_indices_[i] != -1) { + hash_indices_[i] += cache->GetDeviceStartIndex(); + } + } + + auto cuda_ret = cudaMemcpy(model_input_device_addr, hash_indices_.data(), hash_indices_.size() * sizeof(int), + cudaMemcpyHostToDevice); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "copy mem failed, " << model_input_tensor.Name(); + return lite::RET_ERROR; + } + MS_LOG(INFO) << "cache handle succ, " << model_input_tensor.Name() << "," << tensor_name; + + return lite::RET_OK; +} +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.h b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.h new file mode 100644 index 00000000000..2c8e2b47a64 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/embedding_cache_manager.h @@ -0,0 +1,60 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_ +#include +#include +#include +#include +#include "include/api/kernel.h" +#include "include/api/status.h" +#include "include/api/data_type.h" +#include "src/runtime/delegate/parameter_cache/embedding_cache.h" +#include "src/runtime/delegate/parameter_cache/load_host_cache_model.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" + +namespace mindspore { +namespace cache { +class EmbeddingCacheManager { + public: + EmbeddingCacheManager() { + rank_id_ = lite::GetRankID(); + rank_group_size_ = lite::GetGPUGroupSize(); + } + Status Init(const std::string &cache_model_path, size_t vocab_size, size_t device_cache_size); + Status Init(DelegateModel *model, size_t vocab_size, size_t device_cache_size); + bool CheckIsCacheKernel(kernel::Kernel *kernel); + Status InitCacheKernel(kernel::Kernel *kernel, uint32_t device_id, const void *context); + bool IsCacheTensor(mindspore::MSTensor tensor); + int CacheHandle(const std::string &tensor_name, mindspore::MSTensor model_input_tensor, void *device_addr); + Status SetDeviceCacheAddr(const std::string &tensor_name, void *device_mem_addr, size_t size); + std::vector GetCacheShape(mindspore::MSTensor tensor); + size_t GetCacheDataSize(mindspore::MSTensor tensor); + + private: + std::map> caches_; + std::vector hash_indices_; + int rank_id_{0}; + int rank_group_size_{1}; + + std::shared_ptr host_cache_model_; + size_t vocab_size_; + size_t device_cache_size_; +}; +} // namespace cache +} // namespace mindspore +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_EMBEDDING_CACHE_MANAGER_H_ diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/factory_mgr_base.h b/mindspore/lite/src/runtime/delegate/parameter_cache/factory_mgr_base.h new file mode 100644 index 00000000000..cb0049f5f1f --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/factory_mgr_base.h @@ -0,0 +1,81 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_ +#include +#include +#include "include/api/status.h" + +namespace mindspore { +namespace lite { +template +class ProcductRegistrar { + public: + virtual std::shared_ptr Create() = 0; + + protected: + ProcductRegistrar() {} + virtual ~ProcductRegistrar() {} + + private: + ProcductRegistrar(const ProcductRegistrar &); + const ProcductRegistrar &operator=(const ProcductRegistrar &); +}; + +template +class FactoryManagerBase { + public: + static FactoryManagerBase &Instance() { + static FactoryManagerBase instance; + return instance; + } + void RegProduct(const KEY &key, ProcductRegistrar *registrar) { registrars[key] = registrar; } + + std::shared_ptr GetProduct(const KEY &key) { + auto registrar_iter = registrars.find(key); + if (registrar_iter != registrars.end()) { + if (registrar_iter->second != nullptr) { + return registrar_iter->second->Create(); + } + } + return nullptr; + } + + private: + FactoryManagerBase() = default; + ~FactoryManagerBase() = default; + FactoryManagerBase(const FactoryManagerBase &); + const FactoryManagerBase &operator=(const FactoryManagerBase &); + + private: + std::map *> registrars; +}; + +template +class CommonProcductRegistrar : public ProcductRegistrar { + public: + explicit CommonProcductRegistrar(const KEY &key) { + FactoryManagerBase::Instance().RegProduct(key, this); + } + std::shared_ptr Create() { return std::make_shared(); } +}; + +#define RET_COMMON_PRODUCT_REGISTRAR(KEY, PRODUCT, PRODUCT_IMPL, key, name) \ + static mindspore::lite::CommonProcductRegistrar g_commonProcductRegistrar##name(key); +} // namespace lite +} // namespace mindspore +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_FACTORY_MGR_BASE_H_ diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.cc new file mode 100644 index 00000000000..c285b844e40 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.cc @@ -0,0 +1,158 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
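Note for reviewers: factory_mgr_base.h above implements a keyed self-registration factory. RET_COMMON_PRODUCT_REGISTRAR defines a static CommonProcductRegistrar that registers a creator under a string key, and FactoryManagerBase::GetProduct builds instances by key (this is how "lfu" and "gpu" are resolved elsewhere in the patch). Below is a compact standalone sketch of the same pattern; all names in it are illustrative and do not appear in the patch.

#include <iostream>
#include <map>
#include <memory>
#include <string>

// Base product and one concrete product used only for this illustration.
struct CacheBase {
  virtual ~CacheBase() = default;
  virtual const char *Name() const = 0;
};
struct LfuLikeCache : CacheBase {
  const char *Name() const override { return "lfu-like"; }
};

// Singleton factory keyed by Key, returning shared_ptr<Base>, analogous to
// FactoryManagerBase in the patch.
template <class Key, class Base>
class SimpleFactory {
 public:
  static SimpleFactory &Instance() {
    static SimpleFactory instance;
    return instance;
  }
  void Register(const Key &key, std::shared_ptr<Base> (*creator)()) { creators_[key] = creator; }
  std::shared_ptr<Base> Create(const Key &key) {
    auto it = creators_.find(key);
    return it == creators_.end() ? nullptr : it->second();
  }

 private:
  std::map<Key, std::shared_ptr<Base> (*)()> creators_;
};

// Static registration at namespace scope, the same idea as the
// RET_COMMON_PRODUCT_REGISTRAR macro expanding to a static registrar object.
static const bool g_registered = [] {
  SimpleFactory<std::string, CacheBase>::Instance().Register(
      "lfu-like", [] { return std::shared_ptr<CacheBase>(std::make_shared<LfuLikeCache>()); });
  return true;
}();

int main() {
  auto cache = SimpleFactory<std::string, CacheBase>::Instance().Create("lfu-like");
  std::cout << (cache ? cache->Name() : "not found") << std::endl;
  return 0;
}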
+ */ + +#include "src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h" +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/hash.cuh" +#include "plugin/device/gpu/hal/device/cuda_driver.h" +#include "src/common/log_adapter.h" +#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h" +namespace mindspore { +namespace cache { +namespace gpu { +RET_COMMON_PRODUCT_REGISTRAR(std::string, cache::CacheMemBase, cache::gpu::GPUCacheMem, "gpu", GPUCacheMem); +bool GPUCacheMem::InitDevice(uint32_t device_id, const void *context) { + auto cuda_ret = cudaSetDevice(static_cast(device_id)); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Failed to set device id " << device_id << ", cuda_ret " << cuda_ret << " " + << cudaGetErrorString(cuda_ret); + return false; + } + if (context != nullptr) { + stream_ = *(reinterpret_cast(context)); + return true; + } + + cuda_ret = cudaStreamCreate(&stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda create stream failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + return false; + } + + return true; +} + +void *GPUCacheMem::MallocMemory(size_t size) { + void *device_ptr = nullptr; + auto cuda_ret = cudaMalloc(&device_ptr, size); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda Malloc failed for size:" << size << ", cuda_ret " << cuda_ret << " " + << cudaGetErrorString(cuda_ret); + return nullptr; + } + MS_LOG(DEBUG) << "cudaMalloc size: " << size; + return device_ptr; +} + +void GPUCacheMem::FreeMemory(void *device_addr) { + auto cuda_ret = cudaFree(device_addr); + if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) { + MS_LOG(WARNING) << "free cuda memory failed, " + << ", cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + } +} + +bool GPUCacheMem::SynchronizeStream() { + auto cuda_ret = cudaStreamSynchronize(stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda sync stream failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + return false; + } + + return true; +} + +bool GPUCacheMem::CopyHostMemToDevice(void *dst, const void *src, size_t size) { + if (dst == nullptr) { + MS_LOG(ERROR) << "dst is nullptr"; + return false; + } + if (src == nullptr) { + MS_LOG(ERROR) << "src is nullptr"; + return false; + } + + auto cuda_ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda memcpy failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + return false; + } + + return true; +} + +bool GPUCacheMem::CopyDeviceMemToHost(void *dst, const void *src, size_t size) { + if (dst == nullptr) { + MS_LOG(ERROR) << "dst is nullptr"; + return false; + } + if (src == nullptr) { + MS_LOG(ERROR) << "src is nullptr"; + return false; + } + + auto cuda_ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda memcpy failed, cuda_ret " << cuda_ret << " " << cudaGetErrorString(cuda_ret); + return false; + } + + return true; +} + +bool GPUCacheMem::HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr, size_t, + size_t embedding_size, size_t swap_out_size) { + if (hash_table_addr == nullptr) { + MS_LOG(ERROR) << "hash_table_addr is nullptr"; + return false; + } + if (swap_out_value_addr == nullptr) { + MS_LOG(ERROR) << "swap_out_value_addr is nullptr"; + return false; + } + if (swap_out_index_addr == nullptr) { + MS_LOG(ERROR) << "swap_out_index_addr is 
nullptr"; + return false; + } + + DoHashSwapOut(reinterpret_cast(hash_table_addr), reinterpret_cast(swap_out_value_addr), + reinterpret_cast(swap_out_index_addr), swap_out_size, embedding_size, stream_); + return true; +} + +bool GPUCacheMem::HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr, size_t, + size_t embedding_size, size_t swap_in_size) { + if (hash_table_addr == nullptr) { + MS_LOG(ERROR) << "hash_table_addr is nullptr"; + return false; + } + if (swap_in_value_addr == nullptr) { + MS_LOG(ERROR) << "swap_in_value_addr is nullptr"; + return false; + } + if (swap_in_index_addr == nullptr) { + MS_LOG(ERROR) << "swap_in_index_addr is nullptr"; + return false; + } + + DoHashSwapIn(reinterpret_cast(hash_table_addr), reinterpret_cast(swap_in_value_addr), + reinterpret_cast(swap_in_index_addr), swap_in_size, embedding_size, stream_); + return true; +} +} // namespace gpu +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h b/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h new file mode 100644 index 00000000000..f6196d95711 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/gpu/gpu_cache_mem.h @@ -0,0 +1,48 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_
+
+#include <cuda_runtime_api.h>
+#include <cstdint>
+#include "src/runtime/delegate/parameter_cache/cache_mem_base.h"
+
+namespace mindspore {
+namespace cache {
+namespace gpu {
+class GPUCacheMem : public cache::CacheMemBase {
+ public:
+  GPUCacheMem() = default;
+  ~GPUCacheMem() override = default;
+  bool InitDevice(uint32_t device_id, const void *context) override;
+  void *MallocMemory(size_t size) override;
+  void FreeMemory(void *buf) override;
+  bool SynchronizeStream() override;
+  bool CopyHostMemToDevice(void *dst, const void *src, size_t size) override;
+  bool CopyDeviceMemToHost(void *dst, const void *src, size_t size) override;
+  bool HashSwapOut(void *hash_table_addr, void *swap_out_value_addr, void *swap_out_index_addr,
+                   size_t cache_vocab_size, size_t embedding_size, size_t swap_out_size) override;
+  bool HashSwapIn(void *hash_table_addr, void *swap_in_value_addr, void *swap_in_index_addr,
+                  size_t cache_vocab_size, size_t embedding_size, size_t swap_in_size) override;
+
+ private:
+  cudaStream_t stream_;
+};
+}  // namespace gpu
+}  // namespace cache
+}  // namespace mindspore
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_GPU_GPU_CACHE_MEM_H_
diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.cc
new file mode 100644
index 00000000000..bde17d6f54c
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.cc
@@ -0,0 +1,243 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include +#include +#include "src/common/log_adapter.h" +#include "src/runtime/delegate/parameter_cache/lfu_cache.h" +#include "src/runtime/delegate/parameter_cache/factory_mgr_base.h" +namespace mindspore { +namespace cache { +RET_COMMON_PRODUCT_REGISTRAR(std::string, cache::CacheAlgorithm, cache::LFUCacheAlgorithm, "lfu", LFUCacheAlgorithm); + +LFUCacheAlgorithm::~LFUCacheAlgorithm() { + for (auto iter : key_table_) { + delete *(iter.second); + } + key_table_.clear(); + frequency_table_.clear(); +} + +Status LFUCacheAlgorithm::Init(size_t cache_size, int min_host_index, int max_host_index) { + if (cache_size <= 0 || min_host_index < 0 || max_host_index <= 0) { + return kLiteParamInvalid; + } + cache_size_ = cache_size; + min_host_index_ = min_host_index; + max_host_index_ = max_host_index; + return kSuccess; +} + +CacheNoe *LFUCacheAlgorithm::GetNode(int key) { + auto key_table_iter = key_table_.find(key); + if (key_table_iter == key_table_.end()) { + return nullptr; + } + auto node_iter = key_table_iter->second; + auto node = *node_iter; + + auto node_list_iter = frequency_table_.find(node->frequency); + if (node_list_iter == frequency_table_.end()) { + return nullptr; + } + auto &node_list = node_list_iter->second; + node_list.erase(node_iter); + + if (node_list.empty()) { + frequency_table_.erase(node_list_iter); + } + + node->frequency += 1; + frequency_table_[node->frequency].emplace_front(node); + key_table_[key] = frequency_table_[node->frequency].begin(); + return node; +} + +int LFUCacheAlgorithm::Get(int key) { + auto node = GetNode(key); + if (node != nullptr) { + return node->value; + } + return -1; +} + +void LFUCacheAlgorithm::Put(int key, int value) { + auto node = GetNode(key); + if (node != nullptr) { + node->value = value; + return; + } + + if (cache_size_ == 0) { + return; + } + + CacheNoe *add_node = nullptr; + if (key_table_.size() == cache_size_) { + add_node = frequency_table_.begin()->second.back(); + key_table_.erase(add_node->key); + frequency_table_.begin()->second.pop_back(); + if (frequency_table_.begin()->second.size() == 0) { + frequency_table_.erase(frequency_table_.begin()->first); + } + add_node->value = value; + add_node->key = key; + add_node->frequency = 1; + } else { + add_node = new CacheNoe(key, 1, value); + if (add_node == nullptr) { + return; + } + } + + frequency_table_[1].emplace_front(add_node); + key_table_[key] = frequency_table_[1].begin(); +} + +void LFUCacheAlgorithm::GetHitNodesAndSwapIndex(const int *batch_ids, const size_t batch_ids_len, int *cache_index, + std::unordered_map<int, CacheNoe *> *hit_index_nodes, + std::unordered_map<int, std::vector<int>> *need_swap_map) { + // split the batch into hit ids and ids that still need to be swapped in + for (size_t i = 0; i < batch_ids_len; i++) { + auto key = batch_ids[i]; + if (key < min_host_index_ || key >= max_host_index_) { + cache_index[i] = -1; + // out of range + continue; + } + + auto hit_iter = hit_index_nodes->find(key); + if (hit_iter != hit_index_nodes->end()) { + auto node = hit_iter->second; + node->frequency += 1; + cache_index[i] = node->value; + continue; + } + + auto swap_iter = need_swap_map->find(key); + if (swap_iter != need_swap_map->end()) { + swap_iter->second.push_back(i); + continue; + } + + auto node_iter_iter = key_table_.find(key); + if (node_iter_iter == key_table_.end()) { + (*need_swap_map)[key].push_back(i); + continue; + } + auto node_iter = node_iter_iter->second; + auto node = *node_iter; + + auto node_list_iter = frequency_table_.find(node->frequency); + if (node_list_iter == frequency_table_.end()) { + continue; + } + auto &node_list =
node_list_iter->second; + node_list.erase(node_iter); + + if (node_list.empty()) { + frequency_table_.erase(node_list_iter); + } + // hit + node->frequency += 1; + cache_index[i] = node->value; + (*hit_index_nodes)[key] = node; + } + return; +} + +std::list<CacheNoe *> LFUCacheAlgorithm::GetSwapNodes(const std::unordered_map<int, std::vector<int>> &need_swap_map) { + std::list<CacheNoe *> need_swap_nodes; + auto swap_size = need_swap_map.size(); + + while (swap_size > 0 && !frequency_table_.empty()) { + auto node_list_iter = frequency_table_.begin(); + if (node_list_iter->second.size() > swap_size) { + auto iter = node_list_iter->second.begin(); + std::advance(iter, swap_size); + need_swap_nodes.splice(need_swap_nodes.end(), node_list_iter->second, node_list_iter->second.begin(), iter); + swap_size = 0; + } else { + swap_size -= node_list_iter->second.size(); + need_swap_nodes.splice(need_swap_nodes.end(), node_list_iter->second); + frequency_table_.erase(node_list_iter); + } + } + return need_swap_nodes; +} + +Status LFUCacheAlgorithm::CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index, + std::vector<int> *need_swap_indies, + std::vector<int> *need_swap_indies_cache_index) { + if (batch_ids == nullptr) { + MS_LOG(ERROR) << "batch_ids is nullptr"; + return kLiteNullptr; + } + if (cache_index == nullptr) { + MS_LOG(ERROR) << "cache_index is nullptr"; + return kLiteNullptr; + } + std::unordered_map<int, std::vector<int>> need_swap_map; + std::unordered_map<int, CacheNoe *> hit_index_nodes; + GetHitNodesAndSwapIndex(batch_ids, batch_ids_len, cache_index, &hit_index_nodes, &need_swap_map); + + // take the need_swap_map.size() least frequently used nodes as eviction candidates + std::list<CacheNoe *> need_swap_nodes = GetSwapNodes(need_swap_map); + + // rewrite the evicted nodes with the newly swapped-in host keys + { + if (need_swap_map.size() != need_swap_nodes.size()) { + MS_LOG(ERROR) << " need_swap_map.size() " << need_swap_map.size() << " != need_swap_nodes.size() " + << need_swap_nodes.size(); + return kLiteError; + } + need_swap_indies_cache_index->reserve(need_swap_map.size()); + auto need_swap_map_iter = need_swap_map.begin(); + for (auto iter = need_swap_nodes.begin(); + iter != need_swap_nodes.end() && need_swap_map_iter != need_swap_map.end(); iter++, need_swap_map_iter++) { + auto node = *iter; + key_table_.erase(node->key); + node->key = need_swap_map_iter->first; + node->frequency = 1; + for (auto index : need_swap_map_iter->second) { + cache_index[index] = node->value; + } + need_swap_indies->push_back(need_swap_map_iter->first); + need_swap_indies_cache_index->push_back(node->value); + MS_LOG(INFO) << "device index " << node->value << ", for host index " << need_swap_map_iter->first; + key_table_[(*iter)->key] = iter; + } + + auto node_list_iter = frequency_table_.begin(); + if (node_list_iter->second.size() > 0) { + auto iter = node_list_iter->second.begin(); + if ((*iter)->frequency == 1) { + node_list_iter->second.splice(node_list_iter->second.begin(), need_swap_nodes); + } else { + frequency_table_[1] = need_swap_nodes; + } + } else { + frequency_table_[1] = need_swap_nodes; + } + } + for (auto node_iter : hit_index_nodes) { + auto node = node_iter.second; + frequency_table_[node->frequency].emplace_front(node); + key_table_[node->key] = frequency_table_[node->frequency].begin(); + } + return kSuccess; +} +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.h b/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.h new file mode 100644 index 00000000000..3704a98415c --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/lfu_cache.h @@
-0,0 +1,55 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_ + +#include +#include +#include +#include +#include "include/api/status.h" +#include "src/runtime/delegate/parameter_cache/cache_algorithm.h" +namespace mindspore { +namespace cache { +class LFUCacheAlgorithm : public CacheAlgorithm { + public: + LFUCacheAlgorithm() {} + ~LFUCacheAlgorithm() override; + + int Get(int key) override; + void Put(int key, int value) override; + Status Init(size_t cache_size, int min_host_index, int max_host_index) override; + Status CheckCacheHit(const int *batch_ids, const size_t batch_ids_len, int *cache_index, + std::vector *need_swap_indies, std::vector *need_swap_indies_cache_index) override; + + private: + CacheNoe *GetNode(int key); + void GetHitNodesAndSwapIndex(const int *batch_ids, const size_t batch_ids_len, int *cache_index, + std::unordered_map *hit_index_nodes, + std::unordered_map> *need_swap_map); + std::list GetSwapNodes(const std::unordered_map> &need_swap_map); + + std::unordered_map::iterator> key_table_; + std::map> frequency_table_; + size_t cache_size_{0}; + + int min_host_index_{0}; + int max_host_index_{1}; +}; +} // namespace cache +} // namespace mindspore +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LFU_CACHE_H_ diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.cc b/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.cc new file mode 100644 index 00000000000..839d8e60e28 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.cc @@ -0,0 +1,148 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
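A hypothetical driver showing the intended call order of the LFU interface declared above: Init() sizes the device-side cache, and CheckCacheHit() maps a batch of host ids to device cache slots while reporting which host rows still have to be swapped in. The function name and the numeric arguments below are assumptions for illustration.

#include <vector>
#include "src/runtime/delegate/parameter_cache/lfu_cache.h"

mindspore::Status LookupSketch(const std::vector<int> &batch_ids) {
  mindspore::cache::LFUCacheAlgorithm lfu;
  // cache_size, min_host_index and max_host_index are illustrative values
  auto ret = lfu.Init(1024, 0, 100000);
  if (ret != mindspore::kSuccess) {
    return ret;
  }
  std::vector<int> cache_index(batch_ids.size(), -1);  // device slot per id, -1 for out-of-range ids
  std::vector<int> swap_in_host_ids;                   // host rows that missed the cache
  std::vector<int> swap_in_cache_slots;                // device slots those rows will occupy
  return lfu.CheckCacheHit(batch_ids.data(), batch_ids.size(), cache_index.data(),
                           &swap_in_host_ids, &swap_in_cache_slots);
}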
+ */ + +#include +#include +#include +#include +#include "src/runtime/delegate/parameter_cache/load_host_cache_model.h" +#include "src/common/log_adapter.h" +#include "src/common/common.h" +#include "include/errorcode.h" +#include "src/common/file_utils.h" + +namespace { +constexpr size_t kGatherInputsSize = 3; +} +namespace mindspore { +namespace cache { +HostCacheModel::~HostCacheModel() { + if (cache_model_ != nullptr) { + delete cache_model_; + cache_model_ = nullptr; + } +} +MSTensor *SchemaTensorToMSTensor(lite::SchemaTensorWrapper *schema_tensor_wrapper, + mindspore::schema::Tensor *schema_tensor) { + std::vector shape; + for (size_t j = 0; j < schema_tensor->dims()->size(); j++) { + shape.push_back(schema_tensor->dims()->data()[j]); + } + std::string tensor_name; + if (schema_tensor->name() != nullptr) { + tensor_name = schema_tensor->name()->str(); + } + return MSTensor::CreateRefTensor(tensor_name, (DataType)schema_tensor->dataType(), shape, + schema_tensor_wrapper->data(), schema_tensor_wrapper->length()); +} + +Status HostCacheModel::LoadCache(const std::string &model_path) { + cache_model_ = lite::LiteImportFromPath(model_path.c_str()); + if (cache_model_ == nullptr) { + MS_LOG(ERROR) << "Import model failed"; + return kLiteGraphFileError; + } + + auto allTensors = cache_model_->graph_.all_tensors_; + for (auto node : cache_model_->graph_.all_nodes_) { + // only support embedding cache + if (node == nullptr || node->node_type_ != schema::PrimitiveType_Gather) { + continue; + } + + auto input_index = node->input_indices_[0]; + if (input_index > allTensors.size() - 1) { + MS_LOG(ERROR) << "invalid kernel input, input_index " << input_index << ",allTensors.size() " + << allTensors.size(); + return kLiteOutOfTensorRange; + } + auto schema_tensor_wrapper = cache_model_->GetSchemaTensor(input_index); + if (schema_tensor_wrapper == nullptr) { + MS_LOG(ERROR) << "invalid kernel input, input_index " << input_index; + return kLiteOutOfTensorRange; + } + + auto schema_tensor = allTensors[input_index]; + if (schema_tensor != nullptr && schema_tensor_wrapper->data() != nullptr) { + auto tensor = SchemaTensorToMSTensor(schema_tensor_wrapper, schema_tensor); + if (tensor == nullptr) { + return kLiteMemoryFailed; + } + cache_tensor_[tensor->Name()] = *tensor; + MS_LOG(INFO) << tensor->Name() << " is cache tensor, and the node is [" << node->name_ << "]"; + delete tensor; + } + } + return kSuccess; +} + +size_t GetVocabSize(kernel::Kernel *kernel) { + size_t vocab_size = 0; + auto cache_config = kernel->GetConfig(lite::kMSCache); + auto vocab_size_iter = cache_config.find(lite::kMSCacheVocabSize); + if (vocab_size_iter == cache_config.end()) { + return vocab_size; + } + + auto vocab_size_opt = lite::GenericParseValue(vocab_size_iter->second); + if (!vocab_size_opt.IsNone()) { + vocab_size = vocab_size_opt.Get(); + } + return vocab_size; +} + +Status HostCacheModel::LoadCache(DelegateModel *model) { + KernelIter from, end; + for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) { + kernel::Kernel *kernel = *iter; + // only support embedding cache + if (kernel->type() != schema::PrimitiveType_Gather) { + continue; + } + MS_ASSERT(kernel->inputs().size() == kGatherInputsSize); + auto tensor = kernel->inputs()[0]; + if (tensor.Data() == nullptr) { + continue; + } + + size_t vocab_size = GetVocabSize(kernel); + if (vocab_size == 0) { + continue; + } + + cache_tensor_[tensor.Name()] = tensor; + } + return mindspore::kSuccess; +} + +bool 
HostCacheModel::CheckIsCacheKernel(kernel::Kernel *kernel) { + if (GetHostCacheTensor(kernel) == nullptr) { + return false; + } + return true; +} + +MSTensor HostCacheModel::GetHostCacheTensor(kernel::Kernel *kernel) { + if (kernel != nullptr && kernel->inputs().size() > 0) { + auto iter = cache_tensor_.find(kernel->inputs()[0].Name()); + if (iter != cache_tensor_.end()) { + return iter->second; + } + } + return MSTensor(nullptr); +} +} // namespace cache +} // namespace mindspore diff --git a/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.h b/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.h new file mode 100644 index 00000000000..52b22eea0d0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/parameter_cache/load_host_cache_model.h @@ -0,0 +1,48 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LOAD_HOST_CACHE_MODEL_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_PARAMETER_CACHE_LOAD_HOST_CACHE_MODEL_H_ + +#include +#include +#include "include/api/status.h" +#include "include/api/data_type.h" +#include "include/api/types.h" +#include "include/api/kernel.h" +#include "include/api/delegate.h" +#include "src/runtime/lite_model.h" + +namespace mindspore { +namespace cache { +class HostCacheModel { + public: + HostCacheModel() = default; + ~HostCacheModel(); + Status LoadCache(const std::string &model_path); + Status LoadCache(DelegateModel *model); + bool CheckIsCacheKernel(kernel::Kernel *kernel); + MSTensor GetHostCacheTensor(kernel::Kernel *kernel); + + private: + std::map cache_tensor_; + mindspore::lite::LiteModel *cache_model_{nullptr}; + char *model_buf_{nullptr}; + size_t model_size_; +}; +} // namespace cache +} // namespace mindspore +#endif // MINDSPORE_LITE_EMBEDDING_CACHE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/CMakeLists.txt b/mindspore/lite/src/runtime/delegate/tensorrt/CMakeLists.txt new file mode 100644 index 00000000000..3e5d613b9da --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/CMakeLists.txt @@ -0,0 +1,95 @@ +include_directories(${TENSORRT_PATH}/include) +include_directories(${CUDA_PATH}/include) +include_directories(${CUDA_PATH}) +include_directories($(CCSRC_DIR)/plugin/device/cpu/kernel) +include_directories(${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops) + +if(DEFINED ENV{MS_ENABLE_CUDA_DISTRIBUTION}) + set(MS_ENABLE_CUDA_DISTRIBUTION $ENV{MS_ENABLE_CUDA_DISTRIBUTION}) +else() + set(MS_ENABLE_CUDA_DISTRIBUTION "off") +endif() + +set(NCCL_MPI_SRC_STUB + ${CMAKE_CURRENT_SOURCE_DIR}/distribution/distribution_collective.cc + ${CMAKE_CURRENT_SOURCE_DIR}/distribution/distribution_base.cc +) + +# nccl mpi +if(MS_ENABLE_CUDA_DISTRIBUTION STREQUAL "on") + message("enable cuda gpu distribution collective") + file(GLOB NCCL_MPI_SRC LIST_DIRECTORIES false + ${CMAKE_CURRENT_SOURCE_DIR}/distribution/*.cc + 
${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/collective_wrapper.cc + ${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/mpi_wrapper.cc + ${CCSRC_DIR}/plugin/device/gpu/hal/device/distribution/nccl_wrapper.cc + ) + list(REMOVE_ITEM NCCL_MPI_SRC ${NCCL_MPI_SRC_STUB}) + + add_compile_definitions(LITE_CUDA_DISTRIBUTION) + include(${TOP_DIR}/cmake/external_libs/ompi.cmake) + include(${TOP_DIR}/cmake/external_libs/nccl.cmake) + + add_library(gpu_distribution_collective OBJECT ${NCCL_MPI_SRC}) + add_library(mindspore::nccl ALIAS nccl::nccl) + add_library(mindspore::ompi ALIAS ompi::mpi) + target_link_libraries(gpu_distribution_collective PRIVATE mindspore::ompi mindspore::nccl) +else() + add_library(gpu_distribution_collective OBJECT ${NCCL_MPI_SRC_STUB}) +endif() +add_dependencies(gpu_distribution_collective fbs_src) + +file(GLOB TENSORRT_RUNTIME_SRC LIST_DIRECTORIES false + ${CMAKE_CURRENT_SOURCE_DIR}/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/op/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_impl/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime/delegate/delegate_utils.cc + ${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_device_info.cc +) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache) + +set(TENSORRT_RUNTIME_SRC + ${TENSORRT_RUNTIME_SRC} + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/embedding_cache_manager.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/load_host_cache_model.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/lfu_cache.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/embedding_cache.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../parameter_cache/gpu/gpu_cache_mem.cc + ) + +link_libraries(${CUDA_LIB_PATH}/libcudnn.so) +link_libraries(${CUDA_LIB_PATH}/libnvrtc.so) +link_libraries(${CUDA_LIB_PATH}/libcublasLt.so) + +add_library(libcudart SHARED IMPORTED) +set_target_properties(libcudart PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcudart.so) + +add_library(libnvinfer SHARED IMPORTED) +set_target_properties(libnvinfer PROPERTIES IMPORTED_LOCATION ${TENSORRT_LIB_PATH}/libnvinfer.so) + +add_library(libcublas SHARED IMPORTED) +set_target_properties(libcublas PROPERTIES IMPORTED_LOCATION ${CUDA_LIB_PATH}/libcublas.so) +add_library(tensorrt_kernel_mid OBJECT ${TENSORRT_RUNTIME_SRC}) + +add_dependencies(tensorrt_kernel_mid fbs_src) + +target_link_libraries( + tensorrt_kernel_mid + libcudart + libcublas + libnvinfer +) + +# cuda +find_package(CUDA) +file(GLOB_RECURSE CUDA_KERNEL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_impl/*.cu + ${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cu + ${CCSRC_DIR}/plugin/device/gpu/kernel/cuda_impl/cuda_ops/swish_impl.cu +) + +set_source_files_properties(${CUDA_KERNEL_SRC} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGES} -std=c++14 -fPIC") +SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++14;) +cuda_add_library(cuda_kernel_mid STATIC ${CUDA_KERNEL_SRC}) diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cu new file mode 100644 index 00000000000..ce412e6fcb4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cu @@ -0,0 +1,56 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/activation.cuh" +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +template +__global__ void SigmoidKernel(const T *input1, T *output, int element_cnt) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) { + output[pos] = static_cast(1) / (static_cast(1) + exp(-input1[pos])); + } +} + +template +__global__ void GeluKernel(const T *input_addr, T *output_addr, int size) { + // formula: + // gelu(x) = 0.5 * x * (1.0 + tanh(y)) + // tanh(y) = 2 / (1 + exp(-2y)) - 1) + // y = sqrt(2/pi) * (x + 0.044715 * x^3) + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + float x = input_addr[pos]; + float tanh_res = tanh(0.7978845608f * (x + 0.044715f * x * x * x)); + output_addr[pos] = 0.5f * x * (1.0f + tanh_res); + } +} + +template +void Sigmoid(const T *input1, T *output, int element_cnt, cudaStream_t stream) { + SigmoidKernel<<>>(input1, output, element_cnt); + return; +} + +template +void Gelu(const T *input1, T *output, int element_cnt, cudaStream_t stream) { + GeluKernel<<>>(input1, output, element_cnt); + return; +} + +template void Sigmoid(const float *input1, float *output, int element_cnt, cudaStream_t stream); + +template void Gelu(const float *input1, float *output, int element_cnt, cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cuh new file mode 100644 index 00000000000..81d187674bd --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/activation.cuh @@ -0,0 +1,26 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
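A scalar CPU reference of the tanh-based GELU approximation used by GeluKernel above, handy for spot-checking kernel output; the helper name is illustrative and not part of the patch.

#include <cmath>

// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)));
// 0.7978845608f is sqrt(2/pi), matching the constant in GeluKernel.
float GeluReference(float x) {
  const float kSqrt2OverPi = 0.7978845608f;
  float y = kSqrt2OverPi * (x + 0.044715f * x * x * x);
  return 0.5f * x * (1.0f + std::tanh(y));
}
// Example: GeluReference(1.0f) is roughly 0.841, GeluReference(-1.0f) roughly -0.159.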
+ */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_ACTIVATION_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_ACTIVATION_H_ + +template +void Sigmoid(const T *input1, T *output, int element_cnt, cudaStream_t stream); + +template +void Gelu(const T *input1, T *output, int element_cnt, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_ACTIVATION_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cu new file mode 100644 index 00000000000..a1e90b16d48 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cu @@ -0,0 +1,49 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh" +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +// Generic cast +template +__device__ __forceinline__ void CastBase(const S *input_addr, T *output_addr) { + *output_addr = static_cast((*input_addr)); +} + +template +__global__ void CastKernel(const int input_size, const S *input_addr, T *output_addr) { + for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < input_size; pos += blockDim.x * gridDim.x) { + CastBase(input_addr + pos, output_addr + pos); + } +} + +template +void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream) { + CastKernel<<>>(input_size, input_addr, output_addr); +} + +template void Cast(const int input_size, const int8_t *input_addr, int8_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int8_t *input_addr, int32_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int8_t *input_addr, float *output_addr, cudaStream_t stream); + +template void Cast(const int input_size, const int32_t *input_addr, int8_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int32_t *input_addr, int32_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int32_t *input_addr, float *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const int32_t *input_addr, bool *output_addr, cudaStream_t stream); + +template void Cast(const int input_size, const float *input_addr, int8_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const float *input_addr, int32_t *output_addr, cudaStream_t stream); +template void Cast(const int input_size, const float *input_addr, float *output_addr, cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cuh new file mode 100644 index 00000000000..59d7ab82793 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cast.cuh @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_CAST_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_CAST_H_ + +template +void Cast(const int input_size, const S *input_addr, T *output_addr, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_CAST_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.cc b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.cc new file mode 100644 index 00000000000..cd50b470ef4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.cc @@ -0,0 +1,70 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h" + +namespace mindspore::lite { +void Cublas2DTranspose(const float *in_addr, float *out_addr, const int *params, cublasHandle_t cublas_handle) { + const int m = params[0]; + const int n = params[1]; + const float alpha = 1.0f; + const float beta = 0.0f; + CUBLAS_CHECK_VOID( + cublasSgeam(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, &alpha, in_addr, n, &beta, out_addr, m, out_addr, m)); +} + +void CublasMM1Batch(const void *a_addr, const void *b_addr, void *c_addr, const int *params, + const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle) { + const int m = params[0]; + const int n = params[1]; + const int k = params[2]; + cublasOperation_t trans_a = operations[0]; + cublasOperation_t trans_b = operations[1]; + const int lda = (trans_a == CUBLAS_OP_N) ? k : m; + const int ldb = (trans_b == CUBLAS_OP_N) ? 
n : k; + const int ldc = n; + cudaDataType type_a = data_types[0]; + cudaDataType type_b = data_types[1]; + cudaDataType type_c = data_types[2]; + cudaDataType compute_type = data_types[3]; + const float alpha = 1.0f; + const float beta = 0.0f; + CUBLAS_CHECK_VOID(cublasGemmEx(cublas_handle, trans_b, trans_a, n, m, k, &alpha, b_addr, type_b, ldb, a_addr, type_a, + lda, &beta, c_addr, type_c, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} +void CublasMMBatched(void **a_addrs, void **b_addrs, void **c_addrs, const int *params, + const cublasOperation_t *operations, const cudaDataType *data_types, + cublasHandle_t cublas_handle) { + cublasOperation_t trans_a = operations[0]; + cublasOperation_t trans_b = operations[1]; + const int m = params[0]; + const int n = params[1]; + const int k = params[2]; + const int batch = params[3]; + const int lda = (trans_a == CUBLAS_OP_N) ? k : m; + const int ldb = (trans_b == CUBLAS_OP_N) ? n : k; + const int ldc = n; + cudaDataType type_a = data_types[0]; + cudaDataType type_b = data_types[1]; + cudaDataType type_c = data_types[2]; + cudaDataType compute_type = data_types[3]; + const float alpha = 1.0f; + const float beta = 0.0f; + CUBLAS_CHECK_VOID(cublasGemmBatchedEx(cublas_handle, trans_b, trans_a, n, m, k, &alpha, b_addrs, type_b, ldb, a_addrs, + type_a, lda, &beta, c_addrs, type_c, ldc, batch, compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h new file mode 100644 index 00000000000..4a7f4eb0576 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h @@ -0,0 +1,62 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_ + +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "src/common/log_util.h" + +// cublas API error checking +#define CUBLAS_CHECK_VOID(err) \ + do { \ + cublasStatus_t cublas_err = (err); \ + if (cublas_err != CUBLAS_STATUS_SUCCESS) { \ + MS_LOG(ERROR) << "cublas error " << cublas_err; \ + return; \ + } \ + } while (0) + +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t cublas_err = (err); \ + if (cublas_err != CUBLAS_STATUS_SUCCESS) { \ + MS_LOG(ERROR) << "cublas error " << cublas_err; \ + return -1; \ + } \ + } while (0) + +namespace mindspore::lite { +// a: m * n +// params order: m, n +void Cublas2DTranspose(const float *in_addr, float *out_addr, const int *params, cublasHandle_t cublas_handle); + +// a: m * k, b: k * n, c: m * n +// params order: m, n, k +// operations order: trans_a, trans_b +// data_types: type_a, type_b, type_c, compute type +void CublasMM1Batch(const void *a_addr, const void *b_addr, void *c_addr, const int *params, + const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle); + +// a: batch * m * k, b: batch * k * n, c: batch * m * n +// params order: m, n, k, batch +// operations order: trans_a, trans_b +// data_types: type_a, type_b, type_c, compute type +void CublasMMBatched(void **a_addrs, void **b_addrs, void **c_addrs, const int *params, + const cublasOperation_t *operations, const cudaDataType *data_types, cublasHandle_t cublas_handle); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUBLAS_UTILS_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.cc b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.cc new file mode 100644 index 00000000000..54f5738aeb8 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.cc @@ -0,0 +1,48 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
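An illustrative call of CublasMM1Batch declared above for a row-major float GEMM C = A * B with A of shape m x k and B of shape k x n; the wrapper hands the operands to cuBLAS in swapped order so the column-major GEMM yields a row-major C without an explicit transpose. The function name and the handle setup are assumptions.

#include <cublas_v2.h>
#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h"

void MatMulSketch(const float *a, const float *b, float *c, int m, int n, int k, cublasHandle_t handle) {
  const int params[] = {m, n, k};                              // m, n, k
  const cublasOperation_t ops[] = {CUBLAS_OP_N, CUBLAS_OP_N};  // trans_a, trans_b
  const cudaDataType data_types[] = {CUDA_R_32F, CUDA_R_32F,   // type_a, type_b
                                     CUDA_R_32F, CUDA_R_32F};  // type_c, compute type
  mindspore::lite::CublasMM1Batch(a, b, c, params, ops, data_types, handle);
}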
+ */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include +#include "src/common/log_util.h" + +CudaHelper &CudaHelper::GetInstance() { + static CudaHelper instance; + return instance; +} +int CudaHelper::GetThreadNum() const { return threads_per_block_; } +int CudaHelper::GetThreadNum(const int block_size) const { + return std::min(threads_per_block_, ((block_size - 1) / 32 + 1) * 32); +} +int CudaHelper::GetBlocksNum(const int total_threads) const { + return std::min(((total_threads - 1) / threads_per_block_) + 1, max_blocks_); +} +int CudaHelper::GetBlocksNum(const int total_threads, const int block_size) const { + int valid_block_size = std::min(block_size, threads_per_block_); + if (valid_block_size == 0) { + MS_LOG(ERROR) << "invalid input of block_size: " << block_size; + return 0; + } + return std::min(((total_threads - 1) / valid_block_size) + 1, max_blocks_); +} + +CudaHelper::CudaHelper() { + int device_id = 0; + (void)cudaGetDevice(&device_id); + cudaDeviceProp prop; + (void)cudaGetDeviceProperties(&prop, device_id); + threads_per_block_ = prop.maxThreadsPerBlock; + max_blocks_ = prop.multiProcessorCount; +} diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h new file mode 100644 index 00000000000..dc7cc93afa7 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h @@ -0,0 +1,63 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_ + +#include +#include + +class CudaHelper { + public: + int GetThreadNum() const; + int GetThreadNum(const int block_size) const; + int GetBlocksNum(const int total_threads) const; + int GetBlocksNum(const int total_threads, const int block_size) const; + static CudaHelper &GetInstance(); + + private: + CudaHelper(); + ~CudaHelper() = default; + CudaHelper(const CudaHelper &) = delete; + CudaHelper &operator=(const CudaHelper &) = delete; + + int max_blocks_; + int threads_per_block_; +}; + +#define GET_BLOCKS(total_threads) CudaHelper::GetInstance().GetBlocksNum(total_threads) +#define GET_BLOCKS_CAL(total_threads, block_size) CudaHelper::GetInstance().GetBlocksNum(total_threads, block_size) + +#define GET_THREADS CudaHelper::GetInstance().GetThreadNum() +#define GET_THREADS_CAL(block_size) CudaHelper::GetInstance().GetThreadNum(block_size) + +#define CUDA_CHECK(ret) \ + do { \ + cudaError_t cuda_ret = (ret); \ + if ((cuda_ret) != cudaSuccess) { \ + return -1; \ + } \ + } while (0) + +#define CUDA_CHECK_VOID(ret) \ + do { \ + cudaError_t cuda_ret = (ret); \ + if ((cuda_ret) != cudaSuccess) { \ + return; \ + } \ + } while (0) + +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDA_HELPER_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.cc b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.cc new file mode 100644 index 00000000000..1590560f697 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.cc @@ -0,0 +1,41 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h" +#include + +namespace mindspore::lite { +cudnnDataType_t ConvertCudnnDataType(nvinfer1::DataType trt_datatype) { + std::unordered_map data_types = {{nvinfer1::DataType::kFLOAT, CUDNN_DATA_FLOAT}, + {nvinfer1::DataType::kHALF, CUDNN_DATA_HALF}, + {nvinfer1::DataType::kINT32, CUDNN_DATA_INT32}, + {nvinfer1::DataType::kINT8, CUDNN_DATA_INT8}}; + if (data_types.find(trt_datatype) != data_types.end()) { + return data_types[trt_datatype]; + } else { + MS_LOG(ERROR) << "invalid datatype for cudnn: " << static_cast(trt_datatype); + } + return CUDNN_DATA_FLOAT; +} + +int CudnnActivation(cudnnHandle_t handle, cudnnActivationDescriptor_t activation_desc, + const cudnnTensorDescriptor_t x_dsc, const void *x, const cudnnTensorDescriptor_t y_dsc, void *y) { + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_CHECK(cudnnActivationForward(handle, activation_desc, &alpha, x_dsc, x, &beta, y_dsc, y)); + return 0; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h new file mode 100644 index 00000000000..d3202e05e00 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h @@ -0,0 +1,48 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_ + +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "src/common/log_util.h" + +#define CUDNN_CHECK_VOID(err) \ + do { \ + cudnnStatus_t cudnn_err = (err); \ + if (cudnn_err != CUDNN_STATUS_SUCCESS) { \ + MS_LOG(ERROR) << "cudnn error " << cudnnGetErrorString(cudnn_err); \ + return; \ + } \ + } while (0) + +#define CUDNN_CHECK(err) \ + do { \ + cudnnStatus_t cudnn_err = (err); \ + if (cudnn_err != CUDNN_STATUS_SUCCESS) { \ + MS_LOG(ERROR) << "cudnn error " << cudnnGetErrorString(cudnn_err); \ + return -1; \ + } \ + } while (0) +namespace mindspore::lite { +cudnnDataType_t ConvertCudnnDataType(nvinfer1::DataType trt_datatype); + +int CudnnActivation(cudnnHandle_t handle, cudnnActivationDescriptor_t activation_desc, + const cudnnTensorDescriptor_t x_esc, const void *x, const cudnnTensorDescriptor_t y_dsc, void *y); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_CUDA_IMPL_CUDNN_UTILS_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cu new file mode 100644 index 00000000000..7d4840e9fea --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cu @@ -0,0 +1,35 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/equal.cuh" +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +template +__global__ void EqualKernel(const T *input1, const T *input2, T *output, int element_cnt) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) { + output[pos] = (input1[pos] - input2[pos] < 1e-6 && input1[pos] - input2[pos] > -1e-6); + } +} + +template +void Equal(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) { + EqualKernel<<>>(input1, input2, output, element_cnt); + return; +} + +template void Equal(const float *input1, const float *input2, float *output, int element_cnt, cudaStream_t stream); +template void Equal(const int *input1, const int *input2, int *output, int element_cnt, cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cuh new file mode 100644 index 00000000000..69551308a97 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/equal.cuh @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_EQUAL_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_EQUAL_H_ + +template +void Equal(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_EQUAL_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cu new file mode 100755 index 00000000000..27c626bc5fe --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cu @@ -0,0 +1,64 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/hash.cuh" +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +template +__global__ void HashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size, + const int hash_dim) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < index_size; i += blockDim.x * gridDim.x) { + int hash_index = swap_out_index[i]; + for (int j = 0; j < hash_dim; j++) { + swap_out_value[i * hash_dim + j] = hash_table[hash_index * hash_dim + j]; + } + } + return; +} + +template +__global__ void HashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size, + const int hash_dim) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < index_size; i += blockDim.x * gridDim.x) { + int hash_index = swap_in_index[i]; + for (int j = 0; j < hash_dim; j++) { + hash_table[hash_index * hash_dim + j] = swap_in_value[i * hash_dim + j]; + } + } + return; +} + +template +void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size, + const int hash_dim, cudaStream_t cuda_stream) { + HashSwapOut<<>>(hash_table, swap_out_value, swap_out_index, + index_size, hash_dim); + return; +} + +template +void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size, + const int hash_dim, cudaStream_t cuda_stream) { + HashSwapIn<<>>(hash_table, swap_in_value, swap_in_index, + index_size, hash_dim); + return; +} + +template void DoHashSwapOut(const float *hash_table, float *swap_out_value, const int *swap_out_index, + const int index_size, const int hash_dim, cudaStream_t cuda_stream); + +template void DoHashSwapIn(float *hash_table, const float *swap_in_value, const int *swap_in_index, + const int index_size, const int hash_dim, cudaStream_t cuda_stream); diff --git 
a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cuh new file mode 100755 index 00000000000..779abba36b1 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/hash.cuh @@ -0,0 +1,27 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_HASH_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_HASH_H_ + +template +void DoHashSwapOut(const T *hash_table, T *swap_out_value, const int *swap_out_index, const int index_size, + const int hash_dim, cudaStream_t cuda_stream); + +template +void DoHashSwapIn(T *hash_table, const T *swap_in_value, const int *swap_in_index, const int index_size, + const int hash_dim, cudaStream_t cuda_stream); +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_HASH_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cu new file mode 100644 index 00000000000..7c28811db26 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cu @@ -0,0 +1,63 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh" +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" + +template +__global__ void LogicalNotKernel(const T *input1, T *output, int element_cnt) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < element_cnt; pos += blockDim.x * gridDim.x) { + output[pos] = static_cast(input1[pos] == 0); + } +} + +template +__global__ void LogicalAndKernel(const T *input_addr1, const T *input_addr2, T *output, int size) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + output[pos] = input_addr1[pos] * input_addr2[pos]; + } +} + +template +__global__ void LogicalOrKernel(const T *input_addr1, const T *input_addr2, T *output, int size) { + for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { + T sum = input_addr1[pos] + input_addr2[pos]; + output[pos] = static_cast(sum > 0); + } +} + +template +void LogicalNot(const T *input1, T *output, int element_cnt, cudaStream_t stream) { + LogicalNotKernel<<>>(input1, output, element_cnt); +} + +template +void LogicalAnd(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) { + LogicalAndKernel<<>>(input1, input2, output, element_cnt); +} + +template +void LogicalOr(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream) { + LogicalOrKernel<<>>(input1, input2, output, element_cnt); +} + +template void LogicalNot(const int32_t *input1, int32_t *output, int element_cnt, cudaStream_t stream); + +template void LogicalAnd(const int32_t *input1, const int32_t *input2, int32_t *output, int element_cnt, + cudaStream_t stream); + +template void LogicalOr(const int32_t *input1, const int32_t *input2, int32_t *output, int element_cnt, + cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cuh new file mode 100644 index 00000000000..e2a18187aab --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/logical.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_LOGICAL_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_LOGICAL_H_ + +template +void LogicalAnd(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream); + +template +void LogicalOr(const T *input1, const T *input2, T *output, int element_cnt, cudaStream_t stream); + +template +void LogicalNot(const T *input1, T *output, int element_cnt, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_LOGICAL_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cu b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cu new file mode 100644 index 00000000000..b8005a98334 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cu @@ -0,0 +1,98 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh" +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/utils.cuh" + +template +__global__ void NormalizeKernel(const T *input, const T *gamma, const T *beta, T *output, size_t n, float epsilion, + int dim_before_axis) { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int block_loop = (dim_before_axis - 1) / gridDim.x + 1; + const int element_cnt = dim_before_axis * n; + + __shared__ float s_mean[2048]; + __shared__ float s_variance[2048]; + float sum = 0.0f; + float variance = 0.0f; + + for (int block = 0; block < block_loop; block++) { + float local_sum = 0.0f; + int mean_index = bid + block * gridDim.x; + int num_index = bid * n + block * gridDim.x * blockDim.x; + for (int i = tid; i < n; i += blockDim.x) { + if (num_index + i >= element_cnt) { + break; + } + local_sum += static_cast(input[num_index + i]); + } + sum = blockReduceSum(local_sum); + if (tid == 0) { + s_mean[mean_index] = sum / n; + } + } + __syncthreads(); + + for (int block = 0; block < block_loop; block++) { + float local_var_sum = 0.0f; + int var_index = bid + block * gridDim.x; + int num_index = bid * n + block * gridDim.x * blockDim.x; + for (int i = tid; i < n; i += blockDim.x) { + if (num_index + i >= element_cnt) { + break; + } + float diff = static_cast(input[num_index + i]) - s_mean[var_index]; + local_var_sum += diff * diff; + } + variance = blockReduceSum(local_var_sum); + if (tid == 0) { + s_variance[var_index] = rsqrtf(variance / n + epsilion); + } + } + __syncthreads(); + for (int block = 0; block < block_loop; block++) { + int var_index = bid + block * gridDim.x; + int num_index = bid * n + block * gridDim.x * blockDim.x; + for (int i = tid; i < n; i += blockDim.x) { + if (num_index + i >= element_cnt) { + break; + } + float beta_val = (beta == nullptr) ? 
0.0f : static_cast(beta[i]); + output[num_index + i] = + static_cast(((static_cast(input[num_index + i]) - s_mean[var_index]) * s_variance[var_index]) * + static_cast(gamma[i]) + + beta_val); + } + } +} + +template +void Normalize(const T *input, const T *gamma, const T *beta, T *output, size_t dim_at_axis, float epsilion, + int element_cnt, cudaStream_t stream) { + int thread_num = GET_THREADS_CAL(dim_at_axis); + int block_num = GET_BLOCKS_CAL(element_cnt, thread_num); + int dim_before_axis = element_cnt / dim_at_axis; + NormalizeKernel<<>>(input, gamma, beta, output, dim_at_axis, epsilion, + dim_before_axis); + return; +} + +template void Normalize(const float *input, const float *gamma, const float *beta, float *output, size_t dim_at_axis, + float epsilion, int element_cnt, cudaStream_t stream); diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh new file mode 100644 index 00000000000..03eada9f3b4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh @@ -0,0 +1,24 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_NORMALIZE_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_NORMALIZE_H_ + +template +void Normalize(const T *input, const T *gamma, const T *beta, T *output, size_t dim_at_axis, float epsilion, + int element_cnt, cudaStream_t stream); + +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_CDUA_IMPL_NORMALIZE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/utils.cuh b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/utils.cuh new file mode 100644 index 00000000000..8d957877db9 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/cuda_impl/utils.cuh @@ -0,0 +1,41 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#define FINAL_MASK 0xffffffff + +template +__device__ T warpedReduceSum(T val) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + val += __shfl_xor_sync(FINAL_MASK, val, mask, 32); + } + return val; +} + +template +__device__ T blockReduceSum(T val) { + static __shared__ T shared[32]; + int warped = threadIdx.x & 0x1f; + val = warpedReduceSum(val); + if (warped == 0) shared[threadIdx.x >> 5] = val; + __syncthreads(); + val = (threadIdx.x < (blockDim.x / 32.f)) ? 
shared[warped] : static_cast(0.0); + val = warpedReduceSum(val); + return val; +} diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.cc new file mode 100644 index 00000000000..48f49e688d1 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.cc @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" + +namespace mindspore::lite { +int GetGPUGroupSize() { return 1; } + +int GetRankID() { return 0; } +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.h b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.h new file mode 100644 index 00000000000..4feddaadb1e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base.h @@ -0,0 +1,31 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_ + +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" + +namespace mindspore::lite { +constexpr char NCCL_WORLD_GROUP[] = "nccl_world_group"; + +int GetGPUGroupSize(); + +int GetRankID(); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_BASE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base_impl.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base_impl.cc new file mode 100644 index 00000000000..760952e89b2 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_base_impl.cc @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
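// NOTE (illustrative sketch, not part of this patch): the Normalize launcher and the
// blockReduceSum/warpedReduceSum helpers earlier in this patch implement a row-wise layer
// normalization, output[i] = (input[i] - mean) * rsqrt(variance + epsilion) * gamma[i] + beta[i],
// with one block reducing each row of length dim_at_axis (looping when the grid is smaller than
// the row count). A minimal host-side call could look like the following; the device buffers,
// row count and epsilon value are assumptions made only for this example.
void LaunchNormalizeExample(const float *d_input, const float *d_gamma, const float *d_beta, float *d_output,
                            cudaStream_t stream) {
  const size_t dim_at_axis = 256;    // length of the normalized (last) axis
  const int element_cnt = 32 * 256;  // 32 rows of 256 elements each
  const float epsilon = 1e-5f;
  Normalize(d_input, d_gamma, d_beta, d_output, dim_at_axis, epsilon, element_cnt, stream);
}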
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" +#include +#include +#include +#include "plugin/device/gpu/hal/device/distribution/collective_wrapper.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int GetGPUGroupSize() { return GetGroupSize(NCCL_WORLD_GROUP); } + +int GetRankID() { return GetRankIDByGroup(NCCL_WORLD_GROUP); } +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.cc new file mode 100644 index 00000000000..e3cc692de7b --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.cc @@ -0,0 +1,38 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" + +namespace mindspore::lite { +DistributionCollective::DistributionCollective() {} + +DistributionCollective &DistributionCollective::instance() { + static DistributionCollective instance; + return instance; +} + +int DistributionCollective::ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count, + nvinfer1::DataType data_type, schema::ReduceMode reduce_type, + cudaStream_t stream, const std::string &group) { + return RET_OK; +} + +int DistributionCollective::AllGatherWrapper(const void *input_addr, void *output_addr, size_t count, + nvinfer1::DataType data_type, cudaStream_t stream, + const std::string &group_name) { + return RET_OK; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.h b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.h new file mode 100644 index 00000000000..43ac1acbfa7 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective.h @@ -0,0 +1,45 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_ + +#include +#include "NvInfer.h" +#include "schema/ops_types_generated.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" + +namespace mindspore::lite { +class DistributionCollective { + public: + DistributionCollective(DistributionCollective const &) = delete; + + DistributionCollective &operator=(const DistributionCollective &) = delete; + + static DistributionCollective &instance(); + + int ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count, nvinfer1::DataType data_type, + schema::ReduceMode reduce_type, cudaStream_t stream, const std::string &group); + + int AllGatherWrapper(const void *input_addr, void *output_addr, size_t count, nvinfer1::DataType data_type, + cudaStream_t stream, const std::string &group_name); + + private: + DistributionCollective(); + + ~DistributionCollective() = default; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_COLLECTIVE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective_impl.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective_impl.cc new file mode 100644 index 00000000000..e524db6a6f9 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_collective_impl.cc @@ -0,0 +1,72 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" +#include +#include +#include +#include "plugin/device/gpu/hal/device/distribution/collective_wrapper.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_utils.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_base.h" + +namespace mindspore::lite { +DistributionCollective::DistributionCollective() { + InitMPI(); + InitNCCLComm(); +} + +DistributionCollective &DistributionCollective::instance() { + static DistributionCollective instance; + return instance; +} + +int DistributionCollective::ReduceScatterWrapper(const void *input_addr, void *output_addr, size_t count, + nvinfer1::DataType data_type, schema::ReduceMode reduce_type, + cudaStream_t stream, const std::string &group) { + int rank_id = GetRankID(); + MS_LOG(DEBUG) << "ReduceScatter on rank: " << rank_id; + ncclResult_t ret = ReduceScatter(input_addr, output_addr, count, ConvertNCCLDataType(data_type), + ConvertNCCLReduceMode(reduce_type), stream, group); + if (ret != ncclSuccess) { + MS_LOG(ERROR) << "ReduceScatter failed: " << static_cast(ret); + return RET_ERROR; + } + auto cuda_ret = cudaStreamSynchronize(stream); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaStreamSynchronize failed: " << static_cast(cuda_ret); + return RET_ERROR; + } + return RET_OK; +} + +int DistributionCollective::AllGatherWrapper(const void *input_addr, void *output_addr, size_t count, + nvinfer1::DataType data_type, cudaStream_t stream, + const std::string &group_name) { + int rank_id = GetRankID(); + MS_LOG(DEBUG) << "AllGather on rank: " << rank_id; + ncclResult_t ret = AllGather(input_addr, output_addr, count, ConvertNCCLDataType(data_type), stream, group_name); + if (ret != ncclSuccess) { + MS_LOG(ERROR) << "AllGather failed: " << static_cast(ret); + return RET_ERROR; + } + auto cuda_ret = cudaStreamSynchronize(stream); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaStreamSynchronize failed: " << static_cast(cuda_ret); + return RET_ERROR; + } + return RET_OK; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.cc b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.cc new file mode 100644 index 00000000000..8f45360c1b4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.cc @@ -0,0 +1,58 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
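// NOTE (illustrative sketch, not part of this patch): how a plugin's enqueue() typically drives
// the NCCL wrappers above; the device pointers and element count are assumptions made only for
// this example.
int AllGatherExample(const void *device_in, void *device_out, size_t send_count, cudaStream_t stream) {
  // Gathers send_count elements from every rank into device_out on each rank; the wrapper
  // synchronizes the stream itself before returning.
  return DistributionCollective::instance().AllGatherWrapper(device_in, device_out, send_count,
                                                             nvinfer1::DataType::kFLOAT, stream, NCCL_WORLD_GROUP);
}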
+ */
+
+#include "src/runtime/delegate/tensorrt/distribution/distribution_utils.h"
+#include <unordered_map>
+#include "src/common/log_adapter.h"
+
+namespace mindspore::lite {
+ncclDataType_t ConvertNCCLDataType(nvinfer1::DataType type_id) {
+  std::unordered_map<nvinfer1::DataType, ncclDataType_t> data_type_map = {
+    {nvinfer1::DataType::kINT8, ncclInt8},
+    {nvinfer1::DataType::kINT32, ncclInt32},
+    {nvinfer1::DataType::kFLOAT, ncclFloat32},
+    {nvinfer1::DataType::kHALF, ncclHalf},
+  };
+  auto iter = data_type_map.find(type_id);
+  ncclDataType_t data_type;
+  if (iter != data_type_map.end()) {
+    data_type = iter->second;
+  } else {
+    data_type = ncclFloat32;
+    MS_LOG(WARNING) << "invalid data_type for NCCL, need check: " << static_cast<int>(type_id);
+  }
+  return data_type;
+}
+
+ncclRedOp_t ConvertNCCLReduceMode(schema::ReduceMode mode) {
+  std::unordered_map<schema::ReduceMode, ncclRedOp_t> reduce_ops_ = {
+    // higher version support mean {schema::ReduceMode::ReduceMode_ReduceMean, ncclAvg},
+    {schema::ReduceMode::ReduceMode_ReduceMax, ncclMax},
+    {schema::ReduceMode::ReduceMode_ReduceMin, ncclMin},
+    {schema::ReduceMode::ReduceMode_ReduceProd, ncclProd},
+    {schema::ReduceMode::ReduceMode_ReduceSum, ncclSum},
+  };
+  auto iter = reduce_ops_.find(mode);
+  ncclRedOp_t nccl_mode;
+  if (iter != reduce_ops_.end()) {
+    nccl_mode = iter->second;
+  } else {
+    nccl_mode = ncclSum;
+    MS_LOG(WARNING) << "invalid reduce for NCCL, need check: " << static_cast<int>(mode);
+  }
+  return nccl_mode;
+}
+} // namespace mindspore::lite
diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.h b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.h
new file mode 100644
index 00000000000..e38b3a10691
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/tensorrt/distribution/distribution_utils.h
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
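// NOTE (illustrative, not part of this patch): types and modes missing from the two tables above
// fall back with a warning rather than failing, e.g. ConvertNCCLDataType(nvinfer1::DataType::kBOOL)
// returns ncclFloat32, and ConvertNCCLReduceMode(schema::ReduceMode_ReduceMean) returns ncclSum
// because ncclAvg requires a newer NCCL release than the commented-out entry targets.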
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_ + +#include +#include "include/errorcode.h" +#include "NvInfer.h" +#include "schema/ops_types_generated.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::lite { +ncclDataType_t ConvertNCCLDataType(nvinfer1::DataType type_id); + +ncclRedOp_t ConvertNCCLReduceMode(schema::ReduceMode mode); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_DISTRIBUTION_DISTRIBUTION_UTILS_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.cc new file mode 100644 index 00000000000..6ef4682d4be --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.cc @@ -0,0 +1,116 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "NvInferRuntimeCommon.h" +#include "src/runtime/delegate/tensorrt/op/activation_opt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/activation.cuh" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/swish_impl.cuh" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(ActivationOptPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int ActivationOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) noexcept { + return RunCudaActivation(inputDesc, inputs, outputs, stream); +} + +bool ActivationOptPlugin::needResize(const int *current_dims, const int *last_dims) { + for (int i = 0; i < infer_dims_cnt_; i++) { + if (current_dims[i] != last_dims[i]) { + return true; + } + } + return false; +} + +int ActivationOptPlugin::RunCuDNNActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, + void *const *outputs, cudaStream_t stream) { + if (needResize(infer_dims_, inputDesc[0].dims.d)) { + if (input_desc_ != nullptr) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_)); + input_desc_ = nullptr; + } + CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc_)); + for (int i = 0; i < inputDesc[0].dims.nbDims; i++) { + infer_dims_[i] = inputDesc[0].dims.d[i]; + } + CUDNN_CHECK(cudnnSetTensorNdDescriptor(input_desc_, ConvertCudnnDataType(inputDesc[0].type), infer_dims_cnt_, + infer_dims_, infer_stride_)); + } + CHECK_NULL_RETURN(cudnn_handle_); + CHECK_NULL_RETURN(activation_desc_); + CHECK_NULL_RETURN(input_desc_); + CUDNN_CHECK(cudnnSetStream(cudnn_handle_, stream)); + 
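// Descriptors are rebuilt above only when needResize() reports a shape change, and the plugin's
// cuDNN handle is bound to TensorRT's execution stream right before launch, so the call below runs
// asynchronously on the same stream as the rest of the engine. CudnnActivation() is the thin
// wrapper from cuda_impl/cudnn_utils.h; it presumably forwards to cudnnActivationForward with the
// prepared activation and tensor descriptors.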
auto ret = CudnnActivation(cudnn_handle_, activation_desc_, input_desc_, inputs[0], input_desc_, outputs[0]); + if (ret != RET_OK) { + MS_LOG(ERROR) << "cudnn activation func call failed " << layer_name_; + return ret; + } + return RET_OK; +} + +int ActivationOptPlugin::RunCudaActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, + void *const *outputs, cudaStream_t stream) { + switch (activation_type_) { + case (schema::ActivationType::ActivationType_SIGMOID): { + Sigmoid(static_cast(inputs[0]), static_cast(outputs[0]), GetDimsVolume(inputDesc[0].dims), + stream); + break; + } + case (schema::ActivationType::ActivationType_GELU): { + Gelu(static_cast(inputs[0]), static_cast(outputs[0]), GetDimsVolume(inputDesc[0].dims), + stream); + break; + } + case (schema::ActivationType::ActivationType_SWISH): { + CalSwish(GetDimsVolume(inputDesc[0].dims), static_cast(inputs[0]), + static_cast(outputs[0]), stream, device_id_); + break; + } + default: { + MS_LOG(ERROR) << "invalid activation type: " << static_cast(activation_type_); + return RET_ERROR; + } + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *ActivationOptPlugin::clone() const noexcept { + auto *plugin = new ActivationOptPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +size_t ActivationOptPlugin::getSerializationSize() const noexcept { return sizeof(schema::ActivationType); } + +void ActivationOptPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &activation_type_, sizeof(schema::ActivationType)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.h new file mode 100644 index 00000000000..9e3b5dfd952 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_opt_plugin.h @@ -0,0 +1,72 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
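// NOTE (illustrative sketch, not part of this patch): the plugin state above is a single enum, so
// the TensorRT serialize/deserialize round trip reduces to copying sizeof(schema::ActivationType)
// bytes; the buffer and plugin name below are assumptions made only for this example.
void ActivationPluginRoundTripExample(const ActivationOptPlugin &plugin) {
  std::vector<char> buffer(plugin.getSerializationSize());
  plugin.serialize(buffer.data());
  // The (name, serialData, serialLength) constructor re-reads activation_type_ via DeserializeValue.
  ActivationOptPlugin restored("activation_opt_example", buffer.data(), buffer.size());
  (void)restored;
}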
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_ + +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cudnn_utils.h" + +namespace mindspore::lite { +constexpr char *ACTIVATION_OPT_PLUGIN_NAME{"ActivationOptPlugin"}; +class ActivationOptPlugin : public TensorRTPlugin { + public: + ActivationOptPlugin(const std::string name, schema::ActivationType activation_type, uint32_t device_id) + : TensorRTPlugin(name, std::string(ACTIVATION_OPT_PLUGIN_NAME), device_id), activation_type_(activation_type) {} + + ActivationOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(ACTIVATION_OPT_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + activation_type_ = static_cast(fields[0].data)[0]; + } + + ActivationOptPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(ACTIVATION_OPT_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &activation_type_, sizeof(schema::ActivationType)); + } + + ActivationOptPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + bool needResize(const int *current_dims, const int *last_dims); + int RunCudaActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs, + cudaStream_t stream); + int RunCuDNNActivation(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs, + cudaStream_t stream); + const std::string layer_name_; + std::string name_space_; + schema::ActivationType activation_type_; + cudnnHandle_t cudnn_handle_{nullptr}; + cudnnActivationDescriptor_t activation_desc_{nullptr}; + cudnnTensorDescriptor_t input_desc_{nullptr}; + int infer_dims_[5]{1, 1, 1, 1, 1}; + int infer_stride_[5]{1, 1, 1, 1, 1}; + int infer_dims_cnt_{0}; +}; +class ActivationOptPluginCreater : public TensorRTPluginCreater { + public: + ActivationOptPluginCreater() : TensorRTPluginCreater(std::string(ACTIVATION_OPT_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_OPT_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.cc new file mode 100644 index 00000000000..e78ec89dddc --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.cc @@ -0,0 +1,153 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/cast_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_opt_plugin.h" + +namespace mindspore::lite { +namespace { +bool HasCustomActivationPlugin(schema::ActivationType type) { + std::unordered_set plugin_activation = {schema::ActivationType::ActivationType_SIGMOID, + schema::ActivationType::ActivationType_GELU, + schema::ActivationType::ActivationType_SWISH}; + return plugin_activation.find(type) != plugin_activation.end(); +} +} // namespace + +int ActivationTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + auto activation_op = this->op_primitive_->value_as_Activation(); + if (activation_op == nullptr) { + MS_LOG(ERROR) << "op convert failed"; + return RET_ERROR; + } + auto activation_params_opt = TryConvertActivationType(activation_op->activation_type()); + bool has_custom_plugin = HasCustomActivationPlugin(activation_op->activation_type()); + if (!activation_params_opt && !has_custom_plugin) { + MS_LOG(ERROR) << "Unsupported op action type for TensorRT: " << activation_op->activation_type(); + return RET_ERROR; + } + return RET_OK; +} +int ActivationTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx->network() == nullptr) { + MS_LOG(ERROR) << "network is invalid"; + return RET_ERROR; + } + auto activation_op = this->op_primitive_->value_as_Activation(); + if (activation_op == nullptr) { + MS_LOG(ERROR) << "op convert failed"; + return RET_ERROR; + } + float alpha = activation_op->alpha(); + nvinfer1::ITensor *activation_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getType() == nvinfer1::DataType::kINT32) { + activation_input = + TRTTensorCast(ctx, tensorrt_in_tensors_[0].trt_tensor_, nvinfer1::DataType::kFLOAT, op_name_ + "_cast_in"); + } + + auto activation_layer = + ActivationTensorRT::AddActivation(ctx, activation_op->activation_type(), alpha, + std::isfinite(activation_op->min_val()) ? activation_op->min_val() : FLT_MIN, + std::isfinite(activation_op->max_val()) ? 
activation_op->max_val() : FLT_MAX, + activation_input, device_id_, quant_type_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "add activation op failed for TensorRT."; + return RET_ERROR; + } + + activation_layer->setName(op_name_.c_str()); + // cast to origin type + nvinfer1::ITensor *out_tensor = activation_layer->getOutput(0); + if (out_tensor->getType() != ConvertDataType(out_tensors_[0].DataType())) { + out_tensor = TRTTensorCast(ctx, activation_layer->getOutput(0), ConvertDataType(out_tensors_[0].DataType()), + op_name_ + "_cast_out"); + } + out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + this->layer_ = activation_layer; + return RET_OK; +} +nvinfer1::ILayer *ActivationTensorRT::AddActivation(TensorRTContext *ctx, schema::ActivationType activation_type, + float alpha, float min_value, float max_value, + nvinfer1::ITensor *trt_in_tensor, uint32_t device_id, + schema::QuantType quant_type) { + bool has_custom_plugin = HasCustomActivationPlugin(activation_type); + // sigmoid precision is wrong for trt + if (quant_type == schema::QuantType_QUANT_NONE && has_custom_plugin) { + std::string layer_name = std::string(trt_in_tensor->getName()) + "_activation"; + auto plugin = std::make_shared(layer_name.c_str(), activation_type, device_id); + MS_LOG(INFO) << "using opt plugin for " << layer_name; + if (plugin == nullptr) { + MS_LOG(ERROR) << "create ActivationOptPlugin failed for " << layer_name; + return nullptr; + } + nvinfer1::ITensor *inputTensors[] = {trt_in_tensor}; + nvinfer1::IPluginV2Layer *activation_opt_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); + activation_opt_layer->setName(layer_name.c_str()); + return activation_opt_layer; + } + + // Just some action_code correct, unfind code is set to default relu. need double check. + auto action_param_opt = TryConvertActivationType(activation_type); + if (!action_param_opt) { + MS_LOG(ERROR) << "Unsupported op action type for TensorRT: " << activation_type; + return nullptr; + } + auto action_param = action_param_opt.value(); + nvinfer1::IActivationLayer *activation_layer = + ctx->network()->addActivation(*trt_in_tensor, action_param.activation_type); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "add activation op failed for TensorRT."; + return nullptr; + } + + if (activation_type == schema::ActivationType_HARD_TANH) { + activation_layer->setAlpha(min_value); + activation_layer->setBeta(max_value); + return activation_layer; + } + + if (action_param.has_alpha) { + activation_layer->setAlpha(alpha); + } + + if (action_param.has_beta) { + activation_layer->setBeta(action_param.beta); + } + + return activation_layer; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Activation, ActivationTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.h new file mode 100644 index 00000000000..81292b520c5 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/activation_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
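// NOTE (illustrative, not part of this patch): in AddActivation() above, ActivationType_HARD_TANH
// is the one case that consumes min_val/max_val directly, parameterizing the TensorRT activation
// (presumably nvinfer1::ActivationType::kCLIP) as
//   y = max(min_val, min(max_val, x)),
// so a HardTanh with finite bounds becomes a single clip layer instead of a custom plugin.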
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ActivationTensorRT : public TensorRTOp { + public: + ActivationTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ActivationTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + static nvinfer1::ILayer *AddActivation(TensorRTContext *ctx, schema::ActivationType activation_type, float alpha, + float min_value, float max_value, nvinfer1::ITensor *trt_in_tensor, + uint32_t device_id = 0, + schema::QuantType quant_type = schema::QuantType_QUANT_NONE); +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ACTIVATION_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.cc new file mode 100644 index 00000000000..7869766e197 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.cc @@ -0,0 +1,113 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/allgather_tensorrt.h" +#include +#include "NvInferRuntimeCommon.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(AllGatherPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int AllGatherTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { +#ifndef LITE_CUDA_DISTRIBUTION + MS_LOG(ERROR) + << "Unsupported package for gpu distribution feature, please recompile with MS_ENABLE_CUDA_DISTRIBUTION set to on."; + return RET_ERROR; +#else + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +#endif +} + +int AllGatherTensorRT::AddInnerOp(TensorRTContext *ctx) { + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_}; + auto allgather_op = op_primitive_->value_as_AllGather(); + if (allgather_op == nullptr) { + MS_LOG(ERROR) << "convert failed for " << op_name_; + return RET_ERROR; + } + int rank = GetGPUGroupSize(); + auto plugin = std::make_shared(op_name_, rank, device_id_); + MS_LOG(INFO) << op_name_ << " group size: " << rank << ", rank id: " << GetRankID(); + nvinfer1::IPluginV2Layer *allgather_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); + if (allgather_layer == nullptr) { + MS_LOG(ERROR) << "create AllGather layer failed for: " << op_name_; + return RET_ERROR; + } + nvinfer1::ITensor *allgather_out = allgather_layer->getOutput(0); + allgather_layer->setName(op_name_.c_str()); + allgather_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{allgather_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + this->layer_ = allgather_layer; + return RET_OK; +} + +// AllGatherPlugin +int AllGatherPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + MS_LOG(INFO) << "all gather run at rank id: " << GetRankID() << " stream: " << stream; + nvinfer1::Dims input_dims = inputDesc[0].dims; + int send_element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies()); + const void *input = inputs[0]; + void *output = outputs[0]; + auto ret = DistributionCollective::instance().AllGatherWrapper(input, output, send_element_cnt, inputDesc->type, + stream, NCCL_WORLD_GROUP); + if (ret != RET_OK) { + MS_LOG(ERROR) << "AllGather nccl run failed for " << layer_name_; + return ret; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *AllGatherPlugin::clone() const noexcept { + auto *plugin = new AllGatherPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs AllGatherPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, + int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs out_dims{}; + out_dims.nbDims = inputs->nbDims; + auto rank_dim = 
exprBuilder.constant(rank_); + out_dims.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs->d[0], *rank_dim); + for (int i = 1; i < inputs->nbDims; i++) { + out_dims.d[i] = inputs->d[i]; + } + return out_dims; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AllGather, AllGatherTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.h new file mode 100644 index 00000000000..a8e266e526e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/allgather_tensorrt.h @@ -0,0 +1,75 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_ +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" + +namespace mindspore::lite { +constexpr char *ALLGATHER_PLUGIN_NAME{"AllGatherPlugin"}; +class AllGatherTensorRT : public TensorRTOp { + public: + AllGatherTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~AllGatherTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +class AllGatherPlugin : public TensorRTPlugin { + public: + AllGatherPlugin(const std::string name, int rank, uint32_t device_id) + : TensorRTPlugin(name, std::string(ALLGATHER_PLUGIN_NAME), device_id), rank_(rank) {} + + AllGatherPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(ALLGATHER_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + rank_ = static_cast(fields[0].data)[0]; + } + + AllGatherPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(ALLGATHER_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &rank_, sizeof(int)); + } + + AllGatherPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + + private: + int rank_{0}; +}; +class 
AllGatherPluginCreater : public TensorRTPluginCreater { + public: + AllGatherPluginCreater() : TensorRTPluginCreater(std::string(ALLGATHER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ALLGATHER_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.cc new file mode 100644 index 00000000000..9fde14fb2e4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.cc @@ -0,0 +1,83 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/cast_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh" +#include +#include +#include +#include + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(CastPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int CastPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + nvinfer1::Dims input_dims = inputDesc[0].dims; + int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies()); + + if (inputDesc->type == outputDesc->type) { + int element_size = (outputDesc->type == nvinfer1::DataType::kFLOAT) + ? sizeof(float) + : ((outputDesc->type == nvinfer1::DataType::kINT32) ? 
sizeof(int) : 0); + auto cuda_ret = cudaMemcpy(outputs[0], inputs[0], element_cnt * element_size, cudaMemcpyDeviceToDevice); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "copy mem failed for " << layer_name_; + return RET_ERROR; + } + return RET_OK; + } + if (inputDesc->type == nvinfer1::DataType::kINT32 && dest_datatype_ == nvinfer1::DataType::kFLOAT) { + auto input = static_cast(inputs[0]); + auto output = static_cast(outputs[0]); + Cast(element_cnt, input, output, stream); + } else if (inputDesc->type == nvinfer1::DataType::kFLOAT && dest_datatype_ == nvinfer1::DataType::kINT32) { + auto input = static_cast(inputs[0]); + auto output = static_cast(outputs[0]); + Cast(element_cnt, input, output, stream); + } else { + MS_LOG(ERROR) << "unsupported data type cast " << layer_name_; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *CastPlugin::clone() const noexcept { + auto *plugin = new CastPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DataType CastPlugin::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const + noexcept { + return dest_datatype_; +} + +size_t CastPlugin::getSerializationSize() const noexcept { + // origin_datatype_ and dest_datatype_ + return sizeof(nvinfer1::DataType) * 2; +} + +void CastPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &origin_datatype_, sizeof(nvinfer1::DataType)); + SerializeValue(&buffer, &dest_datatype_, sizeof(nvinfer1::DataType)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.h new file mode 100644 index 00000000000..100e142c990 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_plugin.h @@ -0,0 +1,67 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
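// NOTE (illustrative sketch, not part of this patch): CastPlugin::enqueue() above only converts
// between kFLOAT and kINT32 and degrades to a device-to-device memcpy when source and destination
// types already match. Constructing the plugin directly looks like this; the layer name is an
// assumption made only for this example, and device_id defaults to 0.
std::shared_ptr<CastPlugin> MakeCastPluginExample() {
  return std::make_shared<CastPlugin>("cast_example", nvinfer1::DataType::kINT32, nvinfer1::DataType::kFLOAT);
}
// getOutputDataType() then reports kFLOAT for the single output, regardless of the input types.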
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" + +namespace mindspore::lite { +constexpr char *CAST_PLUGIN_NAME{"CastPluginCreater"}; +class CastPlugin : public TensorRTPlugin { + public: + CastPlugin(const std::string name, nvinfer1::DataType origin_datatype, nvinfer1::DataType dest_datatype, + uint32_t device_id = 0) + : TensorRTPlugin(name, std::string(CAST_PLUGIN_NAME), device_id), + origin_datatype_(origin_datatype), + dest_datatype_(dest_datatype) {} + + CastPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(CAST_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + origin_datatype_ = static_cast(fields[0].data)[0]; + dest_datatype_ = static_cast(fields[1].data)[0]; + } + + CastPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(CAST_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &origin_datatype_, sizeof(nvinfer1::DataType)); + DeserializeValue(&serialData, &serialLength, &dest_datatype_, sizeof(nvinfer1::DataType)); + } + + CastPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const + noexcept override; + + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + nvinfer1::DataType origin_datatype_; + nvinfer1::DataType dest_datatype_; +}; +class CastPluginCreater : public TensorRTPluginCreater { + public: + CastPluginCreater() : TensorRTPluginCreater(std::string(CAST_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.cc new file mode 100644 index 00000000000..d9490408076 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/cast_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/cast_plugin.h" +#include +#include +#include +#include + +namespace mindspore::lite { +int CastTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int CastTensorRT::AddInnerOp(TensorRTContext *ctx) { + // cast to type tensor + auto type_tensor = in_tensors_[1]; + if (type_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "unknown cast type of " << op_name_; + return RET_ERROR; + } + auto type_data = static_cast(type_tensor.Data().get()); + DataType data_type = static_cast(type_data[0]); + MS_LOG(DEBUG) << op_name_ << " cast to data type(43 float): " << type_data[0]; + nvinfer1::DataType dest_datatype = ConvertDataType(data_type); + auto trt_tensor = tensorrt_in_tensors_[0].trt_tensor_; + +#if TRT_VERSION_GE(7, 2) + dest_datatype = (dest_datatype == nvinfer1::DataType::kBOOL ? nvinfer1::DataType::kINT32 : dest_datatype); + auto cast_layer = ctx->network()->addIdentity(*trt_tensor); +#else + auto plugin = std::make_shared(op_name_, trt_tensor->getType(), dest_datatype); + nvinfer1::ITensor *inputTensors[] = {trt_tensor}; + nvinfer1::IPluginV2Layer *cast_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); +#endif + if (cast_layer == nullptr) { + MS_LOG(ERROR) << "create cast layer failed for: " << op_name_; + return RET_ERROR; + } +#if TRT_VERSION_GE(7, 2) + cast_layer->setOutputType(0, dest_datatype); +#endif + cast_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *cast_out = cast_layer->getOutput(0); + cast_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{cast_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + this->layer_ = cast_layer; + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Cast, CastTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.h new file mode 100644 index 00000000000..e96d9477568 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/cast_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
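// NOTE (illustrative, not part of this patch): CastTensorRT::AddInnerOp() above picks between two
// strategies. On TensorRT >= 7.2 it uses the identity-layer cast idiom, roughly
//   auto cast_layer = ctx->network()->addIdentity(*trt_tensor);
//   cast_layer->setOutputType(0, dest_datatype);  // requested output precision
// (with kBOOL demoted to kINT32 first), while older versions fall back to the CastPlugin from
// cast_plugin.h, which performs the conversion in its own CUDA kernel.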
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh" + +namespace mindspore::lite { +class CastTensorRT : public TensorRTOp { + public: + CastTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~CastTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + // CastTensorRT +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CAST_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.cc new file mode 100644 index 00000000000..e1b1eba8aa3 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.cc @@ -0,0 +1,158 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/concate_tensorrt.h" +#include +#include + +namespace mindspore::lite { +int ConcateTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (type_ != schema::PrimitiveType_Stack && type_ != schema::PrimitiveType_Concat) { + MS_LOG(ERROR) << "Unsupported op :" << op_name_ << " , type: " << type_; + return RET_ERROR; + } + if (in_tensors.size() == 0 || in_tensors.size() < INPUT_SIZE2 && type_ != schema::PrimitiveType_Stack) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + + int input_nbDims = in_tensors_[0].Shape().size(); + if (axis_ == -1) { + axis_ = input_nbDims - 1; + } + if (axis_ < 0 || axis_ > input_nbDims || axis_ == input_nbDims && type_ != schema::PrimitiveType_Stack) { + MS_LOG(ERROR) << "concate_op valid axis : " << axis_ << " , input dims : " << input_nbDims; + return RET_ERROR; + } + return RET_OK; +} +int ConcateTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + + if (tensorrt_in_tensors_.size() != in_tensors_.size()) { + MS_LOG(ERROR) << "concate_op in tensor is invalid, trt tensor has " << tensorrt_in_tensors_.size() + << ", but origin ms tensor has " << in_tensors_.size(); + return RET_ERROR; + } + + nvinfer1::ITensor *trt_input_tensors[tensorrt_in_tensors_.size()]; + int ret = PreProcessInputs(ctx, trt_input_tensors); + if (ret != RET_OK) { + MS_LOG(ERROR) << "PreProcessInputs failed for " << op_name_; + return ret; + } + + if (!same_format_) { + if (trt_input_tensors[0]->getDimensions().nbDims == DIMENSION_4D && out_format_ == Format::NCHW) { + // when inputs all NCHW, change axis + axis_ = ConvertAxisFromNHWC2NCHW(axis_); + MS_LOG(DEBUG) << "concate axis change to " << axis_ << " when using NCHW format."; + } else { + MS_LOG(WARNING) << "input tensor format needs check, convert concat axis failed for " << op_name_; + } + } + + if (type_ == schema::PrimitiveType_Stack) { + for (size_t i = 0; i != tensorrt_in_tensors_.size(); ++i) { + auto shuffle_layer = ctx->network()->addShuffle(*trt_input_tensors[i]); + if (shuffle_layer == nullptr) { + MS_LOG(ERROR) << "addShuffle failed for TensorRT."; + return RET_ERROR; + } + auto shuffer_dims_opt = UnsqueezeDims(trt_input_tensors[i]->getDimensions(), axis_, 1); + if (!shuffer_dims_opt) { + MS_LOG(ERROR) << "UnsqueezeDims failed."; + return RET_ERROR; + } + shuffle_layer->setReshapeDimensions(shuffer_dims_opt.value()); + trt_input_tensors[i] = shuffle_layer->getOutput(0); + } + } + nvinfer1::IConcatenationLayer *concate_layer = + ctx->network()->addConcatenation(trt_input_tensors, static_cast(tensorrt_in_tensors_.size())); + if (concate_layer == nullptr) { + MS_LOG(ERROR) << "addConcatenation failed for TensorRT."; + return RET_ERROR; + } + + if (axis_ != RET_INVALID_OP_ATTR) { + concate_layer->setAxis(axis_); + } + concate_layer->setName(op_name_.c_str()); + auto concat_output = concate_layer->getOutput(0); + concat_output->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{concat_output, out_format_, same_format_}); + this->layer_ = 
concate_layer; + return RET_OK; +} + +int ConcateTensorRT::PreProcessInputs(TensorRTContext *ctx, nvinfer1::ITensor *trt_input_tensors[]) { + int input_nbDims = tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims; + out_format_ = tensorrt_in_tensors_[0].format_; + same_format_ = tensorrt_in_tensors_[0].same_format_; + + for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) { + if (tensorrt_in_tensors_[i].trt_tensor_->getDimensions().nbDims != input_nbDims) { + MS_LOG(ERROR) << "dims of inputs is invalid for " << op_name_; + return RET_ERROR; + } + // keep origin format if all input format are the same + if (input_nbDims == DIMENSION_4D && tensorrt_in_tensors_[i].format_ != out_format_) { + out_format_ = Format::NHWC; + } + } + + // make sure all inputs are same format + if (input_nbDims == DIMENSION_4D) { + for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) { + if (tensorrt_in_tensors_[i].format_ == out_format_) { + trt_input_tensors[i] = tensorrt_in_tensors_[i].trt_tensor_; + MS_LOG(DEBUG) << "concate input " << GetTensorFormat(tensorrt_in_tensors_[i]); + } else { + nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *tensorrt_in_tensors_[i].trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + trt_input_tensors[i] = transpose_layer->getOutput(0); + this->transpose_layer_ = transpose_layer; + same_format_ = true; + MS_LOG(DEBUG) << "concate input " << GetTensorFormat(trt_input_tensors[i], Format::NHWC, true); + } + } + } else { + for (size_t i = 0; i < tensorrt_in_tensors_.size(); i++) { + trt_input_tensors[i] = tensorrt_in_tensors_[i].trt_tensor_; + MS_LOG(DEBUG) << "concate input " << GetTensorFormat(tensorrt_in_tensors_[i]); + } + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Concat, ConcateTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Stack, ConcateTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.h new file mode 100644 index 00000000000..351f4abf17b --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/concate_tensorrt.h @@ -0,0 +1,50 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ConcateTensorRT : public TensorRTOp { + public: + ConcateTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) { + type_ = primitive->value_type(); + axis_ = (type_ == schema::PrimitiveType_Concat ? 
primitive->value_as_Concat()->axis() + : primitive->value_as_Stack()->axis()); + } + + ~ConcateTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int PreProcessInputs(TensorRTContext *ctx, nvinfer1::ITensor *trt_input_tensors[]); + + Format out_format_{Format::NHWC}; + bool same_format_{true}; + schema::PrimitiveType type_; + int axis_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONCATE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.cc new file mode 100644 index 00000000000..28e3215ebcf --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.cc @@ -0,0 +1,187 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/convolution_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" + +namespace mindspore::lite { +constexpr int BIAS_INDEX = 2; + +int ConvolutionTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) { + MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format(); + return RET_ERROR; + } + return RET_OK; +} + +int ConvolutionTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + const schema::Conv2DFusion *conv_op = this->op_primitive_->value_as_Conv2DFusion(); + if (conv_op == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + + nvinfer1::ITensor *conv_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // transpose: NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NHWC->NCHW failed"; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + this->transpose_layer_ = transpose_layer_in; + conv_input = 
transpose_layer_in->getOutput(0); + } + + // transpose weight + const mindspore::MSTensor &weight_tensor = in_tensors_[1]; + nvinfer1::Weights kernelWeights = lite::TransposeWeight4D(weight_tensor, &pack_weight_); + + // conv + int nbOutputMaps = weight_tensor.Shape()[0]; + if (nbOutputMaps <= 0) { + MS_LOG(ERROR) << "out_channel is invalid"; + return RET_ERROR; + } + + auto kernel_size = conv_op->kernel_size(); + if (kernel_size == nullptr) { + MS_LOG(ERROR) << "kernel_size is null"; + return RET_ERROR; + } + nvinfer1::Dims kernelSize = lite::ConvertCudaDims(std::vector(kernel_size->begin(), kernel_size->end())); + if (kernelSize.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + // bias + nvinfer1::Weights biasWeights{}; + if (in_tensors_.size() >= INPUT_SIZE3) { + biasWeights = lite::ConvertWeight(in_tensors_[BIAS_INDEX]); + } else { + biasWeights.type = ConvertDataType(weight_tensor.DataType()); + biasWeights.count = 0; + biasWeights.values = nullptr; + } + + nvinfer1::IConvolutionLayer *conv_layer = + ctx->network()->addConvolutionNd(*conv_input, nbOutputMaps, kernelSize, kernelWeights, biasWeights); + + if (conv_layer == nullptr) { + MS_LOG(ERROR) << "ConvolutionLayer failed"; + return RET_ERROR; + } + conv_layer->setName((op_name_ + "_conv").c_str()); + this->layer_ = conv_layer; + + // add params + SetAttributes(conv_op, conv_layer); + + // add activation + nvinfer1::ILayer *activation_layer = nullptr; + if (conv_op->activation_type() == schema::ActivationType::ActivationType_NO_ACTIVATION) { + activation_layer = conv_layer; + } else { + activation_layer = + ActivationTensorRT::AddActivation(ctx, conv_op->activation_type(), 0, 0, 0, conv_layer->getOutput(0), device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for conv failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + } + activation_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW, false}); + return RET_OK; +} + +void ConvolutionTensorRT::SetAttributes(const schema::Conv2DFusion *conv_op, nvinfer1::IConvolutionLayer *conv_layer) { + auto stride = conv_op->stride(); + if (stride != nullptr) { + auto stride_val = std::vector(stride->begin(), stride->end()); + auto dims = ConvertCudaDims(stride_val); + if (dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + conv_layer->setStrideNd(dims); + } + + auto dilation = conv_op->dilation(); + if (dilation != nullptr) { + auto dilation_val = std::vector(dilation->begin(), dilation->end()); + auto dims = ConvertCudaDims(dilation_val); + if (dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + conv_layer->setDilationNd(dims); + } + int nbGroups = conv_op->group(); + if (nbGroups > 0) { + conv_layer->setNbGroups(nbGroups); + } + + schema::PadMode pad_mode = conv_op->pad_mode(); + if (pad_mode == schema::PadMode::PadMode_SAME) { + conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } else { + auto padding = conv_op->pad_list(); + if (padding != nullptr && padding->size() == DIMENSION_4D) { + auto padding_val = std::vector(padding->begin(), padding->end()); + if (padding_val[0] != padding_val[1] || padding_val[DIMENSION_2D] != padding_val[DIMENSION_3D]) { + MS_LOG(WARNING) << op_name_ << " has different up and down padding value"; + } + nvinfer1::Dims2 
dims(padding_val[0], padding_val[DIMENSION_2D]); + conv_layer->setPaddingNd(dims); + } else if (padding == nullptr || padding->size() == 0) { + nvinfer1::Dims2 dims; + conv_layer->setPaddingNd(dims); + } else { + MS_LOG(WARNING) << "pad list is invalid for " << op_name_; + } + } +} + +ConvolutionTensorRT::~ConvolutionTensorRT() { + if (pack_weight_ != nullptr) { + free(pack_weight_); + pack_weight_ = nullptr; + } +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Conv2DFusion, ConvolutionTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.h new file mode 100644 index 00000000000..cfeb755a579 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/convolution_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ConvolutionTensorRT : public TensorRTOp { + public: + ConvolutionTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ConvolutionTensorRT() override; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + void SetAttributes(const schema::Conv2DFusion *ms_op, nvinfer1::IConvolutionLayer *current_layer_); + + void *pack_weight_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_CONVOLUTION_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.cc new file mode 100644 index 00000000000..08e96ed6662 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.cc @@ -0,0 +1,199 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include "nnacl/pack.h" + +namespace mindspore::lite { +int DeconvolutionTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) { + MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format(); + return RET_ERROR; + } + return RET_OK; +} +int DeconvolutionTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + const schema::Conv2dTransposeFusion *deconv_op = this->op_primitive_->value_as_Conv2dTransposeFusion(); + if (deconv_op == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + nvinfer1::ITensor *deconv_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // transpose: NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NHWC->NCHW failed"; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + this->transpose_layer_ = transpose_layer_in; + deconv_input = transpose_layer_in->getOutput(0); + } + + // transpose weight + const mindspore::MSTensor &weight_tensor = in_tensors_[1]; + nvinfer1::Weights kernelWeights = lite::TransposeWeight4D(weight_tensor, &pack_weight_); + + // deconv basic params + int nbOutputMaps = weight_tensor.Shape()[0]; + if (nbOutputMaps <= 0) { + MS_LOG(ERROR) << "out_channel is invalid"; + return RET_ERROR; + } + + auto kernel_size = deconv_op->kernel_size(); + if (kernel_size == nullptr) { + MS_LOG(ERROR) << "kernel_size is null"; + return RET_ERROR; + } + nvinfer1::Dims kernelSize = lite::ConvertCudaDims(std::vector(kernel_size->begin(), kernel_size->end())); + if (kernelSize.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + // bias + nvinfer1::Weights biasWeights{}; + if (in_tensors_.size() >= INPUT_SIZE3) { + biasWeights = lite::ConvertWeight(in_tensors_[INPUT_SIZE3 - 1]); + } else { + biasWeights.type = ConvertDataType(weight_tensor.DataType()); + biasWeights.count = 0; + biasWeights.values = nullptr; + } + + nvinfer1::IDeconvolutionLayer *deconv_layer = + ctx->network()->addDeconvolutionNd(*deconv_input, nbOutputMaps, kernelSize, kernelWeights, biasWeights); + + if (deconv_layer == nullptr) { + MS_LOG(ERROR) << "DeconvolutionLayer failed"; + return RET_ERROR; + } + deconv_layer->setName((op_name_ + "_deconv").c_str()); + this->layer_ = deconv_layer; + // set extra params + SetAttributes(deconv_op, deconv_layer); + + // add activation + nvinfer1::ILayer *activation_layer = nullptr; + if 
(deconv_op->activation_type() == schema::ActivationType::ActivationType_NO_ACTIVATION) { + activation_layer = deconv_layer; + } else { + activation_layer = ActivationTensorRT::AddActivation(ctx, deconv_op->activation_type(), 0, 0, 0, + deconv_layer->getOutput(0), device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for conv failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + } + activation_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{activation_layer->getOutput(0), Format::NCHW, false}); + return RET_OK; +} + +void DeconvolutionTensorRT::SetAttributes(const schema::Conv2dTransposeFusion *ms_op, + nvinfer1::IDeconvolutionLayer *decon_layer) { + // kernel_size + auto kernel_size = ms_op->kernel_size(); + if (kernel_size != nullptr) { + auto kernel_size_val = std::vector(kernel_size->begin(), kernel_size->end()); + nvinfer1::Dims kernel_size_dims = lite::ConvertCudaDims(kernel_size_val); + if (kernel_size_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + decon_layer->setKernelSizeNd(kernel_size_dims); + } + + // nbOutputMaps + int32_t nbOutputMaps = static_cast(ms_op->out_channel()); + decon_layer->setNbOutputMaps(nbOutputMaps); + + // stride + auto stride = ms_op->stride(); + if (stride != nullptr) { + auto stride_val = std::vector(stride->begin(), stride->end()); + nvinfer1::Dims stride_dims = lite::ConvertCudaDims(stride_val); + if (stride_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + decon_layer->setStrideNd(stride_dims); + } + + // nbGroups + int32_t nbGroups = static_cast(ms_op->group()); + decon_layer->setNbGroups(nbGroups); + + // padding + schema::PadMode pad_mode = ms_op->pad_mode(); + if (pad_mode == schema::PadMode::PadMode_SAME) { + decon_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } else { + auto padding = ms_op->pad_list(); + auto out_pad = ms_op->output_paddings(); + if (padding == nullptr || out_pad == nullptr) { + MS_LOG(WARNING) << "on pad value of " << op_name_; + return; + } + auto padding_val = std::vector(padding->begin(), padding->end()); + auto out_pad_val = std::vector(out_pad->begin(), out_pad->end()); // h, w + if (out_pad_val.size() != DIMENSION_2D || padding_val.size() != DIMENSION_4D) { + MS_LOG(ERROR) << "invalid size of pad " << op_name_; + return; + } + nvinfer1::Dims dims_pre{}; + dims_pre.nbDims = DIMENSION_2D; + dims_pre.d[0] = padding_val[0]; // up + dims_pre.d[1] = padding_val[2]; // left + decon_layer->setPrePadding(dims_pre); + nvinfer1::Dims dims_post{}; + dims_post.nbDims = DIMENSION_2D; + dims_post.d[0] = padding_val[1] - out_pad_val[0]; // down + dims_post.d[1] = padding_val[3] - out_pad_val[1]; // right + decon_layer->setPostPadding(dims_post); + } +} + +DeconvolutionTensorRT::~DeconvolutionTensorRT() { + if (pack_weight_ != nullptr) { + free(pack_weight_); + pack_weight_ = nullptr; + } +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Conv2dTransposeFusion, DeconvolutionTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h new file mode 100644 index 00000000000..e7cfe233816 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/deconvolution_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class DeconvolutionTensorRT : public TensorRTOp { + public: + DeconvolutionTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~DeconvolutionTensorRT() override; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + void SetAttributes(const schema::Conv2dTransposeFusion *ms_op, nvinfer1::IDeconvolutionLayer *decon_layer); + + void *pack_weight_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_DECONVOLUTION_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.cc new file mode 100644 index 00000000000..05ac5ceaefd --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.cc @@ -0,0 +1,312 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" + +namespace mindspore::lite { +namespace { +std::unordered_map NOT_BOOL_PRIM2NV_ELEM_OP = { +#if TRT_VERSION_GE(7, 2) + {schema::PrimitiveType_Less, nvinfer1::ElementWiseOperation::kLESS}, + {schema::PrimitiveType_Greater, nvinfer1::ElementWiseOperation::kGREATER}, +#endif + {schema::PrimitiveType_AddFusion, nvinfer1::ElementWiseOperation::kSUM}, + {schema::PrimitiveType_PowFusion, nvinfer1::ElementWiseOperation::kPOW}, + {schema::PrimitiveType_DivFusion, nvinfer1::ElementWiseOperation::kDIV}, + {schema::PrimitiveType_RealDiv, nvinfer1::ElementWiseOperation::kDIV}, + {schema::PrimitiveType_FloorDiv, nvinfer1::ElementWiseOperation::kFLOOR_DIV}, + {schema::PrimitiveType_SubFusion, nvinfer1::ElementWiseOperation::kSUB}, + {schema::PrimitiveType_MulFusion, nvinfer1::ElementWiseOperation::kPROD}, + {schema::PrimitiveType_Minimum, nvinfer1::ElementWiseOperation::kMIN}, + {schema::PrimitiveType_Maximum, nvinfer1::ElementWiseOperation::kMAX}, + {schema::PrimitiveType_BiasAdd, nvinfer1::ElementWiseOperation::kSUM}, +#if TRT_VERSION_GE(7, 2) + {schema::PrimitiveType_Equal, nvinfer1::ElementWiseOperation::kEQUAL}, +#endif +}; +} // namespace + +int ElementWiseTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input tensort size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensort size: " << out_tensors.size(); + return RET_ERROR; + } + + // if constant tensor is scalar, it needs to know another input tensor's shape to broadcast + if ((in_tensors[0].Shape().size() > 0 && in_tensors[0].Shape()[0] == -1 && in_tensors[1].Shape().size() == 0) || + (in_tensors[1].Shape().size() > 0 && in_tensors[1].Shape()[0] == -1 && in_tensors[0].Shape().size() == 0)) { + MS_LOG(ERROR) << "invalid all input tensor shape unknown for: " << op_name_; + return RET_ERROR; + } + + bool is_not_bool_arith = NOT_BOOL_PRIM2NV_ELEM_OP.find(type_) != NOT_BOOL_PRIM2NV_ELEM_OP.end(); + if (is_not_bool_arith) { + if (std::any_of(in_tensors.begin(), in_tensors.end(), + [](const mindspore::MSTensor &tensor) { return tensor.DataType() == DataType::kNumberTypeBool; })) { + MS_LOG(ERROR) << "invalid input type for : " << op_name_; + return RET_ERROR; + } + element_wise_op_ = NOT_BOOL_PRIM2NV_ELEM_OP[type_]; + } + if (!is_not_bool_arith) { + // PrimitiveType_Eltwise + auto eltwise_op = op_primitive_->value_as_Eltwise(); + if (eltwise_op == nullptr) { + MS_LOG(ERROR) << "convert to Eltwise failed: " << op_name_; + return RET_ERROR; + } + schema::EltwiseMode eltwiseMode = eltwise_op->mode(); + std::map eltwise_modes = { + {schema::EltwiseMode::EltwiseMode_SUM, nvinfer1::ElementWiseOperation::kSUM}, + {schema::EltwiseMode::EltwiseMode_PROD, nvinfer1::ElementWiseOperation::kPROD}, + {schema::EltwiseMode::EltwiseMode_MAXIMUM, nvinfer1::ElementWiseOperation::kMAX}, + }; + auto iter_mode = eltwise_modes.find(eltwiseMode); + if (iter_mode != eltwise_modes.end()) { + element_wise_op_ = iter_mode->second; + } else { + MS_LOG(ERROR) << "unsupported type for ElementWise op" << op_name_; + return 
RET_ERROR; + } + } + return RET_OK; +} + +int ElementWiseTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "network or input tensor size is invalid"; + return RET_ERROR; + } + ITensorHelper x_input; + ITensorHelper y_input; + int ret = PreprocessInputTensors(ctx, &x_input, &y_input); + if (ret != RET_OK) { + MS_LOG(ERROR) << "PreprocessInputTensors failed."; + return RET_ERROR; + } + nvinfer1::IElementWiseLayer *cal_layer = + ctx->network()->addElementWise(*x_input.trt_tensor_, *y_input.trt_tensor_, element_wise_op_); + + if (cal_layer == nullptr) { + MS_LOG(ERROR) << "addElementWise failed for TensorRT."; + return RET_ERROR; + } + cal_layer->setName(op_name_.c_str()); + this->layer_ = cal_layer; + + nvinfer1::ITensor *op_out_tensor = cal_layer->getOutput(0); + if (op_out_tensor == nullptr) { + MS_LOG(ERROR) << "addElementWise out tensor is nullptr."; + return RET_ERROR; + } + // add activation + nvinfer1::ITensor *activation_out_tensor = AddActivation(ctx, op_out_tensor); + op_out_tensor = (activation_out_tensor == nullptr) ? op_out_tensor : activation_out_tensor; + + // scale and shift + if (type_ == schema::PrimitiveType_PowFusion) { + auto pow_op = op_primitive_->value_as_PowFusion(); + if (pow_op == nullptr) { + MS_LOG(ERROR) << "PowFusion convert failed."; + return RET_ERROR; + } + float scale = pow_op->scale(); + float shift = pow_op->shift(); + if (abs(scale - 1) >= 1.0e-05 || abs(shift - 0) >= 1.0e-05) { + MS_LOG(WARNING) << "deal with scale and shift for pow op"; + } + } +#if TRT_VERSION_GE(7, 2) + std::unordered_set bool_producer_ops = { + schema::PrimitiveType_Equal, schema::PrimitiveType_Greater, schema::PrimitiveType_Less}; + if (bool_producer_ops.find(type_) != bool_producer_ops.end()) { + auto cast_layer = ctx->network()->addIdentity(*op_out_tensor); + if (cast_layer == nullptr) { + MS_LOG(ERROR) << "create cast layer failed for: " << op_name_; + return RET_ERROR; + } + cast_layer->setOutputType(0, nvinfer1::DataType::kINT32); + op_out_tensor = cast_layer->getOutput(0); + MS_LOG(INFO) << "bool result cast to int32" << op_name_; + } +#endif + op_out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{op_out_tensor, x_input.format_, x_input.same_format_}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +int ElementWiseTensorRT::PreprocessInputTensors(TensorRTContext *ctx, ITensorHelper *x_input, ITensorHelper *y_input) { + int input_x_index = SameTensor(tensorrt_in_tensors_[0].trt_tensor_, &in_tensors_[0]) ? 0 : 1; + if (in_tensors_[0].Shape() == in_tensors_[1].Shape() && in_tensors_[0].IsConst()) { + input_x_index = 1; + } + + if (this->tensorrt_in_tensors_.size() != INPUT_SIZE2) { + int ret = AddConstTensor(ctx); + if (ret != RET_OK) { + return ret; + } + } + *x_input = tensorrt_in_tensors_[input_x_index]; + *y_input = tensorrt_in_tensors_[1 - input_x_index]; + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(*x_input); + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(*y_input); + + if (x_input->trt_tensor_->getDimensions().nbDims == DIMENSION_4D && x_input->format_ != y_input->format_) { + // when inputs format are different, change to NHWC + auto need_trans = x_input->format_ == Format::NCHW ? 
x_input : y_input; + nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *need_trans->trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_input_transpose2NHWC").c_str()); + need_trans->trt_tensor_ = transpose_layer->getOutput(0); + need_trans->format_ = Format::NHWC; + need_trans->same_format_ = true; + } + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(*x_input); + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(*y_input); + if (GetDimsVolume(x_input->trt_tensor_->getDimensions()) == GetDimsVolume(y_input->trt_tensor_->getDimensions()) && + x_input->trt_tensor_->getDimensions().nbDims != y_input->trt_tensor_->getDimensions().nbDims) { + bool x_large = x_input->trt_tensor_->getDimensions().nbDims > y_input->trt_tensor_->getDimensions().nbDims; + auto input_tensor = x_large ? y_input : x_input; + auto output_dim = x_large ? x_input->trt_tensor_->getDimensions() : y_input->trt_tensor_->getDimensions(); + auto reshape_layer = ctx->network()->addShuffle(*input_tensor->trt_tensor_); + if (reshape_layer == nullptr) { + MS_LOG(ERROR) << "add reshape failed for " << op_name_; + return RET_ERROR; + } + reshape_layer->setReshapeDimensions(output_dim); + input_tensor->trt_tensor_ = reshape_layer->getOutput(0); + } + return RET_OK; +} + +nvinfer1::ITensor *ElementWiseTensorRT::AddActivation(TensorRTContext *ctx, nvinfer1::ITensor *in_tensor) { + schema::ActivationType activation = schema::ActivationType::ActivationType_NO_ACTIVATION; + switch (type_) { + case schema::PrimitiveType_AddFusion: { + auto sum_op = op_primitive_->value_as_AddFusion(); + if (sum_op == nullptr) { + MS_LOG(ERROR) << "AddFusion convert failed."; + return nullptr; + } + activation = sum_op->activation_type(); + break; + } + case schema::PrimitiveType_DivFusion: { + auto div_op = op_primitive_->value_as_DivFusion(); + if (div_op == nullptr) { + MS_LOG(ERROR) << "DivFusion convert failed."; + return nullptr; + } + activation = div_op->activation_type(); + break; + } + case schema::PrimitiveType_SubFusion: { + auto sub_op = op_primitive_->value_as_SubFusion(); + if (sub_op == nullptr) { + MS_LOG(ERROR) << "SubFusion convert failed."; + return nullptr; + } + activation = sub_op->activation_type(); + break; + } + case schema::PrimitiveType_MulFusion: { + auto mul_op = op_primitive_->value_as_MulFusion(); + if (mul_op == nullptr) { + MS_LOG(ERROR) << "MulFusion convert failed."; + return nullptr; + } + activation = mul_op->activation_type(); + break; + } + default: + MS_LOG(DEBUG) << "no activation need for: " << op_name_; + } + nvinfer1::ITensor *activation_out_tensor = nullptr; + if (activation != schema::ActivationType::ActivationType_NO_ACTIVATION) { + auto activation_layer = ActivationTensorRT::AddActivation(ctx, activation, 0, 0, 0, in_tensor, device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for element wise failed"; + return nullptr; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + activation_out_tensor = activation_layer->getOutput(0); + } + return activation_out_tensor; +} +int ElementWiseTensorRT::AddConstTensor(TensorRTContext *ctx) { + int const_tensor_index = (in_tensors_[0].Data() != nullptr && in_tensors_[0].IsConst()) ? 
0 : 1; + nvinfer1::ITensor *constant_input = ConvertConstantTensorWithDims( + ctx, in_tensors_[const_tensor_index], in_tensors_[1 - const_tensor_index].Shape(), op_name_); + CHECK_NULL_RETURN(constant_input); + AddInnerInTensors(ITensorHelper{constant_input, tensorrt_in_tensors_[0].format_, true}); + return RET_OK; +} +bool ElementWiseTensorRT::SameTensor(nvinfer1::ITensor *trt_tensor, mindspore::MSTensor *ms_tensor) { + if (SameDims(trt_tensor->getDimensions(), ms_tensor->Shape())) { + return true; + } + if (ms_tensor->Shape().size() == DIMENSION_4D) { + // nhwc nchw + auto nchw_shape = NHWC2NCHW(ms_tensor->Shape()); + if (SameDims(trt_tensor->getDimensions(), nchw_shape)) { + return true; + } + } + auto str_name = strstr(trt_tensor->getName(), ms_tensor->Name().c_str()); + if (str_name != nullptr) { + return true; + } + str_name = strstr(ms_tensor->Name().c_str(), trt_tensor->getName()); + if (str_name != nullptr) { + return true; + } + return false; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_SubFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_DivFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_RealDiv, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PowFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AddFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MulFusion, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Eltwise, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Minimum, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Maximum, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_BiasAdd, ElementWiseTensorRT) +#if TRT_VERSION_GE(7, 2) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Equal, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Less, ElementWiseTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Greater, ElementWiseTensorRT) +#endif +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h new file mode 100644 index 00000000000..ece6aeaa62c --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/elementwise_tensorrt.h @@ -0,0 +1,50 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ElementWiseTensorRT : public TensorRTOp { + public: + ElementWiseTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ElementWiseTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ITensor *AddActivation(TensorRTContext *ctx, nvinfer1::ITensor *in_tensor); + + int AddConstTensor(TensorRTContext *ctx); + + bool SameTensor(nvinfer1::ITensor *trt_tensor, mindspore::MSTensor *ms_tensor); + + int PreprocessInputTensors(TensorRTContext *ctx, ITensorHelper *x_input, ITensorHelper *y_input); + + nvinfer1::ElementWiseOperation element_wise_op_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_ELEMENTWISE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.cc new file mode 100644 index 00000000000..2b817274d78 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.cc @@ -0,0 +1,96 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/equal_tensorrt.h" +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "NvInferRuntimeCommon.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(EqualPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int EqualTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int EqualTensorRT::AddInnerOp(TensorRTContext *ctx) { + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[1].trt_tensor_}; + auto plugin = std::make_shared(op_name_, device_id_); + nvinfer1::IPluginV2Layer *equal_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin); + if (equal_layer == nullptr) { + MS_LOG(ERROR) << "create equal layer failed for: " << op_name_; + return RET_ERROR; + } + layer_ = equal_layer; + nvinfer1::ITensor *equal_out = equal_layer->getOutput(0); + equal_layer->setName(op_name_.c_str()); + equal_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{equal_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} + +int EqualPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + nvinfer1::Dims input_dims = inputDesc[0].dims; + int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies()); + + if (inputDesc->type == nvinfer1::DataType::kINT32) { + const int *input1 = static_cast(inputs[0]); + const int *input2 = static_cast(inputs[1]); + int *output = static_cast(outputs[0]); + Equal(input1, input2, output, element_cnt, stream); + } else if (inputDesc->type == nvinfer1::DataType::kFLOAT) { + const float *input1 = static_cast(inputs[0]); + const float *input2 = static_cast(inputs[1]); + float *output = static_cast(outputs[0]); + Equal(input1, input2, output, element_cnt, stream); + } else { + MS_LOG(ERROR) << "unsupported equal data type"; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *EqualPlugin::clone() const noexcept { + auto *plugin = new EqualPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} +#if TRT_VERSION_LS(7, 2) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Equal, EqualTensorRT) +#endif +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.h new file mode 100644 index 00000000000..35e5d2259b5 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/equal_tensorrt.h @@ -0,0 +1,63 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/equal.cuh" + +namespace mindspore::lite { +constexpr char *EQUAL_PLUGIN_NAME{"EqualPlugin"}; +class EqualTensorRT : public TensorRTOp { + public: + EqualTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~EqualTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +class EqualPlugin : public TensorRTPlugin { + public: + EqualPlugin(const std::string name, uint32_t device_id) + : TensorRTPlugin(name, std::string(EQUAL_PLUGIN_NAME), device_id) {} + + EqualPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(EQUAL_PLUGIN_NAME)) {} + + EqualPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(EQUAL_PLUGIN_NAME)) {} + + EqualPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; +}; +class EqualPluginCreater : public TensorRTPluginCreater { + public: + EqualPluginCreater() : TensorRTPluginCreater(std::string(EQUAL_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_EQUAL_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.cc new file mode 100644 index 00000000000..a0ea8f40f6a --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.cc @@ -0,0 +1,106 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h"
+#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
+#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h"
+
+namespace mindspore::lite {
+constexpr int BIAS_INDEX = 2;
+
+int FullyConnectedTensorRT::IsSupport(const mindspore::schema::Primitive *primitive,
+                                      const std::vector<mindspore::MSTensor> &in_tensors,
+                                      const std::vector<mindspore::MSTensor> &out_tensors) {
+  if (!IsShapeKnown()) {
+    MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
+    return RET_ERROR;
+  }
+  if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) {
+    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int FullyConnectedTensorRT::AddInnerOp(TensorRTContext *ctx) {
+  auto primitive = op_primitive_->value_as_FullConnection();
+  CHECK_NULL_RETURN(primitive);
+  activation_ = primitive->activation_type();
+  int axis = primitive->axis();
+  if (axis < 0 || axis >= static_cast<int>(out_tensors_[0].Shape().size())) {
+    MS_LOG(ERROR) << "axis: " << axis << " is invalid for " << op_name_;
+    return RET_ERROR;
+  }
+  ITensorHelper fc_input;
+  auto ret = PreprocessInputs(ctx, &fc_input);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "PreprocessInputs failed for " << op_name_;
+    return ret;
+  }
+  // the kernel weight is whichever input tensor actually carries constant data
+  auto kernel_weight = ConvertWeight(in_tensors_[1].Data().get() == nullptr ? in_tensors_[0] : in_tensors_[1]);
+  nvinfer1::Weights bias_weight{};
+  if (primitive->has_bias()) {
+    bias_weight = ConvertWeight(in_tensors_[BIAS_INDEX]);
+  }
+  nvinfer1::IFullyConnectedLayer *fc_layer = ctx->network()->addFullyConnected(
+    *(fc_input.trt_tensor_), out_tensors_[0].Shape()[axis], kernel_weight, bias_weight);
+  if (fc_layer == nullptr) {
+    MS_LOG(ERROR) << "addFullyConnected failed for " << op_name_;
+    return RET_ERROR;
+  }
+  this->layer_ = fc_layer;
+  fc_layer->setName(op_name_.c_str());
+  nvinfer1::ITensor *out_tensor = fc_layer->getOutput(0);
+
+  if (out_tensor->getDimensions().nbDims != static_cast<int>(out_tensors_[0].Shape().size())) {
+    // reshape the fully connected output (which carries trailing 1x1 dims) back to the expected output rank
+    std::vector<int64_t> squeeze_dim(out_tensors_[0].Shape());
+    squeeze_dim[0] = out_tensor->getDimensions().d[0] == -1 ?
-1 : squeeze_dim[0]; + out_tensor = Reshape(ctx, out_tensor, squeeze_dim); + } + // add activation + if (activation_ != schema::ActivationType::ActivationType_NO_ACTIVATION) { + nvinfer1::ILayer *activation_layer = + ActivationTensorRT::AddActivation(ctx, activation_, 0, 0, 0, out_tensor, device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for matmul failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + out_tensor = activation_layer->getOutput(0); + } + + out_tensor->setName((op_name_ + "_output").c_str()); + MS_LOG(DEBUG) << "output " << GetTensorFormat(out_tensor); + this->AddInnerOutTensors(ITensorHelper{out_tensor, fc_input.format_}); + return RET_OK; +} + +int FullyConnectedTensorRT::PreprocessInputs(TensorRTContext *ctx, ITensorHelper *fc_input) { + auto ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], fc_input); + if (ret != RET_OK) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim failed for " << op_name_; + return ret; + } + auto origin_dims = fc_input->trt_tensor_->getDimensions(); + if (origin_dims.nbDims != DIMENSION_4D) { + std::vector expand_dim(origin_dims.d, origin_dims.d + origin_dims.nbDims); + for (int i = 0; i < DIMENSION_4D - origin_dims.nbDims; i++) { + expand_dim.push_back(1); + } + fc_input->trt_tensor_ = Reshape(ctx, fc_input->trt_tensor_, expand_dim); + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_FullConnection, FullyConnectedTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h new file mode 100644 index 00000000000..f98c543a565 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/fullyconnected_tensorrt.h @@ -0,0 +1,45 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_ + +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class FullyConnectedTensorRT : public TensorRTOp { + public: + FullyConnectedTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~FullyConnectedTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int PreprocessInputs(TensorRTContext *ctx, ITensorHelper *fc_input); + + schema::ActivationType activation_{schema::ActivationType::ActivationType_NO_ACTIVATION}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_FULLYCONNECTED_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.cc new file mode 100644 index 00000000000..7c9b5938b22 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.cc @@ -0,0 +1,139 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h" +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(GatherDPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int GatherDTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported gatherd input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "invalid gatherd input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid gatherd output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int GatherDTensorRT::AddInnerOp(TensorRTContext *ctx) { + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[2].trt_tensor_}; + auto dim_tensor = static_cast(in_tensors_[1].Data().get()); + if (dim_tensor == nullptr) { + MS_LOG(ERROR) << op_name_ << " gatherd dim_tensor is null!"; + return RET_ERROR; + } + size_t dim = static_cast(dim_tensor[0]); + + auto plugin = std::make_shared(op_name_, dim, device_id_); + nvinfer1::IPluginV2Layer *gatherd_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin); + if (gatherd_layer == nullptr) { + MS_LOG(ERROR) << "create gatherd failed for: " << op_name_; + return RET_ERROR; + } + nvinfer1::ITensor *gatherd_out = gatherd_layer->getOutput(0); + gatherd_layer->setName(op_name_.c_str()); + gatherd_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{gatherd_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + this->layer_ = gatherd_layer; + return RET_OK; +} + +int GatherDPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + nvinfer1::Dims input_dims = inputDesc[0].dims; + int dims = input_dims.nbDims; + if (axis_ < 0) { + axis_ += dims; + } + + if (inputDesc->type == nvinfer1::DataType::kINT32) { + auto input = static_cast(inputs[0]); + auto index = static_cast(inputs[1]); + auto output = static_cast(outputs[0]); + Reshape(inputDesc, outputDesc); + Gather(input, index, output, dim_before_axis_, dim_at_axis_input_, dim_at_axis_output_, dim_after_axis_, + stream, device_id_); + } else if (inputDesc->type == nvinfer1::DataType::kFLOAT) { + auto input = static_cast(inputs[0]); + auto index = static_cast(inputs[1]); + auto output = static_cast(outputs[0]); + Reshape(inputDesc, outputDesc); + Gather(input, index, output, dim_before_axis_, dim_at_axis_input_, dim_at_axis_output_, dim_after_axis_, + stream, device_id_); + } else { + MS_LOG(ERROR) << "unsupported data type gatherd" << layer_name_; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *GatherDPlugin::clone() const noexcept { + auto *plugin = new GatherDPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs GatherDPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs 
out_dims{}; + out_dims.nbDims = inputs[1].nbDims; + for (int i = 0; i < inputs[1].nbDims; i++) { + out_dims.d[i] = inputs[1].d[i]; + } + return out_dims; +} + +void GatherDPlugin::Reshape(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc) { + nvinfer1::Dims input_dims = inputDesc[0].dims; + nvinfer1::Dims output_dims = outputDesc[0].dims; + size_t dim_before_axis = 1; + for (size_t i = 0; i < IntToSize(axis_); i++) { + dim_before_axis *= output_dims.d[i]; + } + size_t dim_at_axis_input = input_dims.d[IntToSize(axis_)]; + size_t dim_at_axis_output = output_dims.d[IntToSize(axis_)]; + size_t dim_after_axis = 1; + for (size_t i = IntToSize(axis_) + 1; i < output_dims.nbDims; i++) { + dim_after_axis *= output_dims.d[i]; + } + + dim_before_axis_ = dim_before_axis; + dim_at_axis_input_ = dim_at_axis_input; + dim_at_axis_output_ = dim_at_axis_output; + dim_after_axis_ = dim_after_axis; + return; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_GatherD, GatherDTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h new file mode 100644 index 00000000000..714e6c89819 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_d_tensorrt.h @@ -0,0 +1,80 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/gather.cuh" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +constexpr char *GATHER_D_PLUGIN_NAME{"GatherDPluginCreater"}; +class GatherDTensorRT : public TensorRTOp { + public: + GatherDTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~GatherDTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +class GatherDPlugin : public TensorRTPlugin { + public: + GatherDPlugin(const std::string name, size_t dim, uint32_t device_id) + : TensorRTPlugin(name, std::string(GATHER_D_PLUGIN_NAME), device_id), axis_(dim) {} + + GatherDPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(GATHER_D_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + axis_ = static_cast(fields[0].data)[0]; + } + + GatherDPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(GATHER_D_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &axis_, sizeof(int)); + } + + GatherDPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + + private: + int axis_; + size_t dim_before_axis_; + size_t dim_at_axis_input_; + size_t dim_at_axis_output_; + size_t dim_after_axis_; + void Reshape(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc); +}; + +class GatherDPluginCreater : public TensorRTPluginCreater { + public: + GatherDPluginCreater() : TensorRTPluginCreater(std::string(GATHER_D_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_D_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.cc new file mode 100644 index 00000000000..38bc8bf1861 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.cc @@ -0,0 +1,108 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/gather_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +constexpr int AXIS_INDEX = 2; + +int GatherTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors[1].DataType() != DataType::kNumberTypeInt32) { + MS_LOG(ERROR) << "Gather indices only support Int32"; + return RET_ERROR; + } + if (in_tensors[AXIS_INDEX].ElementNum() == 1) { + MS_ASSERT(in_tensors[AXIS_INDEX].Data().get()); + axis_ = static_cast(in_tensors[AXIS_INDEX].Data().get())[0]; + } else { + MS_LOG(ERROR) << "TensorRT axis is attribute."; + return RET_ERROR; + } + return RET_OK; +} + +int GatherTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + if (tensorrt_in_tensors_.size() < INPUT_SIZE2 && in_tensors_.size() >= INPUT_SIZE2) { + int const_ms_tensor_index = in_tensors_[0].IsConst() ? 0 : 1; + auto const_input = ConvertConstantTensor(ctx, in_tensors_[const_ms_tensor_index], op_name_); + if (const_input == nullptr) { + MS_LOG(ERROR) << "add const input tensor failed for " << op_name_; + return RET_ERROR; + } + tensorrt_in_tensors_.push_back(ITensorHelper{const_input}); + } + + int indices_tensor_index = tensorrt_in_tensors_[0].trt_tensor_->getType() == nvinfer1::DataType::kINT32 ? 
0 : 1; + ITensorHelper gather_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - indices_tensor_index], &gather_input); + if (ret != RET_OK || gather_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim gather failed for " << op_name_; + return RET_ERROR; + } + ITensorHelper indices_tensor; + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[indices_tensor_index], &indices_tensor); + if (ret != RET_OK || indices_tensor.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim indices failed for " << op_name_; + return RET_ERROR; + } + + nvinfer1::IGatherLayer *gather_layer = + ctx->network()->addGather(*gather_input.trt_tensor_, *indices_tensor.trt_tensor_, axis_); + if (gather_layer == nullptr) { + MS_LOG(ERROR) << "addGather failed for TensorRT."; + return RET_ERROR; + } + + this->layer_ = gather_layer; + gather_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *op_output = gather_layer->getOutput(0); + // keep shape + if (in_tensors_[1].Shape().empty()) { + auto squeeze = ctx->network()->addShuffle(*op_output); + if (squeeze == nullptr) { + MS_LOG(ERROR) << "add output squeeze failed for " << op_name_; + return RET_ERROR; + } + squeeze->setName((op_name_ + "_squeeze_out").c_str()); + auto old_shape = ConvertMSShape(op_output->getDimensions()); + old_shape.erase(old_shape.begin() + axis_); + squeeze->setReshapeDimensions(ConvertCudaDims(old_shape)); + op_output = squeeze->getOutput(0); + } + op_output->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{op_output, gather_input.format_, gather_input.same_format_}); + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Gather, GatherTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.h new file mode 100644 index 00000000000..1bd44af8a1f --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/gather_tensorrt.h @@ -0,0 +1,42 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
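The squeeze branch at the end of GatherTensorRT::AddInnerOp above only fires when the MindSpore indices tensor is a scalar: because the indices are materialized as a one-element TensorRT tensor, the gather output keeps a length-1 dimension at the gather axis that MindSpore expects to be removed. A shape-level sketch of that bookkeeping (sizes are illustrative):

#include <cstdint>
#include <vector>

// e.g. data shape (4, 5, 6), scalar index, axis = 1:
// TensorRT gather output is (4, 1, 6); the squeeze shuffle restores (4, 6).
std::vector<int64_t> SqueezeGatherAxis(std::vector<int64_t> gather_out_shape, int axis) {
  gather_out_shape.erase(gather_out_shape.begin() + axis);
  return gather_out_shape;
}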
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class GatherTensorRT : public TensorRTOp { + public: + GatherTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~GatherTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int axis_{0}; + mindspore::MSTensor indices_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_GATHER_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.cc new file mode 100644 index 00000000000..8546a5143f7 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.cc @@ -0,0 +1,119 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <cuda_runtime.h>
+#include <numeric>
+#include <memory>
+#include <string>
+#include <vector>
+#include <functional>
+#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
+#include "NvInferRuntimeCommon.h"
+#include "src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h"
+#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh"
+
+namespace mindspore::lite {
+int LogicalNotTensorRT::IsSupport(const schema::Primitive *primitive,
+                                  const std::vector<mindspore::MSTensor> &in_tensors,
+                                  const std::vector<mindspore::MSTensor> &out_tensors) {
+  if (!IsShapeKnown()) {
+    MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
+    return RET_ERROR;
+  }
+  if (in_tensors.size() != 1) {
+    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
+    return RET_ERROR;
+  }
+  if (out_tensors.size() != 1) {
+    MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int LogicalNotTensorRT::AddInnerOp(TensorRTContext *ctx) {
+  if (ctx == nullptr || ctx->network() == nullptr || this->tensorrt_in_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "network or input tensor is invalid";
+    return RET_ERROR;
+  }
+  if (tensorrt_in_tensors_[0].trt_tensor_->getType() != nvinfer1::DataType::kINT32) {
+    auto cast_layer = ctx->network()->addIdentity(*tensorrt_in_tensors_[0].trt_tensor_);
+    if (cast_layer == nullptr) {
+      MS_LOG(ERROR) << "create cast layer failed for: " << op_name_;
+      return RET_ERROR;
+    }
+    cast_layer->setOutputType(0, nvinfer1::DataType::kINT32);
+    tensorrt_in_tensors_[0].trt_tensor_ = cast_layer->getOutput(0);
+  }
+  auto plugin = std::make_shared<LogicalNotPlugin>(op_name_, op_primitive_->value_type());
+  if (plugin == nullptr) {
+    MS_LOG(ERROR) << "create LogicalNotPlugin failed for " << op_name_;
+    return RET_ERROR;
+  }
+  nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_};
+  nvinfer1::IPluginV2Layer *logical_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin);
+  this->layer_ = logical_layer;
+  nvinfer1::ITensor *op_out_tensor = logical_layer->getOutput(0);
+  if (op_out_tensor == nullptr) {
+    MS_LOG(ERROR) << "logical not plugin out tensor is nullptr.";
+    return RET_ERROR;
+  }
+  op_out_tensor->setName((op_name_ + "_output").c_str());
+  this->AddInnerOutTensors(
+    ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_});
+  return RET_OK;
+}
+
+REGISTER_TENSORRT_PLUGIN(LogicalNotPluginCreater);
+template class TensorRTPluginCreater<LogicalNotPlugin>;
+template <class T>
+nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{};
+template <class T>
+std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_;
+
+int LogicalNotPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc,
+                              const void *const *inputs, void *const *outputs, void *workspace,
+                              cudaStream_t stream) noexcept {
+  return RunCudaLogical(inputDesc, inputs, outputs, stream);
+}
+
+int LogicalNotPlugin::RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs,
+                                     void *const *outputs, cudaStream_t stream) {
+  switch (primitive_type_) {
+    case (schema::PrimitiveType_LogicalNot): {
+      LogicalNot(static_cast<const int32_t *>(inputs[0]), static_cast<int32_t *>(outputs[0]),
+                 GetDimsVolume(inputDesc[0].dims), stream);
+      break;
+    }
+    default: {
+      MS_LOG(ERROR) << "invalid logical type: " << static_cast<int>(primitive_type_);
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
+nvinfer1::IPluginV2DynamicExt *LogicalNotPlugin::clone() const noexcept {
+  auto *plugin = new LogicalNotPlugin(*this);
+  plugin->setPluginNamespace(name_space_.c_str());
+  return plugin;
+}
+
+size_t
LogicalNotPlugin::getSerializationSize() const noexcept { return sizeof(schema::PrimitiveType); } + +void LogicalNotPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &primitive_type_, sizeof(schema::PrimitiveType)); +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalNot, LogicalNotTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h new file mode 100644 index 00000000000..09c2582bf22 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_not_tensorrt.h @@ -0,0 +1,78 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class LogicalNotTensorRT : public TensorRTOp { + public: + LogicalNotTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~LogicalNotTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +constexpr char *LOGICAL_NOT_PLUGIN_NAME{"LogicalNotPlugin"}; +class LogicalNotPlugin : public TensorRTPlugin { + public: + LogicalNotPlugin(const std::string name, schema::PrimitiveType primitive_type) + : TensorRTPlugin(name, std::string(LOGICAL_NOT_PLUGIN_NAME)), primitive_type_(primitive_type) {} + + LogicalNotPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(LOGICAL_NOT_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + primitive_type_ = static_cast(fields[0].data)[0]; + } + + LogicalNotPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(LOGICAL_NOT_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &primitive_type_, sizeof(schema::PrimitiveType)); + } + + LogicalNotPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + int RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, 
void *const *outputs, cudaStream_t stream);
+  const std::string layer_name_;
+  std::string name_space_;
+  schema::PrimitiveType primitive_type_;
+};
+class LogicalNotPluginCreater : public TensorRTPluginCreater<LogicalNotPlugin> {
+ public:
+  LogicalNotPluginCreater() : TensorRTPluginCreater(std::string(LOGICAL_NOT_PLUGIN_NAME)) {}
+};
+} // namespace mindspore::lite
+#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_NOT_TENSORRT_H_
diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.cc
new file mode 100644
index 00000000000..653c9431df9
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.cc
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_runtime.h>
+#include <numeric>
+#include <memory>
+#include <string>
+#include <vector>
+#include <functional>
+#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"
+#include "NvInferRuntimeCommon.h"
+#include "src/runtime/delegate/tensorrt/op/logical_tensorrt.h"
+#include "src/runtime/delegate/tensorrt/cuda_impl/logical.cuh"
+
+namespace mindspore::lite {
+int LogicalTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors,
+                               const std::vector<mindspore::MSTensor> &out_tensors) {
+  if (!IsShapeKnown()) {
+    MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_;
+    return RET_ERROR;
+  }
+  if (in_tensors.size() != INPUT_SIZE2) {
+    MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size();
+    return RET_ERROR;
+  }
+  if (out_tensors.size() != 1) {
+    MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int LogicalTensorRT::AddInnerOp(TensorRTContext *ctx) {
+  if (ctx == nullptr || ctx->network() == nullptr) {
+    MS_LOG(ERROR) << "network or input tensor is invalid";
+    return RET_ERROR;
+  }
+  for (size_t i = 0; i != tensorrt_in_tensors_.size(); ++i) {
+    if (tensorrt_in_tensors_[i].trt_tensor_->getType() != nvinfer1::DataType::kINT32) {
+      auto cast_layer = ctx->network()->addIdentity(*tensorrt_in_tensors_[i].trt_tensor_);
+      if (cast_layer == nullptr) {
+        MS_LOG(ERROR) << "create cast layer failed for: " << op_name_;
+        return RET_ERROR;
+      }
+      cast_layer->setOutputType(0, nvinfer1::DataType::kINT32);
+      tensorrt_in_tensors_[i].trt_tensor_ = cast_layer->getOutput(0);
+    }
+  }
+  auto plugin = std::make_shared<LogicalPlugin>(op_name_, op_primitive_->value_type());
+  if (plugin == nullptr) {
+    MS_LOG(ERROR) << "create LogicalPlugin failed for " << op_name_;
+    return RET_ERROR;
+  }
+  nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, tensorrt_in_tensors_[1].trt_tensor_};
+  nvinfer1::IPluginV2Layer *logical_layer = ctx->network()->addPluginV2(inputTensors, 2, *plugin);
+  this->layer_ = logical_layer;
+  nvinfer1::ITensor *op_out_tensor = logical_layer->getOutput(0);
+  if (op_out_tensor == nullptr) {
+    MS_LOG(ERROR) << "logical plugin out tensor is nullptr.";
+    return
RET_ERROR; + } + op_out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} + +REGISTER_TENSORRT_PLUGIN(LogicalPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int LogicalPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + return RunCudaLogical(inputDesc, inputs, outputs, stream); +} + +int LogicalPlugin::RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, + void *const *outputs, cudaStream_t stream) { + switch (primitive_type_) { + case (schema::PrimitiveType_LogicalAnd): { + LogicalAnd(static_cast(inputs[0]), static_cast(inputs[1]), + static_cast(outputs[0]), GetDimsVolume(inputDesc[0].dims), stream); + break; + } + case (schema::PrimitiveType_LogicalOr): { + LogicalOr(static_cast(inputs[0]), static_cast(inputs[1]), + static_cast(outputs[0]), GetDimsVolume(inputDesc[0].dims), stream); + break; + } + default: { + MS_LOG(ERROR) << "invalid logical type: " << static_cast(primitive_type_); + return RET_ERROR; + } + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *LogicalPlugin::clone() const noexcept { + auto *plugin = new LogicalPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +size_t LogicalPlugin::getSerializationSize() const noexcept { return sizeof(schema::PrimitiveType); } + +void LogicalPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &primitive_type_, sizeof(schema::PrimitiveType)); +} + +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalOr, LogicalTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalAnd, LogicalTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.h new file mode 100644 index 00000000000..9ec52e43bc0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/logical_tensorrt.h @@ -0,0 +1,78 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
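Since both operands are cast to int32 before the plugin runs, the LogicalAnd/LogicalOr device kernels in cuda_impl/logical.cu only need elementwise integer logic. A host-side reference of the expected semantics (a sketch, not the actual CUDA implementation):

#include <cstddef>
#include <cstdint>

// Elementwise reference for the semantics the LogicalAnd / LogicalOr kernels are expected to follow.
void LogicalAndReference(const int32_t *a, const int32_t *b, int32_t *out, size_t element_cnt) {
  for (size_t i = 0; i < element_cnt; ++i) {
    out[i] = (a[i] != 0 && b[i] != 0) ? 1 : 0;
  }
}

void LogicalOrReference(const int32_t *a, const int32_t *b, int32_t *out, size_t element_cnt) {
  for (size_t i = 0; i < element_cnt; ++i) {
    out[i] = (a[i] != 0 || b[i] != 0) ? 1 : 0;
  }
}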
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" + +namespace mindspore::lite { +class LogicalTensorRT : public TensorRTOp { + public: + LogicalTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~LogicalTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +constexpr char *LOGICAL_PLUGIN_NAME{"LogicalPlugin"}; +class LogicalPlugin : public TensorRTPlugin { + public: + LogicalPlugin(const std::string name, schema::PrimitiveType primitive_type) + : TensorRTPlugin(name, std::string(LOGICAL_PLUGIN_NAME)), primitive_type_(primitive_type) {} + + LogicalPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(LOGICAL_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + primitive_type_ = static_cast(fields[0].data)[0]; + } + + LogicalPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(LOGICAL_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &primitive_type_, sizeof(schema::PrimitiveType)); + } + + LogicalPlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + int RunCudaLogical(const nvinfer1::PluginTensorDesc *inputDesc, const void *const *inputs, void *const *outputs, + cudaStream_t stream); + const std::string layer_name_; + std::string name_space_; + schema::PrimitiveType primitive_type_; +}; +class LogicalPluginCreater : public TensorRTPluginCreater { + public: + LogicalPluginCreater() : TensorRTPluginCreater(std::string(LOGICAL_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LOGICAL_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.cc new file mode 100644 index 00000000000..3f0c80dc764 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.cc @@ -0,0 +1,493 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
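LogicalPlugin above follows the same creator pattern as the other plugins in this patch: REGISTER_TENSORRT_PLUGIN exposes LogicalPluginCreater to TensorRT, which then calls back into it along two paths. A minimal sketch of those call paths using only stock nvinfer1::IPluginCreator entry points (the wrapper functions are illustrative, not part of this patch):

#include <cstddef>
#include "NvInferRuntimeCommon.h"

// Build path: the PluginFieldCollection carries the schema::PrimitiveType field,
// which the (name, fc) constructor of LogicalPlugin reads.
nvinfer1::IPluginV2 *CreateLogicalFromFields(nvinfer1::IPluginCreator *creator, const char *name,
                                             const nvinfer1::PluginFieldCollection *fc) {
  return creator->createPlugin(name, fc);
}

// Engine-deserialization path: the blob written by LogicalPlugin::serialize()
// is handed back to the (name, serialData, serialLength) constructor.
nvinfer1::IPluginV2 *CreateLogicalFromBlob(nvinfer1::IPluginCreator *creator, const char *name, const void *data,
                                           size_t length) {
  return creator->deserializePlugin(name, data, length);
}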
+ */ + +#include "src/runtime/delegate/tensorrt/op/lstm_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" + +namespace mindspore::lite { +int LSTMTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { +#if TRT_VERSION_GE(7, 0) + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() < INPUT_TENSOR_SIZE) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != OUTPUT_TENSOR_SIZE) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT]; + hidden_init_name_ = hidden_in_init.Name() + "_hidden_init"; + mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT]; + cell_init_name_ = cell_in_init.Name() + "_cell_init"; + + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +#else + MS_LOG(WARNING) << "low TensorRT version don't support LSTM op, please upgrade TensorRT version to 7 or higher"; + return RET_ERROR; +#endif +} + +int LSTMTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + int input_data_dims_cnt = tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims; + if (input_data_dims_cnt != DIMENSION_3D) { + MS_LOG(ERROR) << "invalid input data shape dims for " << op_name_; + return RET_ERROR; + } + network_ = ctx->network(); + int ret = PreProcess(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "PreProcess for " << op_name_; + return ret; + } + + ret = AddLSTMLayers(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "AddLSTMLayers for " << op_name_; + return RET_ERROR; + } + + if (op_data_out_ == nullptr) { + MS_LOG(ERROR) << "layers final output tensor is invalid for " << op_name_; + return RET_ERROR; + } + op_data_out_->setName((op_name_ + "_output").c_str()); + MS_LOG(DEBUG) << "lstm op_data_out_ " << GetTensorFormat(op_data_out_); + MS_LOG(DEBUG) << "lstm op_hidden_out_ " << GetTensorFormat(op_hidden_out_); + MS_LOG(DEBUG) << "lstm op_cell_out_ " << GetTensorFormat(op_cell_out_); + this->AddInnerOutTensors(ITensorHelper{op_data_out_}); + this->AddInnerOutTensors(ITensorHelper{op_hidden_out_}); + this->AddInnerOutTensors(ITensorHelper{op_cell_out_}); + return RET_OK; +} + +int LSTMTensorRT::PreProcess() { + auto ms_input_shape = in_tensors_[0].Shape(); + params_.sequence_size_ = ms_input_shape[0]; + params_.batch_size_ = ms_input_shape[1]; + params_.input_data_size_ = ms_input_shape[INPUT_SIZE_INDEX]; + if (params_.batch_size_ != 1) { + MS_LOG(WARNING) << op_name_ << " lstm has batchsize " << params_.batch_size_ << ", needs further verify"; + } + // ms: 0 sequence size, 1 batch size, 2 input size -> tensorrt: 0 batch size, 1 sequence size, 2 input size + auto transpose_in_layer = network_->addShuffle(*tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_in_layer == nullptr) { + MS_LOG(ERROR) << "create transpose_in_layer failed for " << op_name_; + return RET_ERROR; + } + nvinfer1::Permutation transpose_perm{{1, 0, INPUT_SIZE_INDEX}}; + transpose_in_layer->setFirstTranspose(transpose_perm); + transpose_in_layer->setName((op_name_ + "transpose_in").c_str()); + input_data_ = transpose_in_layer->getOutput(0); + 
MS_LOG(DEBUG) << "lstm input " << GetTensorFormat(input_data_); + + auto lstm_op = op_primitive_->value_as_LSTM(); + params_.layer_count_ = lstm_op->num_layers() == 0 ? 1 : lstm_op->num_layers(); + params_.hidden_size_ = lstm_op->hidden_size(); + params_.directional_cnt_ = lstm_op->bidirectional() ? BIDIRECTIONAL : 1; + params_.data_type_ = ConvertDataType(in_tensors_[1].DataType()); + return RET_OK; +} + +int LSTMTensorRT::AddLSTMLayers() { + mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT]; + mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT]; + + nvinfer1::ITensor *data_out{nullptr}; + nvinfer1::ITensor *hidden_init = network_->addInput( + hidden_init_name_.c_str(), nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(params_.layer_count_ * params_.directional_cnt_, params_.batch_size_, params_.hidden_size_)); + if (hidden_init == nullptr) { + MS_LOG(ERROR) << "add hidden_init input tensor failed for " << op_name_; + return RET_ERROR; + } + op_binding_tensor_.push_back(BindingHelper{hidden_init_name_, hidden_in_init.MutableData(), + nvinfer1::DataType::kFLOAT, hidden_in_init.DataSize()}); + nvinfer1::ITensor *cell_init = network_->addInput( + cell_init_name_.c_str(), nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(params_.layer_count_ * params_.directional_cnt_, params_.batch_size_, params_.hidden_size_)); + if (cell_init == nullptr) { + MS_LOG(ERROR) << "add cell_init input tensor failed for " << op_name_; + return RET_ERROR; + } + op_binding_tensor_.push_back( + BindingHelper{cell_init_name_, cell_in_init.MutableData(), nvinfer1::DataType::kFLOAT, cell_in_init.DataSize()}); + + sequence_size_input_ = + network_->addInput((op_name_ + "_seq_input").c_str(), nvinfer1::DataType::kINT32, nvinfer1::Dims{}); + if (sequence_size_input_ == nullptr) { + MS_LOG(ERROR) << "add sequence_size_input_ input tensor failed for " << op_name_; + return RET_ERROR; + } + op_binding_tensor_.push_back( + BindingHelper{(op_name_ + "_seq_input"), ¶ms_.sequence_size_, nvinfer1::DataType::kINT32, sizeof(int)}); + + nvinfer1::ITensor *max_sequence_size = + network_->addConstant(nvinfer1::Dims{}, nvinfer1::Weights{nvinfer1::DataType::kINT32, ¶ms_.sequence_size_, 1}) + ->getOutput(0); + if (max_sequence_size == nullptr) { + MS_LOG(ERROR) << "add max_sequence_size constant tensor failed for " << op_name_; + return RET_ERROR; + } + LstmState next_state{input_data_, nullptr, nullptr}; // init states + std::vector hidden_outputs; + std::vector cell_outputs; + int input_weight_offset = 0; + int state_weight_offset = 0; + int bias_offset = 0; + + if (params_.layer_count_ != 1) { + MS_LOG(WARNING) << op_name_ << " needs verify for layer cnt: " << params_.layer_count_; + } + for (int i = 0; i < params_.layer_count_; i++) { + LstmState layer_input_states[BIDIRECTIONAL]; + LstmWeights layer_weights[BIDIRECTIONAL]; + layer_weights[0].max_seq_size_ = max_sequence_size; + int ret = ParseLSTMCellInputs(i, hidden_init, cell_init, layer_input_states, &input_weight_offset, + &state_weight_offset, &bias_offset, layer_weights, next_state); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ParseLSTMCellInputs failed for " << op_name_; + return RET_ERROR; + } + data_out = AddLSTMCell(layer_input_states, layer_weights, &next_state); + hidden_outputs.push_back(next_state.hidden_); + cell_outputs.push_back(next_state.cell_); + if (data_out == nullptr || next_state.hidden_ == nullptr || next_state.cell_ == nullptr) { + MS_LOG(ERROR) << "AddLSTMCell failed for " << op_name_; + return RET_ERROR; + } + } + + 
op_hidden_out_ = ConcateAll(hidden_outputs); + if (op_hidden_out_ == nullptr) { + MS_LOG(ERROR) << "concat hidden output failed for " << op_name_; + return RET_ERROR; + } + op_hidden_out_->setName(out_tensors_[OUTPUT_HIDDEN_INDEX].Name().c_str()); + op_cell_out_ = ConcateAll(cell_outputs); + if (op_cell_out_ == nullptr) { + MS_LOG(ERROR) << "concat cell output failed for " << op_name_; + return RET_ERROR; + } + op_cell_out_->setName(out_tensors_[OUTPUT_CELL_INDEX].Name().c_str()); + op_data_out_ = data_out; + return RET_OK; +} + +int LSTMTensorRT::ParseLSTMCellInputs(int layer_index, nvinfer1::ITensor *hidden_init, nvinfer1::ITensor *cell_init, + LstmState *layer_input_states, int *input_weight_offset, int *state_weight_offset, + int *bias_offset, LstmWeights *layer_weights, const LstmState &next_state) { + nvinfer1::Dims2 dim_input_weight(LSTM_GATE_NUM * params_.hidden_size_, params_.input_data_size_); + nvinfer1::Dims2 dim_state_weight(LSTM_GATE_NUM * params_.hidden_size_, params_.hidden_size_); + nvinfer1::Dims dim_bias{1, {LSTM_GATE_NUM * params_.hidden_size_}}; + + mindspore::MSTensor &input_weight = in_tensors_[INPUT_WEIGHT]; + mindspore::MSTensor &state_weight = in_tensors_[STATE_WEIGHT]; + mindspore::MSTensor &bias = in_tensors_[BIAS]; + + nvinfer1::Dims dimW = layer_index == 0 ? dim_input_weight : dim_state_weight; + + for (int direction_index = 0; direction_index < params_.directional_cnt_; direction_index++) { + nvinfer1::ITensor *index = + network_ + ->addConstant(nvinfer1::Dims{}, + nvinfer1::Weights{nvinfer1::DataType::kINT32, + &INDICES[layer_index * params_.directional_cnt_ + direction_index], 1}) + ->getOutput(0); + MS_ASSERT(index); + layer_input_states[direction_index].data_ = next_state.data_; + layer_input_states[direction_index].hidden_ = network_->addGather(*hidden_init, *index, 0)->getOutput(0); + layer_input_states[direction_index].cell_ = network_->addGather(*cell_init, *index, 0)->getOutput(0); + MS_ASSERT(layer_input_states[direction_index].hidden_); + MS_ASSERT(layer_input_states[direction_index].cell_); + + // weight order: input, output, forget, cell + if (params_.data_type_ != nvinfer1::DataType::kFLOAT) { + MS_LOG(WARNING) << "more data type need to be done"; + return RET_ERROR; + } + const float *input_weight_ptr = static_cast(input_weight.Data().get()); + const float *state_weight_ptr = static_cast(state_weight.Data().get()); + const float *bias_ptr = static_cast(bias.Data().get()); + nvinfer1::Weights slice_input_weight{params_.data_type_, input_weight_ptr + *input_weight_offset, + GetDimsVolume(dimW)}; + (*input_weight_offset) += slice_input_weight.count; + nvinfer1::Weights slice_state_weight{params_.data_type_, state_weight_ptr + *state_weight_offset, + GetDimsVolume(dim_state_weight)}; + (*state_weight_offset) += slice_state_weight.count; + layer_weights[direction_index].input_weights_ = network_->addConstant(dimW, slice_input_weight)->getOutput(0); + layer_weights[direction_index].state_weights_ = + network_->addConstant(dim_state_weight, slice_state_weight)->getOutput(0); + MS_ASSERT(layer_weights[direction_index].input_weights_); + MS_ASSERT(layer_weights[direction_index].state_weights_); + + // bias + nvinfer1::Weights slice_input_bias{params_.data_type_, bias_ptr + *bias_offset, GetDimsVolume(dim_bias)}; + (*bias_offset) += slice_input_bias.count; + nvinfer1::Weights slice_state_bias{params_.data_type_, bias_ptr + *bias_offset, GetDimsVolume(dim_bias)}; + (*bias_offset) += slice_state_bias.count; + layer_weights[direction_index].input_bias_ = 
network_->addConstant(dim_bias, slice_input_bias)->getOutput(0); + layer_weights[direction_index].state_bias_ = network_->addConstant(dim_bias, slice_state_bias)->getOutput(0); + MS_ASSERT(layer_weights[direction_index].input_bias_); + MS_ASSERT(layer_weights[direction_index].state_bias_); + } + if (params_.directional_cnt_ == BIDIRECTIONAL) { + layer_weights[1].max_seq_size_ = layer_weights[0].max_seq_size_; + } + return RET_OK; +} + +nvinfer1::ITensor *LSTMTensorRT::Reshape(nvinfer1::ITensor *tensor, nvinfer1::Dims dims) { + nvinfer1::IShuffleLayer *shuffle = network_->addShuffle(*tensor); + shuffle->setReshapeDimensions(dims); + return shuffle->getOutput(0); +} + +nvinfer1::ITensor *LSTMTensorRT::ConcateAll(std::vector all_tensor, int axis) { + if (all_tensor.size() == 1) { + return all_tensor[0]; + } + nvinfer1::IConcatenationLayer *concat = network_->addConcatenation(all_tensor.data(), all_tensor.size()); + if (concat == nullptr) { + MS_LOG(ERROR) << "addConcatenation failed for " << op_name_; + return nullptr; + } + if (axis >= all_tensor[0]->getDimensions().nbDims) { + MS_LOG(ERROR) << op_name_ << " concat axis is " << axis << ", larger than tensor dims " + << all_tensor[0]->getDimensions().nbDims; + return nullptr; + } + concat->setAxis(axis); + return concat->getOutput(0); +} + +nvinfer1::ITensor *LSTMTensorRT::AddLSTMCell(const LstmState *layer_input_states, const LstmWeights *layer_weights, + LstmState *next_state) { + nvinfer1::ITensor *backward_output = nullptr; + nvinfer1::ITensor *backward_hidden_out = nullptr; + nvinfer1::ITensor *backward_cell_out = nullptr; + nvinfer1::ITensor *forward_hidden_out = nullptr; + nvinfer1::ITensor *forward_cell_out = nullptr; + + nvinfer1::ITensor *forward_output = + AddLSTMCalculation(layer_input_states[0], layer_weights[0], &forward_hidden_out, &forward_cell_out); + if (params_.directional_cnt_ == BIDIRECTIONAL) { + backward_output = + AddLSTMCalculation(layer_input_states[1], layer_weights[1], &backward_hidden_out, &backward_cell_out, true); + } + + // concate forward and backward + nvinfer1::ITensor *output_tensor = forward_output; + nvinfer1::ITensor *cell_out = forward_cell_out; + nvinfer1::ITensor *hidden_out = forward_hidden_out; + if (backward_output != nullptr && backward_hidden_out != nullptr && backward_cell_out != nullptr) { + nvinfer1::ITensor *output_concat_input[BIDIRECTIONAL] = {forward_output, backward_output}; + auto ouput_out_layer = network_->addConcatenation(output_concat_input, BIDIRECTIONAL); + this->layer_ = ouput_out_layer; + if (ouput_out_layer == nullptr) { + MS_LOG(ERROR) << "create one loop output concat failed for " << op_name_; + return nullptr; + } + ouput_out_layer->setAxis(1); // ms: 0 sequence size, 1 layer * direction, 2 batchsize, 3 hidden + output_tensor = ouput_out_layer->getOutput(0); + + nvinfer1::ITensor *hidden_concat_input[BIDIRECTIONAL] = {forward_hidden_out, backward_hidden_out}; + auto hidden_out_layer = network_->addConcatenation(hidden_concat_input, BIDIRECTIONAL); + hidden_out_layer->setAxis(0); + hidden_out = hidden_out_layer->getOutput(0); + + nvinfer1::ITensor *cell_concat_input[BIDIRECTIONAL] = {forward_cell_out, backward_cell_out}; + auto cell_out_layer = network_->addConcatenation(cell_concat_input, BIDIRECTIONAL); + cell_out_layer->setAxis(0); + cell_out = cell_out_layer->getOutput(0); + } + if (hidden_out == nullptr || cell_out == nullptr) { + MS_LOG(ERROR) << "get one loop hidden_out and cell_out failed for " << op_name_; + return nullptr; + } + *next_state = LstmState{output_tensor, 
hidden_out, cell_out}; + return output_tensor; +} +nvinfer1::ITensor *LSTMTensorRT::AddLSTMCalculation(const LstmState &input_state, const LstmWeights &lstm_weights, + nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out, + bool is_backward) { + std::vector all_batch_outputs; + std::vector all_batch_hidden; + std::vector all_batch_cell; + for (int batch_index = 0; batch_index < params_.batch_size_; batch_index++) { + LstmState one_batch_input_state; + nvinfer1::ITensor *batch_index_tensor = + network_->addConstant(nvinfer1::Dims{}, nvinfer1::Weights{nvinfer1::DataType::kINT32, &INDICES[batch_index], 1}) + ->getOutput(0); + one_batch_input_state.data_ = network_->addGather(*input_state.data_, *batch_index_tensor, 0)->getOutput(0); + one_batch_input_state.hidden_ = network_->addGather(*input_state.hidden_, *batch_index_tensor, 0)->getOutput(0); + one_batch_input_state.cell_ = network_->addGather(*input_state.cell_, *batch_index_tensor, 0)->getOutput(0); + nvinfer1::ITensor *one_batch_hidden = nullptr; + nvinfer1::ITensor *one_batch_cell = nullptr; + nvinfer1::ITensor *one_batch_output = + AddLSTMOneLoop(one_batch_input_state, lstm_weights, &one_batch_hidden, &one_batch_cell, is_backward); + if (one_batch_output == nullptr || one_batch_cell == nullptr || one_batch_hidden == nullptr) { + MS_LOG(ERROR) << "AddLSTMOneLoop failed for " << op_name_ << " at batch index " << batch_index; + return nullptr; + } + all_batch_outputs.push_back(one_batch_output); + all_batch_hidden.push_back(one_batch_hidden); + all_batch_cell.push_back(one_batch_cell); + } + *hidden_out = ConcateAll(all_batch_hidden, 1); + *cell_out = ConcateAll(all_batch_cell, 1); + return ConcateAll(all_batch_outputs, BATCH_SIZE_INDEX); +} + +nvinfer1::ITensor *LSTMTensorRT::AddLSTMOneLoop(const LstmState &input_state, const LstmWeights &lstm_weights, + nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out, + bool is_backward) { +#if TRT_VERSION_GE(7, 0) + nvinfer1::ILoop *sequence_loop = network_->addLoop(); + if (sequence_loop == nullptr) { + MS_LOG(ERROR) << "add sequence_loop layer failed for " << op_name_; + return nullptr; + } + std::string loop_name = op_name_ + "_loop" + (is_backward ? 
"_backward" : "_forward"); + sequence_loop->setName(loop_name.c_str()); + sequence_loop->addTripLimit(*sequence_size_input_, nvinfer1::TripLimit::kCOUNT); + nvinfer1::ITensor *input = sequence_loop->addIterator(*input_state.data_, 0, is_backward)->getOutput(0); + + nvinfer1::ILayer *hidden_mid = sequence_loop->addRecurrence(*input_state.hidden_); + if (hidden_mid == nullptr) { + MS_LOG(ERROR) << "add hidden layer failed for " << op_name_; + return nullptr; + } + nvinfer1::ILayer *cell_mid = sequence_loop->addRecurrence(*input_state.cell_); + if (cell_mid == nullptr) { + MS_LOG(ERROR) << "add cell layer failed for " << op_name_; + return nullptr; + } + + nvinfer1::ITensor *input_matmul = + network_ + ->addMatrixMultiply(*input, nvinfer1::MatrixOperation::kVECTOR, *lstm_weights.input_weights_, + nvinfer1::MatrixOperation::kTRANSPOSE) + ->getOutput(0); + + nvinfer1::ITensor *hidden_matmul = + network_ + ->addMatrixMultiply(*hidden_mid->getOutput(0), nvinfer1::MatrixOperation::kVECTOR, *lstm_weights.state_weights_, + nvinfer1::MatrixOperation::kTRANSPOSE) + ->getOutput(0); + + nvinfer1::ITensor *weights_add = + network_->addElementWise(*input_matmul, *hidden_matmul, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + + nvinfer1::ITensor *bias = + network_->addElementWise(*lstm_weights.input_bias_, *lstm_weights.state_bias_, nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + + nvinfer1::ITensor *gates_calculate = + network_->addElementWise(*weights_add, *bias, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + + const auto isolateGate = [&](nvinfer1::ITensor &gates, int gateIndex) -> nvinfer1::ITensor * { + nvinfer1::ISliceLayer *slice = + network_->addSlice(gates, nvinfer1::Dims{1, {gateIndex * params_.hidden_size_}}, + nvinfer1::Dims{1, {params_.hidden_size_}}, nvinfer1::Dims{1, {1}}); + return Reshape(slice->getOutput(0), nvinfer1::Dims{1, {params_.hidden_size_}}); + }; + // weight order: input, output, forget, cell + nvinfer1::ITensor *i = + network_->addActivation(*isolateGate(*gates_calculate, 0), nvinfer1::ActivationType::kSIGMOID)->getOutput(0); + + nvinfer1::ITensor *o = + network_->addActivation(*isolateGate(*gates_calculate, 1), nvinfer1::ActivationType::kSIGMOID)->getOutput(0); + + nvinfer1::ITensor *f = + network_->addActivation(*isolateGate(*gates_calculate, FORGET_GATE), nvinfer1::ActivationType::kSIGMOID) + ->getOutput(0); + + nvinfer1::ITensor *c = + network_->addActivation(*isolateGate(*gates_calculate, CELL_GATE), nvinfer1::ActivationType::kTANH)->getOutput(0); + + nvinfer1::ITensor *C = + network_ + ->addElementWise( + *network_->addElementWise(*f, *cell_mid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD)->getOutput(0), + *network_->addElementWise(*i, *c, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + nvinfer1::ITensor *H = + network_ + ->addElementWise(*o, *network_->addActivation(*C, nvinfer1::ActivationType::kTANH)->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + + // Recurrent backedge input for hidden and cell. + cell_mid->setInput(1, *C); + hidden_mid->setInput(1, *H); + // outputs + nvinfer1::LoopOutput output_mode = is_backward ? 
nvinfer1::LoopOutput::kREVERSE : nvinfer1::LoopOutput::kCONCATENATE; + nvinfer1::ILoopOutputLayer *output_layer = sequence_loop->addLoopOutput(*H, output_mode); + output_layer->setInput(1, *lstm_weights.max_seq_size_); + *hidden_out = + Reshape(sequence_loop->addLoopOutput(*hidden_mid->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0), + nvinfer1::Dims3(1, 1, params_.hidden_size_)); + *cell_out = + Reshape(sequence_loop->addLoopOutput(*cell_mid->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0), + nvinfer1::Dims3(1, 1, params_.hidden_size_)); + return Reshape(output_layer->getOutput(0), nvinfer1::Dims4(params_.sequence_size_, 1, 1, params_.hidden_size_)); +#else + MS_LOG(ERROR) << "low TensorRT version don't support LSTM op, please upgrade TensorRT version to 7 or higher"; + return nullptr; +#endif +} + +int LSTMTensorRT::Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine) { + if (op_binding_tensor_.size() == 0) { + MS_LOG(DEBUG) << "unsing serialized engine, add input tensor for " << op_name_; + mindspore::MSTensor &hidden_in_init = in_tensors_[HIDDEN_IN_TENSOR_INIT]; + mindspore::MSTensor &cell_in_init = in_tensors_[CELL_IN_TENSOR_INIT]; + + op_binding_tensor_.push_back(BindingHelper{hidden_init_name_, hidden_in_init.MutableData(), + nvinfer1::DataType::kFLOAT, hidden_in_init.DataSize()}); + op_binding_tensor_.push_back( + BindingHelper{cell_init_name_, cell_in_init.MutableData(), nvinfer1::DataType::kFLOAT, cell_in_init.DataSize()}); + params_.sequence_size_ = in_tensors_[0].Shape()[0]; + op_binding_tensor_.push_back( + BindingHelper{(op_name_ + "_seq_input"), ¶ms_.sequence_size_, nvinfer1::DataType::kINT32, sizeof(int)}); + } + for (auto tensor : op_binding_tensor_) { + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor.name_, tensor.size_, tensor.data_type_); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "malloc for inputs tensor device memory failed " << tensor.name_; + return RET_ERROR; + } + int index = engine->getBindingIndex(tensor.name_.c_str()); + network_tensor_bindings[index] = device_ptr; + runtime_->GetAllocator()->SyncMemInHostAndDevice(tensor.data_, tensor.name_, tensor.size_, true); + runtime_->GetAllocator()->MarkMemValid(tensor.name_, true); + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LSTM, LSTMTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.h new file mode 100644 index 00000000000..962bf778ff4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/lstm_tensorrt.h @@ -0,0 +1,115 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
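The gate wiring that AddLSTMOneLoop builds out of TensorRT slice, activation, and elementwise layers above corresponds to a standard LSTM step with gate order input, output, forget, cell (matching FORGET_GATE == 2 and CELL_GATE == 3). A scalar reference of one step, assuming gates already holds W_x*x + W_h*h plus both bias slices (names and container types are illustrative):

#include <cmath>
#include <vector>

struct LstmStepOut {
  std::vector<float> h;
  std::vector<float> c;
};

// One LSTM time step over pre-activation gates of size 4 * hidden_size.
LstmStepOut LstmStepReference(const std::vector<float> &gates, const std::vector<float> &c_prev, int hidden_size) {
  auto sigmoid = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };
  LstmStepOut out{std::vector<float>(hidden_size), std::vector<float>(hidden_size)};
  for (int j = 0; j < hidden_size; ++j) {
    float i = sigmoid(gates[0 * hidden_size + j]);    // input gate
    float o = sigmoid(gates[1 * hidden_size + j]);    // output gate
    float f = sigmoid(gates[2 * hidden_size + j]);    // forget gate (FORGET_GATE)
    float c = std::tanh(gates[3 * hidden_size + j]);  // cell candidate (CELL_GATE)
    out.c[j] = f * c_prev[j] + i * c;                 // C = f * C_prev + i * c
    out.h[j] = o * std::tanh(out.c[j]);               // H = o * tanh(C)
  }
  return out;
}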
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +constexpr int INPUT_TENSOR_SIZE = 6; +constexpr int OUTPUT_TENSOR_SIZE = 3; +constexpr int INPUT_WEIGHT = 1; +constexpr int STATE_WEIGHT = 2; +constexpr int BIAS = 3; +constexpr int HIDDEN_IN_TENSOR_INIT = 4; +constexpr int CELL_IN_TENSOR_INIT = 5; +constexpr int LSTM_GATE_NUM = 4; +constexpr int BIDIRECTIONAL = 2; +constexpr int OUTPUT_HIDDEN_INDEX = 1; +constexpr int OUTPUT_CELL_INDEX = 2; +constexpr int INPUT_SIZE_INDEX = 2; +constexpr int FORGET_GATE = 2; +constexpr int CELL_GATE = 3; +constexpr int BATCH_SIZE_INDEX = 2; +static const std::array INDICES{0, 1, 2, 3}; + +struct LSTMParams { + int sequence_size_; + int input_data_size_; + int batch_size_; + int layer_count_; + int hidden_size_; + nvinfer1::DataType data_type_; + int directional_cnt_; +}; + +struct LstmState { + nvinfer1::ITensor *data_{nullptr}; + nvinfer1::ITensor *hidden_{nullptr}; + nvinfer1::ITensor *cell_{nullptr}; +}; + +struct LstmWeights { + nvinfer1::ITensor *input_weights_{nullptr}; + nvinfer1::ITensor *state_weights_{nullptr}; + nvinfer1::ITensor *input_bias_{nullptr}; + nvinfer1::ITensor *state_bias_{nullptr}; + nvinfer1::ITensor *max_seq_size_{nullptr}; +}; + +class LSTMTensorRT : public TensorRTOp { + public: + LSTMTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~LSTMTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + int Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine) override; + + private: + int PreProcess(); + + int AddLSTMLayers(); + + nvinfer1::ITensor *AddLSTMCell(const LstmState *layer_input_states, const LstmWeights *layer_weights, + LstmState *next_state); + + nvinfer1::ITensor *Reshape(nvinfer1::ITensor *tensor, nvinfer1::Dims dims); + + nvinfer1::ITensor *ConcateAll(std::vector all_tensort, int axis = 0); + + nvinfer1::ITensor *AddLSTMCalculation(const LstmState &input_state, const LstmWeights &lstm_weights, + nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out, + bool is_backward = false); + nvinfer1::ITensor *AddLSTMOneLoop(const LstmState &input_state, const LstmWeights &lstm_weights, + nvinfer1::ITensor **hidden_out, nvinfer1::ITensor **cell_out, + bool is_backward = false); + + int ParseLSTMCellInputs(int layer_index, nvinfer1::ITensor *hidden_init, nvinfer1::ITensor *cell_init, + LstmState *input_state, int *input_weight_offset, int *state_weight_offset, int *bias_offset, + LstmWeights *lstm_weights, const LstmState &next_state); + + nvinfer1::INetworkDefinition *network_{nullptr}; + nvinfer1::ITensor *input_data_{nullptr}; + nvinfer1::ITensor *sequence_size_input_{nullptr}; + nvinfer1::ITensor *op_data_out_{nullptr}; + nvinfer1::ITensor *op_hidden_out_{nullptr}; + nvinfer1::ITensor *op_cell_out_{nullptr}; + LSTMParams params_; + std::string hidden_init_name_; + std::string cell_init_name_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_LSTM_TENSORRT_H_ diff --git 
a/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.cc new file mode 100644 index 00000000000..e5b610eb120 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.cc @@ -0,0 +1,202 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h" +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "NvInferRuntimeCommon.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(MatmulOptPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +// MatmulOptPlugin +int MatmulOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + CHECK_NULL_RETURN(cublas_handle_); + CUBLAS_CHECK(cublasSetStream(cublas_handle_, stream)); + const nvinfer1::PluginTensorDesc desc_a = inputDesc[0]; + const nvinfer1::PluginTensorDesc desc_b = inputDesc[1]; + const nvinfer1::PluginTensorDesc desc_c = outputDesc[0]; + + if (desc_a.dims.nbDims == DIMENSION_2D) { + // a: m * k, b: k * n, c: m * n + int m = desc_c.dims.d[0]; + int n = desc_c.dims.d[1]; + int k = b_trans_ ? desc_b.dims.d[1] : desc_b.dims.d[0]; + const int mm_params[]{m, n, k}; + CublasMM1Batch(inputs[0], inputs[1], outputs[0], mm_params, operations_, data_types_, cublas_handle_); + } else if (desc_a.dims.nbDims == DIMENSION_3D) { + return RunBatchedMatmul(inputDesc, outputDesc, inputs, outputs, workspace, stream); + } else { + MS_LOG(ERROR) << layer_name_ << " input dims needs check a: " << desc_a.dims.nbDims; + return RET_ERROR; + } + return RET_OK; +} + +int MatmulOptPlugin::RunBatchedMatmul(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) { + const nvinfer1::PluginTensorDesc desc_b = inputDesc[1]; + const nvinfer1::PluginTensorDesc desc_c = outputDesc[0]; + int batch = desc_c.dims.d[0]; + int m = desc_c.dims.d[1]; + int n = desc_c.dims.d[DIMENSION_2D]; + int k = b_trans_ ? 
desc_b.dims.d[DIMENSION_2D] : desc_b.dims.d[1]; + const int mm_params[]{m, n, k, batch}; + for (int i = 0; i < batch; i++) { + a_addrs_[i] = inputs[0] + i * m * k * sizeof(float); + b_addrs_[i] = inputs[1] + i * k * n * sizeof(float); + c_addrs_[i] = outputs[0] + i * m * n * sizeof(float); + } + int data_size = batch * sizeof(void *); + int max_batchsize = a_addrs_.size(); + if (a_device_addrs_ == nullptr) { + CUDA_CHECK(cudaMalloc(&a_device_addrs_, sizeof(void *) * max_batchsize)); + } + if (b_device_addrs_ == nullptr) { + CUDA_CHECK(cudaMalloc(&b_device_addrs_, sizeof(void *) * max_batchsize)); + } + if (c_device_addrs_ == nullptr) { + CUDA_CHECK(cudaMalloc(&c_device_addrs_, sizeof(void *) * max_batchsize)); + } + CUDA_CHECK(cudaMemcpy(a_device_addrs_, a_addrs_.data(), data_size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(b_device_addrs_, b_addrs_.data(), data_size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(c_device_addrs_, c_addrs_.data(), data_size, cudaMemcpyHostToDevice)); + + CublasMMBatched(a_device_addrs_, b_device_addrs_, c_device_addrs_, mm_params, operations_, data_types_, + cublas_handle_); + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *MatmulOptPlugin::clone() const noexcept { + auto *plugin = new MatmulOptPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs MatmulOptPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, + int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs out_dims{}; + if (nbInputs != INPUT_SIZE2 && nbInputs != INPUT_SIZE3) { + MS_LOG(ERROR) << "invalid input size " << nbInputs << " of " << layer_name_; + return out_dims; + } + out_dims.nbDims = inputs[0].nbDims; + if (out_dims.nbDims == DIMENSION_2D) { + out_dims.d[0] = a_trans_ ? inputs[0].d[1] : inputs[0].d[0]; + out_dims.d[1] = b_trans_ ? inputs[1].d[0] : inputs[1].d[1]; + return out_dims; + } else if (out_dims.nbDims == DIMENSION_3D) { + out_dims.d[0] = inputs[0].d[0]; + out_dims.d[1] = a_trans_ ? inputs[0].d[DIMENSION_2D] : inputs[0].d[1]; + out_dims.d[DIMENSION_2D] = b_trans_ ? inputs[1].d[1] : inputs[1].d[DIMENSION_2D]; + return out_dims; + } + MS_LOG(ERROR) << "invalid input dims " << out_dims.nbDims << " of " << layer_name_; + return out_dims; +} + +void MatmulOptPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { + operations_[0] = a_trans_ ? CUBLAS_OP_T : CUBLAS_OP_N; + operations_[1] = b_trans_ ? CUBLAS_OP_T : CUBLAS_OP_N; + data_types_[0] = ConvertDataType(in[0].desc.type); // input a + data_types_[1] = ConvertDataType(in[1].desc.type); // input b + data_types_[THIRD_INPUT] = ConvertDataType(out[0].desc.type); // output c + data_types_[FOURTH_INPUT] = + (in[0].desc.type == nvinfer1::DataType::kHALF || in[1].desc.type == nvinfer1::DataType::kHALF) + ? 
CUDA_R_16F + : CUDA_R_32F; // compute type + if (in[0].max.nbDims == DIMENSION_3D) { + int max_batchsize = in[0].max.d[0]; + a_addrs_.resize(max_batchsize); + b_addrs_.resize(max_batchsize); + c_addrs_.resize(max_batchsize); + if (a_device_addrs_ == nullptr) { + CUDA_CHECK_VOID(cudaMalloc(&a_device_addrs_, sizeof(void *) * max_batchsize)); + } + if (b_device_addrs_ == nullptr) { + CUDA_CHECK_VOID(cudaMalloc(&b_device_addrs_, sizeof(void *) * max_batchsize)); + } + if (c_device_addrs_ == nullptr) { + CUDA_CHECK_VOID(cudaMalloc(&c_device_addrs_, sizeof(void *) * max_batchsize)); + } + } +} + +int MatmulOptPlugin::initialize() noexcept { + if (cublas_handle_ == nullptr) { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + } + for (int i = 0; i < DIMENSION_4D; i++) { + if (data_types_[i] != CUDA_R_32F) { + MS_LOG(ERROR) << layer_name_ << " only support fp32"; + return RET_ERROR; + } + } +} + +void MatmulOptPlugin::terminate() noexcept { + if (cublas_handle_ != nullptr) { + auto cublas_ret = cublasDestroy(cublas_handle_); + if (cublas_ret != CUBLAS_STATUS_SUCCESS) { + MS_LOG(ERROR) << "cublasDestroy failed: " << cublas_ret; + } else { + cublas_handle_ = nullptr; + } + } + cudaError_t err; + if (a_device_addrs_ != nullptr) { + err = cudaFree(a_device_addrs_); + if (err != cudaSuccess) { + MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err; + } + a_device_addrs_ = nullptr; + } + if (b_device_addrs_ != nullptr) { + err = cudaFree(b_device_addrs_); + if (err != cudaSuccess) { + MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err; + } + b_device_addrs_ = nullptr; + } + if (c_device_addrs_ != nullptr) { + err = cudaFree(c_device_addrs_); + if (err != cudaSuccess) { + MS_LOG(ERROR) << layer_name_ << " free cuda device mem failed " << err; + } + c_device_addrs_ = nullptr; + } +} + +size_t MatmulOptPlugin::getSerializationSize() const noexcept { return 2 * sizeof(bool); } + +void MatmulOptPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &a_trans_, sizeof(bool)); + SerializeValue(&buffer, &b_trans_, sizeof(bool)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h new file mode 100644 index 00000000000..bc5559f6591 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h @@ -0,0 +1,80 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
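MatmulOptPlugin::getOutputDimensions above applies the usual GEMM shape rule, with the leading dimension treated as the batch in the 3D case and the transpose flags deciding which operand dimension supplies m and n. A small shape helper restating that rule (illustrative only):

#include <array>
#include <cstdint>

// 3D batched case: A(batch, m, k) x B(batch, k, n) -> C(batch, m, n);
// a_trans / b_trans swap which trailing dimension of A / B contributes m / n.
std::array<int64_t, 3> BatchedMatmulOutShape(const std::array<int64_t, 3> &a, const std::array<int64_t, 3> &b,
                                             bool a_trans, bool b_trans) {
  const int64_t batch = a[0];
  const int64_t m = a_trans ? a[2] : a[1];
  const int64_t n = b_trans ? b[1] : b[2];
  return {batch, m, n};  // e.g. (8, 32, 64) x (8, 64, 16) -> (8, 32, 16)
}

The per-batch pointer staging in RunBatchedMatmul (copying a_addrs_/b_addrs_/c_addrs_ to device memory) matches the batched cuBLAS GEMM convention of passing device-resident arrays of per-matrix device pointers.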
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h" + +namespace mindspore::lite { +constexpr char *MATMUL_OPT_PLUGIN_NAME{"MatmulOptPlugin"}; +class MatmulOptPlugin : public TensorRTPlugin { + public: + MatmulOptPlugin(const std::string name, bool a_trans, bool b_trans, uint32_t device_id) + : TensorRTPlugin(name, std::string(MATMUL_OPT_PLUGIN_NAME), device_id), a_trans_(a_trans), b_trans_(b_trans) {} + + MatmulOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(MATMUL_OPT_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + a_trans_ = static_cast(fields[0].data)[0]; + b_trans_ = static_cast(fields[1].data)[0]; + } + + MatmulOptPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(MATMUL_OPT_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &a_trans_, sizeof(bool)); + DeserializeValue(&serialData, &serialLength, &b_trans_, sizeof(bool)); + } + + MatmulOptPlugin() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + int RunBatchedMatmul(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream); + + bool a_trans_{false}; + bool b_trans_{false}; + cublasHandle_t cublas_handle_{nullptr}; + cublasOperation_t operations_[2]{CUBLAS_OP_N, CUBLAS_OP_N}; + cudaDataType data_types_[4]{CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; + std::vector a_addrs_; + std::vector b_addrs_; + std::vector c_addrs_; + void **a_device_addrs_{nullptr}; + void **b_device_addrs_{nullptr}; + void **c_device_addrs_{nullptr}; +}; +class MatmulOptPluginCreater : public TensorRTPluginCreater { + public: + MatmulOptPluginCreater() : TensorRTPluginCreater(std::string(MATMUL_OPT_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.cc new file mode 100644 index 00000000000..b12b8457a02 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.cc @@ -0,0 +1,310 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you 
may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/matmul_tensorrt.h" +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/matmul_opt_plugin.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" + +namespace mindspore::lite { +MatMulTensorRT::~MatMulTensorRT() { + if (weight_ptr_ != nullptr) { + free(weight_ptr_); + weight_ptr_ = nullptr; + } +} +int MatMulTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int MatMulTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (type_ == schema::PrimitiveType_MatMulFusion) { + auto primitive = this->GetPrimitive()->value_as_MatMulFusion(); + if (primitive == nullptr) { + MS_LOG(ERROR) << "convert to primitive matmul failed for " << op_name_; + return RET_ERROR; + } + transpose_a_ = primitive->transpose_a(); + transpose_b_ = primitive->transpose_b(); + activation_ = primitive->activation_type(); + } + nvinfer1::ITensor *out_tensor = nullptr; + if (RunOptPlugin()) { + out_tensor = AddAsOptPlugin(ctx); + } else if (RunFullConnect()) { + MS_LOG(DEBUG) << "use fully connected instead of matmul for " << op_name_; + out_tensor = AddAsFullConnect(ctx); + } else { + MS_LOG(DEBUG) << "use origin tensorrt matmul for " << op_name_; + out_tensor = AddAsMatmul(ctx); + } + if (out_tensor == nullptr) { + MS_LOG(ERROR) << "add matmul failed for " << op_name_; + return RET_ERROR; + } + + // add activation + if (activation_ != schema::ActivationType::ActivationType_NO_ACTIVATION) { + nvinfer1::ILayer *activation_layer = + ActivationTensorRT::AddActivation(ctx, activation_, 0, 0, 0, out_tensor, device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for matmul failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + out_tensor = activation_layer->getOutput(0); + } + + out_tensor->setName((op_name_ + "_output").c_str()); + MS_LOG(DEBUG) << "output " << GetTensorFormat(out_tensor, out_format_, true); + this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format_}); + return RET_OK; +} + +int MatMulTensorRT::PreprocessMatMulInputs(TensorRTContext *ctx, ITensorHelper *matmul_a, ITensorHelper *matmul_b) { + if (tensorrt_in_tensors_.size() == INPUT_SIZE2) { + int a_index = + GetDimsVolume(tensorrt_in_tensors_[0].trt_tensor_->getDimensions()) == GetDimsVolume(in_tensors_[0].Shape()) ? 
0 + : 1; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[a_index], matmul_a); + ret += PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - a_index], matmul_b); + if (ret != RET_OK || matmul_a->trt_tensor_ == nullptr || matmul_b->trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim of matmul inputs failed for " << op_name_; + return ret; + } + out_format_ = matmul_a->format_; + if (matmul_a->format_ != matmul_b->format_) { + MS_LOG(WARNING) << "matmul input tensor has different format " << op_name_; + out_format_ = Format::NHWC; + } + } else if (tensorrt_in_tensors_.size() == 1) { + auto weight = ProcessWeightTensor(ctx); + if (weight == nullptr) { + MS_LOG(ERROR) << "create constant weight tensor failed for " << op_name_; + return RET_ERROR; + } + int weight_index = in_tensors_[1].Data() != nullptr ? 1 : 0; + ITensorHelper *weight_helper = (weight_index == 1) ? matmul_b : matmul_a; + ITensorHelper *var_helper = (weight_index == 1) ? matmul_a : matmul_b; + weight_helper->trt_tensor_ = weight; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1 - weight_index], var_helper); + if (ret != RET_OK || var_helper->trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim of matmul input var_helper failed for " << op_name_; + return ret; + } + out_format_ = var_helper->format_; + } else { + MS_LOG(ERROR) << op_name_ << " tensorrt in tensor size is invalid " << tensorrt_in_tensors_.size(); + return RET_ERROR; + } + return RET_OK; +} + +nvinfer1::ITensor *MatMulTensorRT::ProcessWeightTensor(TensorRTContext *ctx) { + nvinfer1::ITensor *weight = nullptr; + int weight_index = in_tensors_[1].Data() != nullptr ? 1 : 0; + if (in_tensors_[weight_index].Shape().size() < + static_cast(tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims)) { + std::vector expect_shape(in_tensors_[1 - weight_index].Shape().size(), 1); + auto origin_shape = in_tensors_[weight_index].Shape(); + for (int i = 0; i < origin_shape.size(); i++) { + expect_shape[expect_shape.size() - 1 - i] = origin_shape[origin_shape.size() - 1 - i]; + } + weight = ConvertTensorWithExpandDims(ctx, in_tensors_[weight_index], expect_shape, op_name_); + } else if (in_tensors_[weight_index].Shape().size() == + static_cast(tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims)) { + weight = ConvertConstantTensor(ctx, in_tensors_[weight_index], op_name_); + } else { + MS_LOG(ERROR) << "input tensor shape is invalid for " << op_name_; + return nullptr; + } + return weight; +} + +nvinfer1::ITensor *MatMulTensorRT::AddAsMatmul(TensorRTContext *ctx) { + ITensorHelper matmul_a; + ITensorHelper matmul_b; + + int ret = PreprocessMatMulInputs(ctx, &matmul_a, &matmul_b); + if (ret != RET_OK || matmul_a.trt_tensor_ == nullptr || matmul_b.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessMatMulInputs matmul failed for " << op_name_; + return nullptr; + } + + MS_LOG(DEBUG) << "matmul input a " << GetTensorFormat(matmul_a); + MS_LOG(DEBUG) << "matmul input b " << GetTensorFormat(matmul_b); + + auto matmul_layer = ctx->network()->addMatrixMultiply( + *matmul_a.trt_tensor_, transpose_a_ ? nvinfer1::MatrixOperation::kTRANSPOSE : nvinfer1::MatrixOperation::kNONE, + *matmul_b.trt_tensor_, transpose_b_ ? 
nvinfer1::MatrixOperation::kTRANSPOSE : nvinfer1::MatrixOperation::kNONE); + if (matmul_layer == nullptr) { + MS_LOG(ERROR) << "addMatrixMultiply failed for " << op_name_; + return nullptr; + } + this->layer_ = matmul_layer; + matmul_layer->setName(op_name_.c_str()); + return AddBias(ctx, matmul_layer->getOutput(0)); +} + +nvinfer1::ITensor *MatMulTensorRT::AddAsFullConnect(TensorRTContext *ctx) { + nvinfer1::Weights weight; + nvinfer1::Weights bias = ConvertWeight(in_tensors_[kBiasIndex]); + nvinfer1::ITensor *input_a = tensorrt_in_tensors_[0].trt_tensor_; + out_format_ = tensorrt_in_tensors_[0].format_; + if (input_a->getDimensions().nbDims != DIMENSION_4D) { + nvinfer1::Dims in_dims(input_a->getDimensions()); + in_dims.nbDims = DIMENSION_4D; + for (int i = input_a->getDimensions().nbDims; i < DIMENSION_4D; i++) { + in_dims.d[i] = 1; + } + input_a = Reshape(ctx, input_a, in_dims); + if (input_a == nullptr) { + MS_LOG(ERROR) << "reshape input failed for " << op_name_; + return nullptr; + } + MS_LOG(DEBUG) << "full connect expand input a to " << GetTensorFormat(input_a); + } else { + ITensorHelper tmp_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &tmp_input); + if (ret != RET_OK || tmp_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "rPreprocessInputs2SameDim failed for " << op_name_; + return nullptr; + } + input_a = tmp_input.trt_tensor_; + out_format_ = tmp_input.format_; + MS_LOG(DEBUG) << "full connect preprocess input a to " << GetTensorFormat(tmp_input); + } + if (!transpose_b_) { + // transpose weight + weight = TransposeWeight2D(in_tensors_[1], &weight_ptr_); + if (weight.values == nullptr || weight_ptr_ == nullptr) { + MS_LOG(ERROR) << "TransposeWeight2D input weight failed for " << op_name_; + return nullptr; + } + } else { + weight = ConvertWeight(in_tensors_[1]); + } + + int output_cnt = in_tensors_[kBiasIndex].Shape()[0]; + + auto fc_layer = ctx->network()->addFullyConnected(*input_a, output_cnt, weight, bias); + if (fc_layer == nullptr) { + MS_LOG(ERROR) << "add fully connected layer failed for " << op_name_; + return nullptr; + } + this->layer_ = fc_layer; + fc_layer->setName((op_name_ + "_fullyconnected").c_str()); + nvinfer1::ITensor *out_tensor = fc_layer->getOutput(0); + if (out_tensor->getDimensions().nbDims != out_tensors_[0].Shape().size()) { + std::vector out_dims(out_tensors_[0].Shape()); + out_dims[0] = out_tensor->getDimensions().d[0]; + out_tensor = Reshape(ctx, out_tensor, out_dims); + } + return out_tensor; +} +nvinfer1::ITensor *MatMulTensorRT::AddAsOptPlugin(TensorRTContext *ctx) { + nvinfer1::ITensor *weight_tensor = nullptr; + if (tensorrt_in_tensors_.size() >= INPUT_SIZE2) { + weight_tensor = tensorrt_in_tensors_[1].trt_tensor_; + } else { + weight_tensor = ConvertConstantTensor(ctx, in_tensors_[1], op_name_); + } + + auto plugin = std::make_shared(op_name_, transpose_a_, transpose_b_, device_id_); + if (plugin == nullptr) { + MS_LOG(ERROR) << "create MatmulOptPlugin failed for " << op_name_; + return nullptr; + } + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_, weight_tensor}; + nvinfer1::IPluginV2Layer *matmul_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE2, *plugin); + if (matmul_layer == nullptr) { + MS_LOG(ERROR) << "add matmul opt plugin layer failed for " << op_name_; + return nullptr; + } + layer_ = matmul_layer; + return AddBias(ctx, matmul_layer->getOutput(0)); +} +nvinfer1::ITensor *MatMulTensorRT::AddBias(TensorRTContext *ctx, nvinfer1::ITensor *input_tensor) { + 
nvinfer1::ITensor *out_tensor = input_tensor; + if (in_tensors_.size() == kBiasIndex + 1) { + nvinfer1::ITensor *bias = nullptr; + if (in_tensors_[kBiasIndex].Shape().size() < static_cast(out_tensor->getDimensions().nbDims)) { + std::vector expect_dims(out_tensors_[0].Shape()); + expect_dims[0] = out_tensor->getDimensions().d[0]; + bias = ConvertTensorWithExpandDims(ctx, in_tensors_[kBiasIndex], expect_dims, op_name_); + } else if (in_tensors_[kBiasIndex].Shape().size() == static_cast(out_tensor->getDimensions().nbDims)) { + bias = ConvertConstantTensor(ctx, in_tensors_[kBiasIndex], op_name_); + } else { + MS_LOG(ERROR) << "input tensor shape is invalid for " << op_name_; + return nullptr; + } + if (bias == nullptr) { + MS_LOG(ERROR) << "create constant bias tensor failed for " << op_name_; + return nullptr; + } + auto bias_layer = ctx->network()->addElementWise(*out_tensor, *bias, nvinfer1::ElementWiseOperation::kSUM); + if (bias_layer == nullptr) { + MS_LOG(ERROR) << "add bias add layer failed for " << op_name_; + return nullptr; + } + auto bias_layer_name = op_name_ + "_bias"; + bias_layer->setName(bias_layer_name.c_str()); + out_tensor = bias_layer->getOutput(0); + } + return out_tensor; +} + +bool MatMulTensorRT::RunOptPlugin() { + if (quant_type_ == schema::QuantType_QUANT_NONE && + runtime_->GetRuntimePrecisionMode() == RuntimePrecisionMode::RuntimePrecisionMode_FP32) { + if (in_tensors_[0].Shape().size() == DIMENSION_2D && in_tensors_[1].Shape().size() == DIMENSION_2D && + in_tensors_[0].Shape()[0] > 1 && tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0] == -1) { + MS_LOG(INFO) << op_name_ << " uses optimize matmul plugin for 2D dynamic batchsize"; + return true; + } else if (in_tensors_[0].Shape().size() == DIMENSION_3D && in_tensors_[1].Shape().size() == DIMENSION_3D) { + // batched matmul using opt + MS_LOG(INFO) << op_name_ << " uses optimize matmul plugin for 3D batchsized"; + return true; + } + } + return false; +} +bool MatMulTensorRT::RunFullConnect() { + if (in_tensors_.size() == INPUT_SIZE3 && in_tensors_[1].Data() != nullptr && + in_tensors_[kBiasIndex].Data() != nullptr && !transpose_a_ && in_tensors_[1].Shape().size() == DIMENSION_2D && + (in_tensors_[0].Shape().size() == DIMENSION_2D || in_tensors_[0].Shape().size() == DIMENSION_4D)) { + return true; + } + return false; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MatMulFusion, MatMulTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.h new file mode 100644 index 00000000000..db3175c8cc4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/matmul_tensorrt.h @@ -0,0 +1,62 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
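The fully-connected path above transposes a row-major 2-D weight on the host when transpose_b_ is false (TransposeWeight2D, which lives in tensorrt_utils and is not part of this hunk). A minimal sketch of that kind of transpose, with a hypothetical helper name:

#include <cstdlib>

// Row-major [rows, cols] -> [cols, rows]; the caller frees the returned buffer,
// mirroring how weight_ptr_ is freed in ~MatMulTensorRT().
float *Transpose2DSketch(const float *src, int rows, int cols) {
  float *dst = static_cast<float *>(malloc(sizeof(float) * rows * cols));
  if (dst == nullptr) {
    return nullptr;
  }
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      dst[c * rows + r] = src[r * cols + c];
    }
  }
  return dst;
}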
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class MatMulTensorRT : public TensorRTOp { + public: + MatMulTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~MatMulTensorRT() override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + int AddInnerOp(TensorRTContext *ctx) override; + + private: + int PreprocessMatMulInputs(TensorRTContext *ctx, ITensorHelper *matmul_a, ITensorHelper *matmul_b); + + nvinfer1::ITensor *ProcessWeightTensor(TensorRTContext *ctx); + + nvinfer1::ITensor *AddAsMatmul(TensorRTContext *ctx); + + nvinfer1::ITensor *AddAsFullConnect(TensorRTContext *ctx); + + nvinfer1::ITensor *AddAsOptPlugin(TensorRTContext *ctx); + + nvinfer1::ITensor *AddBias(TensorRTContext *ctx, nvinfer1::ITensor *input_tensor); + + bool RunOptPlugin(); + bool RunFullConnect(); + + bool transpose_a_{false}; + bool transpose_b_{false}; + Format out_format_{Format::NHWC}; + schema::ActivationType activation_{schema::ActivationType::ActivationType_NO_ACTIVATION}; + void *weight_ptr_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_MATMUL_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.cc new file mode 100644 index 00000000000..4100a39bf1f --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.cc @@ -0,0 +1,59 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
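The plugins in this patch persist their attributes through SerializeValue/DeserializeValue, which are declared alongside TensorRTPlugin and not shown here. Assuming they follow the usual pointer-bumping pattern suggested by the call sites (serialize(void *buffer) and the (data, length) deserializing constructors), a self-contained sketch of that pattern looks like this; the real helpers may differ:

#include <cstddef>
#include <cstring>

// Append a value to the buffer and advance the write cursor.
static void SerializeValueSketch(void **buffer, const void *value, size_t size) {
  std::memcpy(*buffer, value, size);
  *buffer = static_cast<char *>(*buffer) + size;
}

// Read a value back, advance the read cursor and shrink the remaining length.
static void DeserializeValueSketch(const void **data, size_t *remaining, void *value, size_t size) {
  if (*remaining < size) {
    return;  // truncated stream; real code should report an error
  }
  std::memcpy(value, *data, size);
  *data = static_cast<const char *>(*data) + size;
  *remaining -= size;
}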
+ */ + +#include "src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h" +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/cuda_impl/cuda_helper.h" +#include "NvInferRuntimeCommon.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/normalize.cuh" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(NormalizeOptPluginCreater); +template class TensorRTPluginCreater<NormalizeOptPlugin>; +template <class T> +nvinfer1::PluginFieldCollection TensorRTPluginCreater<T>::field_collection_{}; +template <class T> +std::vector<nvinfer1::PluginField> TensorRTPluginCreater<T>::fields_; + +int NormalizeOptPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) noexcept { + auto input = static_cast<const float *>(inputs[0]); + auto gamma = static_cast<const float *>(inputs[1]); + auto beta = static_cast<const float *>(inputs[2]); + auto output = static_cast<float *>(outputs[0]); + auto input_dims = inputDesc[0].dims; + size_t dim_at_axis = input_dims.d[axis_]; + int element_cnt = std::accumulate(input_dims.d, input_dims.d + input_dims.nbDims, 1, std::multiplies<int>()); + Normalize(input, gamma, beta, output, dim_at_axis, epsilion_, element_cnt, stream); + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *NormalizeOptPlugin::clone() const noexcept { + auto *plugin = new NormalizeOptPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +size_t NormalizeOptPlugin::getSerializationSize() const noexcept { return sizeof(size_t) + sizeof(float); } + +void NormalizeOptPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &axis_, sizeof(size_t)); + SerializeValue(&buffer, &epsilion_, sizeof(float)); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h new file mode 100644 index 00000000000..981628e6da5 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h @@ -0,0 +1,61 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_OPT_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_OPT_PLUGIN_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" + +namespace mindspore::lite { +constexpr char *NORMALIZE_OPT_PLUGIN_NAME{"NormalizeOptPlugin"}; +class NormalizeOptPlugin : public TensorRTPlugin { + public: + NormalizeOptPlugin(const std::string name, size_t axis, float epsilion, uint32_t device_id) + : TensorRTPlugin(name, std::string(NORMALIZE_OPT_PLUGIN_NAME), device_id), axis_(axis), epsilion_(epsilion) {} + + NormalizeOptPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(NORMALIZE_OPT_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + axis_ = static_cast(fields[0].data)[0]; + epsilion_ = static_cast(fields[1].data)[0]; + } + + NormalizeOptPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(NORMALIZE_OPT_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &axis_, sizeof(size_t)); + DeserializeValue(&serialData, &serialLength, &epsilion_, sizeof(float)); + } + + NormalizeOptPlugin() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + size_t axis_{0}; + float epsilion_{0.0f}; +}; +class NormalizeOptPluginCreater : public TensorRTPluginCreater { + public: + NormalizeOptPluginCreater() : TensorRTPluginCreater(std::string(NORMALIZE_OPT_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_MATMUL_OPT_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.cc new file mode 100644 index 00000000000..ec5a5ab4007 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.cc @@ -0,0 +1,178 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
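The Normalize kernel launched by NormalizeOptPlugin::enqueue above lives in cuda_impl/normalize.cuh and is not part of this hunk. As a reference for what it is expected to compute, here is a plain C++ layer norm over the last axis, convenient for checking plugin outputs on small tensors:

#include <cmath>
#include <cstddef>

// y[i] = (x[i] - mean) / sqrt(var + eps) * gamma[i] + beta[i], applied per row of an
// [outer, dim] view where dim is the size of the normalized (last) axis.
void LayerNormReference(const float *x, const float *gamma, const float *beta, float *y,
                        size_t outer, size_t dim, float eps) {
  for (size_t o = 0; o < outer; ++o) {
    const float *row = x + o * dim;
    float *out = y + o * dim;
    float mean = 0.0f;
    for (size_t i = 0; i < dim; ++i) mean += row[i];
    mean /= dim;
    float var = 0.0f;
    for (size_t i = 0; i < dim; ++i) var += (row[i] - mean) * (row[i] - mean);
    var /= dim;
    float inv_std = 1.0f / std::sqrt(var + eps);
    for (size_t i = 0; i < dim; ++i) {
      out[i] = (row[i] - mean) * inv_std * gamma[i] + beta[i];
    }
  }
}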
+ */ + +#include "src/runtime/delegate/tensorrt/op/normalize_tensorrt.h" +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/normalize_opt_plugin.h" + +namespace mindspore::lite { +int NormalizeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector<mindspore::MSTensor> &in_tensors, + const std::vector<mindspore::MSTensor> &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE3 && in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != INPUT_SIZE3 && out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + auto norm_op = primitive->value_as_LayerNormFusion(); + CHECK_NULL_RETURN(norm_op); + int begin_norm_axis = norm_op->begin_norm_axis(); + begin_norm_axis = begin_norm_axis >= 0 ? begin_norm_axis : in_tensors[0].Shape().size() + begin_norm_axis; + int begin_params_axis = norm_op->begin_params_axis(); + begin_params_axis = begin_params_axis >= 0 ? begin_params_axis : in_tensors[0].Shape().size() + begin_params_axis; + if (begin_params_axis != begin_norm_axis || begin_params_axis != in_tensors[0].Shape().size() - 1) { + MS_LOG(ERROR) << "only supports normalization on the last dimension, begin_norm_axis is " << begin_norm_axis << " for " + << op_name_; + return RET_ERROR; + } + axis_ = begin_params_axis; + epsilon_ = norm_op->epsilon(); + return RET_OK; +} + +int NormalizeTensorRT::AddInnerOp(TensorRTContext *ctx) { + CHECK_NULL_RETURN(ctx->network()); + int ret = PreprocessInputs(ctx); + if (ret != RET_OK) { + MS_LOG(ERROR) << "preprocess input failed for " << op_name_; + return ret; + } + return RunOptPlugin() ?
RunAsOptPlugin(ctx) : RunAsTrtOps(ctx); +} + +int NormalizeTensorRT::PreprocessInputs(TensorRTContext *ctx) { + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &norm_input_); + if (ret != RET_OK || norm_input_.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim norm_input failed for " << op_name_; + return RET_ERROR; + } + if (in_tensors_.size() == BETA_INDEX + 1) { + gamma_ = ConvertTensorWithExpandDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_ + in_tensors_[1].Name()); + CHECK_NULL_RETURN(gamma_); + beta_ = ConvertTensorWithExpandDims(ctx, in_tensors_[BETA_INDEX], in_tensors_[0].Shape(), + op_name_ + in_tensors_[BETA_INDEX].Name()); + CHECK_NULL_RETURN(beta_); + } + return RET_OK; +} + +int NormalizeTensorRT::RunAsOptPlugin(TensorRTContext *ctx) { + auto plugin = std::make_shared(op_name_, axis_, epsilon_, device_id_); + if (plugin == nullptr) { + MS_LOG(ERROR) << "create NormalizeOptPlugin failed for " << op_name_; + return RET_ERROR; + } + nvinfer1::ITensor *inputTensors[] = {norm_input_.trt_tensor_, gamma_, beta_}; + nvinfer1::IPluginV2Layer *norm_layer = ctx->network()->addPluginV2(inputTensors, INPUT_SIZE3, *plugin); + if (norm_layer == nullptr) { + MS_LOG(ERROR) << "add norm opt plugin layer failed for " << op_name_; + return RET_ERROR; + } + layer_ = norm_layer; + layer_->setName(op_name_.c_str()); + AddInnerOutTensors(ITensorHelper{norm_layer->getOutput(0), norm_input_.format_, norm_input_.same_format_}); + return RET_OK; +} + +int NormalizeTensorRT::RunAsTrtOps(TensorRTContext *ctx) { + size_t axis = 1u << axis_; + // first output, add later + AddInnerOutTensors(ITensorHelper{nullptr, norm_input_.format_, norm_input_.same_format_}); + + // mean + auto mean = + ctx->network()->addReduce(*(norm_input_.trt_tensor_), nvinfer1::ReduceOperation::kAVG, axis, true)->getOutput(0); + CHECK_NULL_RETURN(mean); + if (out_tensors_.size() == INPUT_SIZE3) { + AddInnerOutTensors(ITensorHelper{mean, norm_input_.format_, norm_input_.same_format_}); + } + // x - mean + auto sub_mean = ctx->network() + ->addElementWise(*(norm_input_.trt_tensor_), *mean, nvinfer1::ElementWiseOperation::kSUB) + ->getOutput(0); + CHECK_NULL_RETURN(sub_mean); + // (x - mean)^2 + auto const_two = + ConvertScalarToITensor(ctx, in_tensors_[0].Shape().size(), &two_, DataType::kNumberTypeFloat32, op_name_ + "_two"); + CHECK_NULL_RETURN(const_two); + auto pow = ctx->network()->addElementWise(*sub_mean, *const_two, nvinfer1::ElementWiseOperation::kPOW)->getOutput(0); + CHECK_NULL_RETURN(pow); + // mean of (x - mean)^2 + auto var = ctx->network()->addReduce(*pow, nvinfer1::ReduceOperation::kAVG, axis, true)->getOutput(0); + CHECK_NULL_RETURN(var); + if (out_tensors_.size() == INPUT_SIZE3) { + AddInnerOutTensors(ITensorHelper{var, norm_input_.format_, norm_input_.same_format_}); + } + + // var + min epsilon + auto const_epsilon = ConvertScalarToITensor(ctx, in_tensors_[0].Shape().size(), &epsilon_, + DataType::kNumberTypeFloat32, op_name_ + "_epsilion"); + CHECK_NULL_RETURN(const_epsilon); + auto var_epsilon = + ctx->network()->addElementWise(*var, *const_epsilon, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + CHECK_NULL_RETURN(var_epsilon); + + // standard deviation + auto std_dev = ctx->network()->addUnary(*var_epsilon, nvinfer1::UnaryOperation::kSQRT)->getOutput(0); + CHECK_NULL_RETURN(std_dev); + + // sub_mean / std_dev + auto norm_layer = ctx->network()->addElementWise(*sub_mean, *std_dev, nvinfer1::ElementWiseOperation::kDIV); + CHECK_NULL_RETURN(norm_layer); + 
this->layer_ = norm_layer; + auto norm = norm_layer->getOutput(0); + CHECK_NULL_RETURN(norm); + + // scale with gamma and beta + if (gamma_ != nullptr && beta_ != nullptr) { + auto gamma_out = + ctx->network()->addElementWise(*norm, *gamma_, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0); + CHECK_NULL_RETURN(gamma_out); + auto beta_out = + ctx->network()->addElementWise(*gamma_out, *beta_, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + CHECK_NULL_RETURN(beta_out); + tensorrt_out_tensors_[0].trt_tensor_ = beta_out; + } else { + tensorrt_out_tensors_[0].trt_tensor_ = norm; + } + return RET_OK; +} + +bool NormalizeTensorRT::RunOptPlugin() { + if (out_tensors_.size() == 1 && in_tensors_.size() == INPUT_SIZE3 && axis_ == in_tensors_[0].Shape().size() - 1 && + in_tensors_[0].Shape()[axis_] < GET_THREADS) { + // insufficient shared memory + int dim_sum = std::accumulate(in_tensors_[0].Shape().begin(), in_tensors_[0].Shape().begin() + axis_, 1, + std::multiplies()); + const int kSharedMemoryThreshold = 2048; + if (dim_sum > kSharedMemoryThreshold) { + return false; + } + MS_LOG(INFO) << op_name_ << " use opt plugin"; + return true; + } + return false; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LayerNormFusion, NormalizeTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.h new file mode 100644 index 00000000000..5b7e67882fd --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/normalize_tensorrt.h @@ -0,0 +1,56 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
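Both the `1u << axis_` in the layer-norm decomposition above and the reduce op later in this patch rely on TensorRT's convention that IReduceLayer takes its axes as a bitmask, one bit per dimension. A small illustrative sketch of building such a mask (GetAxis in reduce_tensorrt.cc presumably does something similar; this helper is an assumption, not the patch's code):

#include <cstdint>
#include <vector>

// TensorRT reduce axes are a bitmask: bit i set means "reduce dimension i".
uint32_t MakeReduceAxes(const std::vector<int> &axes, int nb_dims) {
  uint32_t mask = 0;
  for (int a : axes) {
    int axis = a < 0 ? a + nb_dims : a;  // accept negative axes, numpy style
    mask |= 1u << axis;
  }
  return mask;
}
// e.g. reducing H and W of an NCHW tensor: MakeReduceAxes({2, 3}, 4) == 0b1100.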
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +constexpr int BETA_INDEX = 2; + +class NormalizeTensorRT : public TensorRTOp { + public: + NormalizeTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~NormalizeTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int PreprocessInputs(TensorRTContext *ctx); + + int RunAsOptPlugin(TensorRTContext *ctx); + + int RunAsTrtOps(TensorRTContext *ctx); + + bool RunOptPlugin(); + + ITensorHelper norm_input_; + nvinfer1::ITensor *gamma_{nullptr}; + nvinfer1::ITensor *beta_{nullptr}; + size_t axis_{0}; + const float two_{2.0f}; + float epsilon_{0.0f}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_NORMALIZE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.cc new file mode 100644 index 00000000000..534f35b0875 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.cc @@ -0,0 +1,140 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/pad_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int PadTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors_[1].Data() == nullptr) { + MS_LOG(ERROR) << "invalid pad tensor for: " << op_name_; + return RET_ERROR; + } + auto pad_primitive = this->GetPrimitive()->value_as_PadFusion(); + if (pad_primitive == nullptr) { + MS_LOG(ERROR) << "convert PadFusion failed: " << op_name_; + return RET_ERROR; + } + schema::PaddingMode padding_mode = pad_primitive->padding_mode(); + if (padding_mode != schema::PaddingMode::PaddingMode_CONSTANT) { + MS_LOG(ERROR) << "Unsupported padding mode: " << schema::PaddingMode(padding_mode) << ", for op: " << op_name_; + return RET_ERROR; + } + if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) { + MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format(); + return RET_ERROR; + } + constant_value_ = pad_primitive->constant_value(); + return RET_OK; +} + +int PadTensorRT::AddInnerOp(TensorRTContext *ctx) { + mindspore::MSTensor &pad_tensor = in_tensors_[1]; + int element_cnt = std::accumulate(pad_tensor.Shape().begin(), pad_tensor.Shape().end(), 1, std::multiplies()); + if (element_cnt != tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims * INPUT_SIZE2) { + MS_LOG(ERROR) << "pad tensor cnt is invalid. 
cnt: " << element_cnt + << ", input tensor dims cnt: " << tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims; + return RET_ERROR; + } + + nvinfer1::ITensor *pad_input = tensorrt_in_tensors_[0].trt_tensor_; + MS_LOG(DEBUG) << "before transpose " + << GetTensorFormat(pad_input, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_); + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // transpose: NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NHWC->NCHW failed"; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + this->transpose_layer_ = transpose_layer_in; + pad_input = transpose_layer_in->getOutput(0); + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(pad_input, Format::NCHW, false); + } + + // trt 6 only support 2D padding + const int *padding_data = reinterpret_cast(in_tensors_[1].Data().get()); + MS_ASSERT(padding_data); + nvinfer1::IPaddingLayer *padding_layer = nullptr; + if (element_cnt == index_NHWC_ * INPUT_SIZE2) { + // only support pad at HW index + int h_pre; + int h_post; + int w_pre; + int w_post; + if (SameDims(pad_input->getDimensions(), in_tensors_[0].Shape())) { + // NCHW: 0: N_pre, 1: N_post, 2: C_pre, 3: C_post, 4: H_pre, 5: H_post, 6: W_pre, 7: W_post + if (*padding_data != 0 || *(padding_data + 1) != 0 || *(padding_data + 2) != 0 || *(padding_data + 3) != 0) { + MS_LOG(WARNING) << "tensorrt padding only support pad at HW index, unsupported padding value of: " << op_name_; + } + h_pre = 4; + h_post = 5; + w_pre = 6; + w_post = 7; + } else { + // NHWC: 0: N_pre, 1: N_post, 2: H_pre, 3: H_post, 4: W_pre, 5: W_post, 6: C_pre, 7: C_post + if (*padding_data != 0 || *(padding_data + 1) != 0 || *(padding_data + 6) != 0 || *(padding_data + 7) != 0) { + MS_LOG(WARNING) << "tensorrt padding only support pad at HW index, unsupported padding value of: " << op_name_; + } + h_pre = 2; + h_post = 3; + w_pre = 4; + w_post = 5; + } + nvinfer1::DimsHW prePadding{*(padding_data + h_pre), *(padding_data + w_pre)}; + nvinfer1::DimsHW postPadding{*(padding_data + h_post), *(padding_data + w_post)}; + MS_LOG(DEBUG) << op_name_ << " prePadding: " << prePadding.d[0] << ", " << prePadding.d[1] + << "; postPadding: " << postPadding.d[0] << ", " << postPadding.d[1]; + + padding_layer = ctx->network()->addPadding(*pad_input, prePadding, postPadding); + } else { + MS_LOG(ERROR) << "need check for pad_tensor dims: " << op_name_ + << ", pad_tensor ElementNum: " << pad_tensor.ElementNum(); + return RET_ERROR; + } + if (padding_layer == nullptr) { + MS_LOG(ERROR) << "add padding layer failed for " << op_name_; + return RET_ERROR; + } + this->layer_ = padding_layer; + padding_layer->setName(op_name_.c_str()); + padding_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + bool same_format = SameDims(padding_layer->getOutput(0)->getDimensions(), out_tensors_[0].Shape()) && + SameDims(tensorrt_in_tensors_[0].trt_tensor_->getDimensions(), in_tensors_[0].Shape()); + this->AddInnerOutTensors(ITensorHelper{padding_layer->getOutput(0), Format::NCHW, same_format}); + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PadFusion, PadTensorRT) +} // namespace mindspore::lite diff --git 
a/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.h new file mode 100644 index 00000000000..def44c32bc8 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/pad_tensorrt.h @@ -0,0 +1,42 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class PadTensorRT : public TensorRTOp { + public: + PadTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~PadTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + const int index_NHWC_ = 4; + float constant_value_ = 0.0f; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_PAD_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.cc new file mode 100644 index 00000000000..7d83d9c54c0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.cc @@ -0,0 +1,220 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
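The PadTensorRT code above picks the H/W entries out of an 8-element paddings tensor, using different offsets depending on whether the paddings were written in NCHW or NHWC order. The mapping into IPaddingLayer's pre/post HW pads can be summarized by this sketch (hypothetical helper, same index layout as the code; N and C entries are expected to be zero and only trigger a warning above):

#include <array>

struct HwPadding {
  int h_pre, w_pre, h_post, w_post;
};

// paddings holds {pre, post} pairs per dimension of a 4-D tensor.
// NCHW order: N_pre, N_post, C_pre, C_post, H_pre, H_post, W_pre, W_post
// NHWC order: N_pre, N_post, H_pre, H_post, W_pre, W_post, C_pre, C_post
HwPadding ExtractHwPadding(const std::array<int, 8> &paddings, bool is_nchw) {
  if (is_nchw) {
    return {paddings[4], paddings[6], paddings[5], paddings[7]};
  }
  return {paddings[2], paddings[4], paddings[3], paddings[5]};
}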
+ */ + +#include "src/runtime/delegate/tensorrt/op/pool_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int PoolTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (in_tensors[0].format() != Format::NHWC && in_tensors[0].format() != Format::NCHW) { + MS_LOG(ERROR) << "Unsupported input tensor format of " << in_tensors[0].format(); + return RET_ERROR; + } + return RET_OK; +} + +int PoolTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (tensorrt_in_tensors_.size() != 1) { + MS_LOG(ERROR) << "invalid input tensor size: " << tensorrt_in_tensors_.size(); + return RET_ERROR; + } + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(tensorrt_in_tensors_[0]); + int ret = ParseParams(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ParseParams failed for : " << op_name_; + return RET_ERROR; + } + + nvinfer1::ITensor *pool_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // transpose: NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NHWC->NCHW failed"; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + this->transpose_layer_ = transpose_layer_in; + pool_input = transpose_layer_in->getOutput(0); + } + + // pooling layer + nvinfer1::Dims windowSize = lite::ConvertCudaDims(kernel_size_); + if (windowSize.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + nvinfer1::IPoolingLayer *pooling_layer = ctx->network()->addPoolingNd(*pool_input, pooling_type_, windowSize); + if (pooling_layer == nullptr) { + MS_LOG(ERROR) << "addPoolingNd failed for TensorRT."; + return RET_ERROR; + } + AddParams(pooling_layer); + pooling_layer->setName(op_name_.c_str()); + this->layer_ = pooling_layer; + + // add activation + nvinfer1::ILayer *activation_layer = nullptr; + if (activation_type_ == schema::ActivationType::ActivationType_NO_ACTIVATION) { + activation_layer = pooling_layer; + } else { + activation_layer = + ActivationTensorRT::AddActivation(ctx, activation_type_, 0, 0, 0, pooling_layer->getOutput(0), device_id_); + if (activation_layer == nullptr) { + MS_LOG(ERROR) << "addActivation for pool failed"; + return RET_ERROR; + } + activation_layer->setName((op_name_ + "_activation").c_str()); + } + nvinfer1::ITensor *out_trt_tensor = activation_layer->getOutput(0); + out_trt_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{out_trt_tensor, Format::NCHW, false}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +int PoolTensorRT::ParseParams() { + int in_h = in_tensors_[0].Shape()[kNHWC_H]; + int in_w = in_tensors_[0].Shape()[kNHWC_W]; + int out_h = 
out_tensors_[0].Shape()[kNHWC_H]; + int out_w = out_tensors_[0].Shape()[kNHWC_W]; + int kernel_h; + int kernel_w; + switch (type_) { + case (schema::PrimitiveType_AvgPoolFusion): { + const schema::AvgPoolFusion *pool_primitive = this->GetPrimitive()->value_as_AvgPoolFusion(); + if (pool_primitive == nullptr) { + MS_LOG(ERROR) << "convert PoolFusion failed: " << op_name_; + return RET_ERROR; + } + pooling_type_ = nvinfer1::PoolingType::kAVERAGE; + + auto stride = pool_primitive->strides(); + if (stride == nullptr) { + MS_LOG(ERROR) << "get stride failed: " << op_name_; + return RET_ERROR; + } + stride_ = std::vector(stride->begin(), stride->end()); + kernel_h = in_h - (out_h - 1) * stride_[0]; + kernel_w = in_w - (out_w - 1) * stride_[1]; + auto kernel_size = pool_primitive->kernel_size(); + if (kernel_size == nullptr) { + kernel_size_.push_back(kernel_h); + kernel_size_.push_back(kernel_w); + MS_LOG(WARNING) << op_name_ << "don't has kernel size, calculate kernel size on ms tensor, kernel_h is " + << kernel_h << ", kernel_w is " << kernel_w; + } else { + kernel_size_ = std::vector(kernel_size->begin(), kernel_size->end()); + } + auto padding = pool_primitive->pad(); + if (padding != nullptr && padding->size() != DIMENSION_4D) { + MS_LOG(ERROR) << op_name_ << "has invalid pad dims: " << padding->size(); + return RET_ERROR; + } else if (padding == nullptr || padding->size() == 0) { + padding_ = std::vector(DIMENSION_4D, 0); + } else { + padding_ = std::vector(padding->begin(), padding->end()); + } + + pad_mode_ = pool_primitive->pad_mode(); + activation_type_ = pool_primitive->activation_type(); + break; + } + case (schema::PrimitiveType_MaxPoolFusion): { + const schema::MaxPoolFusion *pool_primitive = this->GetPrimitive()->value_as_MaxPoolFusion(); + if (pool_primitive == nullptr) { + MS_LOG(ERROR) << "convert PoolFusion failed: " << op_name_; + return RET_ERROR; + } + pooling_type_ = nvinfer1::PoolingType::kMAX; + + auto kernel_size = pool_primitive->kernel_size(); + if (kernel_size == nullptr) { + MS_LOG(ERROR) << "get kernel size failed: " << op_name_; + return RET_ERROR; + } + kernel_size_ = std::vector(kernel_size->begin(), kernel_size->end()); + + auto stride = pool_primitive->strides(); + if (stride == nullptr) { + MS_LOG(ERROR) << "get stride failed: " << op_name_; + return RET_ERROR; + } + stride_ = std::vector(stride->begin(), stride->end()); + kernel_h = in_h - (out_h - 1) * stride_[0]; + kernel_w = in_w - (out_w - 1) * stride_[1]; + auto padding = pool_primitive->pad(); + if (padding == nullptr) { + MS_LOG(INFO) << "get padding is null, set to default 0: " << op_name_; + padding_ = {0, 0, 0, 0}; + } else { + padding_ = std::vector(padding->begin(), padding->end()); + } + + pad_mode_ = pool_primitive->pad_mode(); + activation_type_ = pool_primitive->activation_type(); + break; + } + default: { + MS_LOG(ERROR) << "unsupported primitive type of " << type_ << " for node: " << op_name_; + return RET_ERROR; + } + } + // some model kernel size is large than hw, correct it + if (kernel_size_[0] > in_h || kernel_size_[1] > in_w) { + MS_LOG(WARNING) << op_name_ << " kernel size is larger than input size"; + kernel_size_[0] = kernel_size_[0] > kernel_h ? kernel_h : kernel_size_[0]; + kernel_size_[1] = kernel_size_[1] > kernel_w ? 
kernel_w : kernel_size_[1]; + } + return RET_OK; +} + +void PoolTensorRT::AddParams(nvinfer1::IPoolingLayer *pooling_layer) { + nvinfer1::Dims stride_dims = ConvertCudaDims(stride_); + if (stride_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return; + } + pooling_layer->setStrideNd(stride_dims); + if (pad_mode_ == schema::PadMode::PadMode_SAME) { + pooling_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } else { + nvinfer1::Dims dims{}; + dims.nbDims = DIMENSION_2D; + dims.d[0] = padding_[0]; + dims.d[1] = padding_[DIMENSION_2D]; + pooling_layer->setPaddingNd(dims); + } +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_AvgPoolFusion, PoolTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_MaxPoolFusion, PoolTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.h new file mode 100644 index 00000000000..de8003ca08c --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/pool_tensorrt.h @@ -0,0 +1,55 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class PoolTensorRT : public TensorRTOp { + public: + PoolTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~PoolTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int ParseParams(); + + void AddParams(nvinfer1::IPoolingLayer *pooling_layer); + + std::vector kernel_size_; + + std::vector stride_; + + std::vector padding_; + + nvinfer1::PoolingType pooling_type_; + + schema::PadMode pad_mode_; + + schema::ActivationType activation_type_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_POOL_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.cc new file mode 100644 index 00000000000..e3968264654 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
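When the AvgPool primitive carries no kernel_size, ParseParams above derives it from the tensor shapes as kernel = in - (out - 1) * stride, the inverse of out = (in - kernel) / stride + 1 for a window with no padding and exact division. A one-function sketch with a worked example (the clamp to 1 is an assumption for robustness, not taken from the patch):

#include <algorithm>

// E.g. in_dim = 32, out_dim = 16, stride = 2  ->  kernel = 32 - 15 * 2 = 2.
int InferPoolKernel(int in_dim, int out_dim, int stride) {
  return std::max(1, in_dim - (out_dim - 1) * stride);
}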
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "src/runtime/delegate/tensorrt/op/prelu_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int PReluTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size() << " : " << op_name_; + return RET_ERROR; + } + + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size() << " : " << op_name_; + return RET_ERROR; + } + return RET_OK; +} + +int PReluTensorRT::AddInnerOp(TensorRTContext *ctx) { + ITensorHelper prelu_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &prelu_input); + if (ret != RET_OK || prelu_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_; + return ret; + } + int input_nbdims = prelu_input.trt_tensor_->getDimensions().nbDims; + int slope_nbdims = in_tensors_[1].Shape().size(); + auto slope = tensorrt_in_tensors_[1].trt_tensor_; + if (input_nbdims != slope_nbdims) { + slope = ConvertTensorWithExpandDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_ + "_slope"); + tensorrt_in_tensors_[1].trt_tensor_ = slope; + } + if (slope == nullptr) { + MS_LOG(ERROR) << "add const input tensor failed for " << op_name_; + return RET_ERROR; + } + ITensorHelper slope_helper; + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1], &slope_helper); + if (ret != RET_OK || slope_helper.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim slope tensor failed for " << op_name_; + return ret; + } + + auto *prelu_layer = ctx->network()->addParametricReLU(*prelu_input.trt_tensor_, *slope_helper.trt_tensor_); + if (prelu_layer == nullptr) { + MS_LOG(ERROR) << "addParameticReLU failed for TensorRT : " << op_name_; + return RET_ERROR; + } + + nvinfer1::ITensor *out_tensor = prelu_layer->getOutput(0); + out_tensor->setName((op_name_ + "_0").c_str()); + this->AddInnerOutTensors(ITensorHelper{out_tensor, prelu_input.format_, prelu_input.same_format_}); + this->layer_ = prelu_layer; + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_PReLUFusion, PReluTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.h new file mode 100644 index 00000000000..3d6505b8afd --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/prelu_tensorrt.h @@ -0,0 +1,39 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
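addParametricReLU requires the slope tensor to have the same rank as the input, which is why the PReLU op above expands a low-rank slope with ConvertTensorWithExpandDims (defined in tensorrt_utils, not in this hunk). The usual right-aligned, broadcast-style rank expansion, also used to build the expected shape in ProcessWeightTensor earlier, looks like the sketch below; the helper name and exact shape policy are assumptions:

#include <cstdint>
#include <vector>

// Right-align a low-rank shape against a target rank by padding with leading 1s,
// e.g. a per-channel slope {C} against a 4-D NHWC input becomes {1, 1, 1, C}.
std::vector<int64_t> ExpandDimsShape(const std::vector<int64_t> &shape, size_t target_rank) {
  std::vector<int64_t> expanded(target_rank, 1);
  for (size_t i = 0; i < shape.size(); ++i) {
    expanded[target_rank - shape.size() + i] = shape[i];
  }
  return expanded;
}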
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class PReluTensorRT : public TensorRTOp { + public: + PReluTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~PReluTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_PRELU_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.cc new file mode 100644 index 00000000000..e8cdeb23281 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.cc @@ -0,0 +1,139 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
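// Illustrative preface (not part of the patch): IReduceLayer has no L2 mode, so the
// ReduceFusion implementation below lowers ReduceL2 as x*x -> reduce -> sqrt
// (the reduce mode is assumed to map to a sum). A scalar reference of that
// composition, with a hypothetical name:
#include <cmath>
#include <vector>

inline float ReduceL2Reference(const std::vector<float> &values) {
  float sum_of_squares = 0.0f;
  for (float v : values) {
    sum_of_squares += v * v;  // addElementWise(kPROD) of the tensor with itself
  }
  return std::sqrt(sum_of_squares);  // addUnary(kSQRT) applied to the reduced value
}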
+ */ + +#include +#include "src/runtime/delegate/tensorrt/op/reduce_tensorrt.h" + +namespace mindspore::lite { +int ReduceTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + } + return RET_OK; +} + +int ReduceTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + auto reduce_op = op_primitive_->value_as_ReduceFusion(); + if (reduce_op == nullptr) { + MS_LOG(ERROR) << "convert failed"; + return RET_ERROR; + } + bool keep_dims = reduce_op->keep_dims(); + out_format_ = tensorrt_in_tensors_[0].format_; + nvinfer1::ITensor *reduce_input = tensorrt_in_tensors_[0].trt_tensor_; + MS_LOG(DEBUG) << "origin input " << GetTensorFormat(tensorrt_in_tensors_[0]); + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + !SameDims(tensorrt_in_tensors_[0].trt_tensor_->getDimensions(), in_tensors_[0].Shape())) { + if (tensorrt_in_tensors_[0].format_ == Format::NCHW) { + // NCHW->NHWC + nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + reduce_input = transpose_layer->getOutput(0); + out_format_ = Format::NHWC; + this->transpose_layer_ = transpose_layer; + } else if (tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + reduce_input = transpose_layer->getOutput(0); + out_format_ = Format::NCHW; + this->transpose_layer_ = transpose_layer; + } else { + MS_LOG(WARNING) << "input tensor format needs check: " << op_name_; + } + } + MS_LOG(DEBUG) << "after transpose input " << GetTensorFormat(reduce_input, out_format_, true); + if (reduce_op->mode() == schema::ReduceMode::ReduceMode_ReduceL2) { + // x^2 + auto *pow2_layer = + ctx->network()->addElementWise(*reduce_input, *reduce_input, nvinfer1::ElementWiseOperation::kPROD); + CHECK_NULL_RETURN(pow2_layer); + pow2_layer->setName((op_name_ + "_pow2").c_str()); + + reduce_input = pow2_layer->getOutput(0); + CHECK_NULL_RETURN(reduce_input); + } + + uint32_t reduceAxis = GetAxis(); + auto reduce_operation_opt = TryConvertTRTReduceMode(reduce_op->mode()); + if (!reduce_operation_opt) { + MS_LOG(WARNING) << "invalid reduce for TensorRT, need check: " << static_cast(reduce_op->mode()); + return RET_ERROR; + } + nvinfer1::IReduceLayer *layer = + ctx->network()->addReduce(*reduce_input, reduce_operation_opt.value(), reduceAxis, keep_dims); + CHECK_NULL_RETURN(layer); + layer->setName(op_name_.c_str()); + this->layer_ = layer; + + nvinfer1::ITensor *out_tensor = layer->getOutput(0); + CHECK_NULL_RETURN(out_tensor); + + if (reduce_op->mode() == 
schema::ReduceMode::ReduceMode_ReduceL2) { + auto sqrt_layer = ctx->network()->addUnary(*out_tensor, nvinfer1::UnaryOperation::kSQRT); + CHECK_NULL_RETURN(sqrt_layer); + sqrt_layer->setName((op_name_ + "_sqrt").c_str()); + out_tensor = sqrt_layer->getOutput(0); + } + out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{out_tensor, out_format_, true}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +uint32_t ReduceTensorRT::GetAxis() { + // axis + uint32_t reduceAxis = 0; + mindspore::MSTensor axis_tensor = this->in_tensors_[1]; + if (axis_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "invalid axis_tensor"; + return reduceAxis; + } + if (axis_tensor.DataType() != DataType::kNumberTypeInt32) { + MS_LOG(WARNING) << "not int data type"; + } + int *axis_data = reinterpret_cast(axis_tensor.MutableData()); + CHECK_NULL_RETURN(axis_data); + for (int i = 0; i < axis_tensor.ElementNum(); i++) { + int format_axis_data = (*axis_data == -1) ? in_tensors_[0].Shape().size() - 1 : *axis_data; + MS_LOG(DEBUG) << op_name_ << " reduceAxis at index : " << *axis_data; + reduceAxis |= 1u << format_axis_data; + axis_data++; + } + return reduceAxis; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ReduceFusion, ReduceTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.h new file mode 100644 index 00000000000..d01927f704b --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/reduce_tensorrt.h @@ -0,0 +1,44 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
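// Illustrative sketch (not part of the patch): addReduce() encodes the reduction axes
// as a bit mask with one bit per dimension, which is what GetAxis() above builds;
// axis -1 is first normalized to the last dimension. Helper name is hypothetical.
#include <cstdint>
#include <vector>

inline uint32_t AxesToReduceMask(const std::vector<int> &axes, int rank) {
  uint32_t mask = 0;
  for (int axis : axes) {
    int normalized = (axis == -1) ? rank - 1 : axis;
    mask |= 1u << normalized;
  }
  return mask;
}
// AxesToReduceMask({1, 2}, 4) == 0b0110: reduce over dimensions 1 and 2 of a 4-D tensor.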
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_ + +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ReduceTensorRT : public TensorRTOp { + public: + ReduceTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ReduceTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + uint32_t GetAxis(); + Format out_format_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.cc new file mode 100644 index 00000000000..7c9256992cb --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.cc @@ -0,0 +1,126 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
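// Illustrative preface (not part of the patch): with a group of N ranks, NCCL
// ReduceScatter leaves each rank holding 1/N of the reduced buffer, so the plugin
// below reports an output whose first dimension is the input's first dimension
// divided by the group size. Helper name is hypothetical.
#include <cstdint>

inline int64_t ReduceScatterOutputDim0(int64_t input_dim0, int rank_size) {
  return (input_dim0 + rank_size - 1) / rank_size;  // ceiling division, as kCEIL_DIV does
}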
+ */ + +#include "src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h" +#include +#include +#include "NvInferRuntimeCommon.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(ReduceScatterPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int ReduceScatterTensorRT::IsSupport(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { +#ifndef LITE_CUDA_DISTRIBUTION + MS_LOG(ERROR) + << "Unsupported package for gpu distribution feature, please recompile with MS_ENABLE_CUDA_DISTRIBUTION set to on."; + return RET_ERROR; +#else + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +#endif +} + +int ReduceScatterTensorRT::AddInnerOp(TensorRTContext *ctx) { + nvinfer1::ITensor *inputTensors[] = {tensorrt_in_tensors_[0].trt_tensor_}; + auto reduce_op = op_primitive_->value_as_ReduceScatter(); + if (reduce_op == nullptr) { + MS_LOG(ERROR) << "convert failed for " << op_name_; + return RET_ERROR; + } + auto reduce_mode = reduce_op->mode(); + auto rank = GetGPUGroupSize(); + auto plugin = std::make_shared(op_name_, reduce_mode, rank, device_id_); + MS_LOG(INFO) << op_name_ << " group size: " << rank << ", rank id: " << GetRankID(); + nvinfer1::IPluginV2Layer *reduce_scatter_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); + if (reduce_scatter_layer == nullptr) { + MS_LOG(ERROR) << "create ReduceScatter layer failed for: " << op_name_; + return RET_ERROR; + } + nvinfer1::ITensor *reduce_scatter_out = reduce_scatter_layer->getOutput(0); + reduce_scatter_layer->setName(op_name_.c_str()); + reduce_scatter_out->setName((op_name_ + "_output").c_str()); + this->layer_ = reduce_scatter_layer; + this->AddInnerOutTensors( + ITensorHelper{reduce_scatter_out, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} + +// ReduceScatterPlugin +int ReduceScatterPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) noexcept { + MS_LOG(INFO) << "ReduceScatter run at rank id: " << GetRankID() << " stream: " << stream; + nvinfer1::Dims output_dims = outputDesc[0].dims; + int recieve_element_cnt = + std::accumulate(output_dims.d, output_dims.d + output_dims.nbDims, 1, std::multiplies()); + const void *input = inputs[0]; + void *output = outputs[0]; + auto data_type = inputDesc->type; + auto ret = DistributionCollective::instance().ReduceScatterWrapper(input, output, recieve_element_cnt, data_type, + red_mode_, stream, NCCL_WORLD_GROUP); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ReduceScatter nccl run failed for " << layer_name_; + return ret; + } + return RET_OK; +} + +nvinfer1::IPluginV2DynamicExt *ReduceScatterPlugin::clone() const noexcept { + auto *plugin = new ReduceScatterPlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs ReduceScatterPlugin::getOutputDimensions(int 
outputIndex, const nvinfer1::DimsExprs *inputs, + int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs out_dims{}; + out_dims.nbDims = inputs->nbDims; + auto rank_dim = exprBuilder.constant(rank_); + out_dims.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kCEIL_DIV, *inputs->d[0], *rank_dim); + for (int i = 1; i < inputs->nbDims; i++) { + out_dims.d[i] = inputs->d[i]; + } + return out_dims; +} + +size_t ReduceScatterPlugin::getSerializationSize() const noexcept { return sizeof(schema::ReduceMode); } + +void ReduceScatterPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &red_mode_, sizeof(schema::ReduceMode)); +} + +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ReduceScatter, ReduceScatterTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h new file mode 100644 index 00000000000..297397922a0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/reducescatter_tensorrt.h @@ -0,0 +1,83 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_ +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" + +namespace mindspore::lite { +constexpr char *REDUCESCATTER_PLUGIN_NAME{"ReduceScatterPlugin"}; +class ReduceScatterTensorRT : public TensorRTOp { + public: + ReduceScatterTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ReduceScatterTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; + +class ReduceScatterPlugin : public TensorRTPlugin { + public: + ReduceScatterPlugin(const std::string name, schema::ReduceMode red_mode, int rank, uint32_t device_id) + : TensorRTPlugin(name, std::string(REDUCESCATTER_PLUGIN_NAME), device_id), red_mode_(red_mode), rank_(rank) {} + + ReduceScatterPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(REDUCESCATTER_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + red_mode_ = static_cast(fields[0].data)[0]; + rank_ = static_cast(fields[1].data)[0]; + } + + ReduceScatterPlugin(const char *name, const void *serialData, size_t 
serialLength) + : TensorRTPlugin(std::string(name), std::string(REDUCESCATTER_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &red_mode_, sizeof(schema::ReduceMode)); + DeserializeValue(&serialData, &serialLength, &rank_, sizeof(int)); + } + + ReduceScatterPlugin() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + + private: + int rank_{0}; + schema::ReduceMode red_mode_; +}; +class ReduceScatterPluginCreater : public TensorRTPluginCreater { + public: + ReduceScatterPluginCreater() : TensorRTPluginCreater(std::string(REDUCESCATTER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_REDUCESCATTER_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.cc new file mode 100644 index 00000000000..65776da3fcf --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.cc @@ -0,0 +1,230 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "src/runtime/delegate/tensorrt/op/resize_tensorrt.h" +#include "nnacl/nnacl_common.h" + +namespace mindspore::lite { +int ResizeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + } + resize_op_ = op_primitive_->value_as_Resize(); + if (resize_op_ == nullptr) { + MS_LOG(ERROR) << "convert failed " << op_name_; + return RET_ERROR; + } + if (resize_op_->method() == schema::ResizeMethod_LINEAR) { + MS_LOG(WARNING) << "TensorRT linear resize has precision issue, using cpu instead for " << op_name_; + return RET_ERROR; + } + dynamic_shape_params_.support_hw_dynamic_ = + (resize_op_->new_height() > 0 && resize_op_->new_width() > 0) ? 
false : true; + // constant new hw op don't support hw resize + return RET_OK; +} + +int ResizeTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + + nvinfer1::ITensor *resize_in_tensor = tensorrt_in_tensors_[0].trt_tensor_; + MS_LOG(DEBUG) << "origin input " << GetTensorFormat(tensorrt_in_tensors_[0]); + + if (resize_in_tensor->getDimensions().nbDims == DIMENSION_4D && tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // NHWC->NCHW + nvinfer1::IShuffleLayer *transpose_layer = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + resize_in_tensor = transpose_layer->getOutput(0); + this->transpose_layer_ = transpose_layer; + } + MS_LOG(DEBUG) << "after transpose input " << GetTensorFormat(resize_in_tensor, Format::NCHW, false); + + nvinfer1::IResizeLayer *resize_layer = ctx->network()->addResize(*resize_in_tensor); + if (resize_layer == nullptr) { + MS_LOG(ERROR) << "create resize layer failed for " << op_name_; + return RET_ERROR; + } + int ret = SetOutputDims(resize_in_tensor, resize_layer); + if (ret != RET_OK) { + MS_LOG(ERROR) << "SetOutputDims failed for " << op_name_; + return RET_ERROR; + } + + ret = SetParams(resize_layer); + if (ret != RET_OK) { + MS_LOG(ERROR) << "SetParams failed for " << op_name_; + return RET_ERROR; + } + + resize_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{resize_layer->getOutput(0), Format::NCHW, false}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + this->layer_ = resize_layer; + return RET_OK; +} + +int ResizeTensorRT::SetOutputDims(nvinfer1::ITensor *resize_in_tensor, nvinfer1::IResizeLayer *resize_layer) { + nvinfer1::Dims in_dims = resize_in_tensor->getDimensions(); + if (in_tensors_.size() == 1 && !dynamic_shape_params_.support_dynamic_ && in_dims.nbDims == DIMENSION_4D) { + nvinfer1::Dims4 new_dims(in_dims.d[0], in_dims.d[1], resize_op_->new_height(), resize_op_->new_width()); // nchw + resize_layer->setOutputDimensions(new_dims); // static shape + } else if (in_tensors_.size() == 1 && !dynamic_shape_params_.support_hw_dynamic_ && + dynamic_shape_params_.support_dynamic_ && in_dims.nbDims == DIMENSION_4D) { + // hw is static, but has dynamic batch size + float scales[DIMENSION_4D]{1, 1, 1, 1}; + scales[kNCHW_H] = static_cast(resize_op_->new_height()) / static_cast(in_dims.d[kNCHW_H]); + scales[kNCHW_W] = static_cast(resize_op_->new_width()) / static_cast(in_dims.d[kNCHW_W]); + resize_layer->setScales(scales, DIMENSION_4D); + } else { + auto shape_value_tensor = in_tensors_[1]; + if (shape_value_tensor.Data() == nullptr && tensorrt_in_tensors_.size() >= INPUT_SIZE2) { + // dynamic output shape + resize_layer->setInput(1, *tensorrt_in_tensors_[1].trt_tensor_); + } else { + std::vector out_shape; + ParseValueFromShapeTensor(shape_value_tensor, &out_shape); + if (SameDims(out_shape, out_tensors_[0].Shape())) { + // static dims + if (out_shape.size() == DIMENSION_4D) { + // convert nhwc to nchw + auto channel = out_shape[out_shape.size() - 1]; + out_shape.insert(out_shape.begin() + 1, channel); + out_shape.erase(out_shape.begin() + out_shape.size() - 1); + } + resize_layer->setOutputDimensions(ConvertCudaDims(out_shape)); + } else if 
(IsScaleOutputDim(in_tensors_[0].Shape(), out_tensors_[0].Shape(), out_shape)) { + // scale dims + float scales[DIMENSION_4D]{1, 1, 1, 1}; + scales[kNCHW_H] = + static_cast(out_tensors_[0].Shape()[kNHWC_H]) / static_cast(in_tensors_[0].Shape()[kNHWC_H]); + scales[kNCHW_W] = + static_cast(out_tensors_[0].Shape()[kNHWC_W]) / static_cast(in_tensors_[0].Shape()[kNHWC_W]); + resize_layer->setScales(scales, DIMENSION_4D); + } else if (out_tensors_[0].Shape().size() == DIMENSION_4D) { + MS_LOG(DEBUG) << op_name_ << " output shape tensor value is const, but set to scales for dynamic input shape."; + float scales[out_tensors_[0].Shape().size()]; + for (size_t i = 0; i < out_tensors_[0].Shape().size(); i++) { + scales[i] = static_cast(out_tensors_[0].Shape()[i]) / static_cast(in_tensors_[0].Shape()[i]); + } + // change to nchw + scales[kNCHW_W] = scales[kNHWC_W]; + scales[kNCHW_H] = scales[kNHWC_H]; + scales[kNCHW_C] = 1; + MS_LOG(DEBUG) << op_name_ << "scale at H " << kNCHW_H << ": " << scales[kNCHW_H] << ", W " << kNCHW_W << ": " + << scales[kNCHW_W]; + resize_layer->setScales(scales, out_tensors_[0].Shape().size()); + } else { + MS_LOG(ERROR) << "resize dims needs check for " << op_name_; + return RET_ERROR; + } + } + } + return RET_OK; +} + +void ResizeTensorRT::ParseValueFromShapeTensor(const mindspore::MSTensor &shape_value_tensor, + std::vector *out_shape) { + switch (shape_value_tensor.DataType()) { + case DataType::kNumberTypeFloat32: { + const float *shape_data_fp32 = static_cast(shape_value_tensor.Data().get()); + for (int i = 0; i < shape_value_tensor.ElementNum(); i++) { + out_shape->push_back(*(shape_data_fp32 + i)); + } + break; + } + case DataType::kNumberTypeFloat16: { + const uint16_t *shape_data_fp16 = static_cast(shape_value_tensor.Data().get()); + for (int i = 0; i < shape_value_tensor.ElementNum(); i++) { + out_shape->push_back(ShortToFloat32(*(shape_data_fp16 + i))); + } + break; + } + case DataType::kNumberTypeInt32: { + const int *shape_data_fp16 = static_cast(shape_value_tensor.Data().get()); + for (int i = 0; i < shape_value_tensor.ElementNum(); i++) { + out_shape->push_back(*(shape_data_fp16 + i)); + } + break; + } + default: + MS_LOG(WARNING) << op_name_ + << " more datatype need to check: " << static_cast(shape_value_tensor.DataType()); + break; + } + if (out_shape->size() == DIMENSION_2D && + tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D) { + // out_shape: origin_n, out_shape[0], out_shape[1], origin_c + out_shape->insert(out_shape->begin(), + tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0]); // batch size is dynamic + out_shape->push_back(in_tensors_[0].Shape()[kNHWC_C]); // channel is const + } +} + +bool ResizeTensorRT::IsScaleOutputDim(const std::vector &in_shape, const std::vector &out_shape, + const std::vector &shape_tensor_val) { + if (out_shape.size() != DIMENSION_4D) { + MS_LOG(WARNING) << "dims count needs check for " << op_name_; + return false; + } + if (in_shape.size() != out_shape.size() || shape_tensor_val.size() != in_shape.size()) { + MS_LOG(WARNING) << "tensor shape is not same for " << op_name_; + return false; + } + for (size_t i = 0; i < in_shape.size(); i++) { + if (std::abs(in_shape[i] * shape_tensor_val[i] - out_shape[i]) > 1e-6) { + return false; + } + } + return true; +} + +int ResizeTensorRT::SetParams(nvinfer1::IResizeLayer *resize_layer) { + auto method = resize_op_->method(); + std::map method_map = { + {schema::ResizeMethod_LINEAR, nvinfer1::ResizeMode::kLINEAR}, + {schema::ResizeMethod_NEAREST, 
nvinfer1::ResizeMode::kNEAREST}}; + if (method_map.find(method) == method_map.end()) { + MS_LOG(ERROR) << op_name_ << " unsupported resize mode " << EnumNameResizeMethod(method); + return RET_ERROR; + } + resize_layer->setResizeMode(method_map.at(method)); + + // unsupported for trt6, but support setCoordinateTransformation() in version8 + auto coordinate_transform_mode = resize_op_->coordinate_transform_mode(); + if (coordinate_transform_mode != schema::CoordinateTransformMode_ASYMMETRIC) { + MS_LOG(WARNING) << op_name_ << " has coordinate_transform_mode may not supported: " + << EnumNameCoordinateTransformMode(coordinate_transform_mode); + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Resize, ResizeTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.h new file mode 100644 index 00000000000..645436caff3 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/resize_tensorrt.h @@ -0,0 +1,52 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_ + +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ResizeTensorRT : public TensorRTOp { + public: + ResizeTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ResizeTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int SetOutputDims(nvinfer1::ITensor *resize_in_tensor, nvinfer1::IResizeLayer *resize_layer); + + void ParseValueFromShapeTensor(const mindspore::MSTensor &shape_value_tensor, std::vector *out_shape); + + bool IsScaleOutputDim(const std::vector &in_shape, const std::vector &out_shape, + const std::vector &shape_tensor_val); + + int SetParams(nvinfer1::IResizeLayer *resize_layer); + + const schema::Resize *resize_op_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_RESIZE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.cc new file mode 100644 index 00000000000..02f2c0de383 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.cc @@ -0,0 +1,227 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/scale_tensorrt.h" +#include "src/runtime/delegate/tensorrt/op/activation_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +constexpr int SCALE_INDEX = 1; +constexpr int SHIFT_INDEX = 2; +constexpr int POWER_INDEX = 3; + +int ScaleTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2 && in_tensors.size() != INPUT_SIZE3 && in_tensors.size() != INPUT_SIZE4) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int ScaleTensorRT::AddInnerOp(TensorRTContext *ctx) { + CHECK_NULL_RETURN(ctx); + auto scale_op = op_primitive_->value_as_ScaleFusion(); + CHECK_NULL_RETURN(scale_op); + + schema::ActivationType activation_type = scale_op->activation_type(); + // mode of scale + axis_ = scale_op->axis(); + axis_ = axis_ < 0 ? static_cast(in_tensors_[0].Shape().size() + axis_) : axis_; + out_format_ = tensorrt_in_tensors_[0].format_; + out_same_format_ = tensorrt_in_tensors_[0].same_format_; + mode_ = GetScaleMode(axis_); + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(tensorrt_in_tensors_[0]); + + nvinfer1::ITensor *scale_in_tensor = PreProcessInputTensor(ctx); + if (scale_in_tensor == nullptr) { + MS_LOG(ERROR) << "PreProcessInputTensor failed: " << op_name_; + return RET_ERROR; + } + + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(scale_in_tensor, out_format_, out_same_format_); + + nvinfer1::ITensor *op_out_tensor{nullptr}; + if (scale_in_tensor->getDimensions().nbDims == DIMENSION_4D) { + op_out_tensor = RunAs4DimsScale(ctx, scale_in_tensor); + } else { + op_out_tensor = RunAsMutiDimsScale(ctx, scale_in_tensor); + } + CHECK_NULL_RETURN(op_out_tensor); + + // add activation + if (activation_type != schema::ActivationType::ActivationType_NO_ACTIVATION) { + auto activation_layer = ActivationTensorRT::AddActivation(ctx, activation_type, 0, 0, 0, op_out_tensor, device_id_); + CHECK_NULL_RETURN(activation_layer); + activation_layer->setName((op_name_ + "_activation").c_str()); + op_out_tensor = activation_layer->getOutput(0); + } + + op_out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{op_out_tensor, out_format_, out_same_format_}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +nvinfer1::ITensor *ScaleTensorRT::PreProcessInputTensor(TensorRTContext *ctx) { + nvinfer1::ITensor *scale_in_tensor = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + mode_ == nvinfer1::ScaleMode::kCHANNEL) { + // per channel input format should be 
nchw, otherwise should be same with scale nhwc + // transpose: NHWC->NCHW + if ((tensorrt_in_tensors_[0].format_ == Format::NHWC && axis_ == kNHWC_C) || + (tensorrt_in_tensors_[0].same_format_ == true && axis_ == kNHWC_C)) { + nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return nullptr; + } + transpose_layer_in->setName((op_name_ + "_transpose2NCHW").c_str()); + scale_in_tensor = transpose_layer_in->getOutput(0); + out_format_ = Format::NCHW; + out_same_format_ = !out_same_format_; + } else if (out_format_ != Format::NCHW && axis_ != kNCHW_C) { + MS_LOG(WARNING) << op_name_ << " out format (NHWC:1, NCHW:0) infer as " << out_format_ << ", and axis is " + << axis_; + } + } else if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NCHW && mode_ == nvinfer1::ScaleMode::kELEMENTWISE) { + // transpose: NCHW->NHWC + nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return nullptr; + } + transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str()); + scale_in_tensor = transpose_layer_in->getOutput(0); + out_format_ = Format::NHWC; + out_same_format_ = true; + } + return scale_in_tensor; +} + +nvinfer1::ScaleMode ScaleTensorRT::GetScaleMode(int64_t axis) { + nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kUNIFORM; + auto input_data_shape = in_tensors_[0].Shape(); + auto input_weight_shape = in_tensors_[1].Shape(); + int total = std::accumulate(input_data_shape.begin(), input_data_shape.end(), 1, std::multiplies()); + if (input_weight_shape.size() == 0 || (input_weight_shape.size() == 1 && input_weight_shape[0] == 1)) { + mode = nvinfer1::ScaleMode::kUNIFORM; + } else if ((axis < static_cast(input_data_shape.size()) && input_weight_shape.size() == 1 && + input_data_shape[axis] == input_weight_shape[0]) || + (input_data_shape.size() == DIMENSION_4D && axis == DIMENSION_3D)) { + mode = nvinfer1::ScaleMode::kCHANNEL; + } else if (input_weight_shape.size() == 1 && input_weight_shape[0] == total) { + mode = nvinfer1::ScaleMode::kELEMENTWISE; + } else { + MS_LOG(ERROR) << "ScaleMode create failed: " << op_name_; + return mode; + } + MS_LOG(DEBUG) << op_name_ << " ScaleMode(UNIFORM 0, CHANNEL 1, ELEMENTWISE 2): " << static_cast(mode); + return mode; +} + +nvinfer1::ITensor *ScaleTensorRT::RunAs4DimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor) { + bool nd = false; + // (input * scale + shift) ^ power + nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, nullptr, 0}; + if (in_tensors_.size() > SCALE_INDEX) { + scale.values = in_tensors_[SCALE_INDEX].MutableData(); + MS_ASSERT(scale.values); + scale.count = in_tensors_[SCALE_INDEX].ElementNum(); + scale.type = ConvertDataType(in_tensors_[SCALE_INDEX].DataType()); + shift.type = scale.type; + power.type = scale.type; + nd = in_tensors_[1].Shape().size() == 1 ? 
false : true; + } + if (in_tensors_.size() > SHIFT_INDEX) { + shift.values = in_tensors_[SHIFT_INDEX].MutableData(); + MS_ASSERT(shift.values); + shift.count = in_tensors_[SHIFT_INDEX].ElementNum(); + } + if (in_tensors_.size() > POWER_INDEX) { + power.values = in_tensors_[POWER_INDEX].MutableData(); + MS_ASSERT(power.values); + power.count = in_tensors_[POWER_INDEX].ElementNum(); + } + nvinfer1::IScaleLayer *cal_layer = nullptr; + + if (nd) { + MS_LOG(WARNING) << "multi dims ScaleMode enter"; + cal_layer = ctx->network()->addScaleNd(*scale_in_tensor, mode_, shift, scale, power, axis_); + } else { + cal_layer = ctx->network()->addScale(*scale_in_tensor, mode_, shift, scale, power); + } + + if (cal_layer == nullptr) { + MS_LOG(ERROR) << "addScaleNd failed for: " << op_name_; + return nullptr; + } + cal_layer->setName(op_name_.c_str()); + this->layer_ = cal_layer; + return cal_layer->getOutput(0); +} + +nvinfer1::ITensor *ScaleTensorRT::RunAsMutiDimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor) { + auto scale_tensor = ConvertConstantTensorWithDims(ctx, in_tensors_[1], in_tensors_[0].Shape(), op_name_); + if (scale_tensor == nullptr) { + MS_LOG(ERROR) << "ConvertConstantTensorWithDims failed for " << op_name_; + return nullptr; + } + auto mul_layer = + ctx->network()->addElementWise(*scale_in_tensor, *scale_tensor, nvinfer1::ElementWiseOperation::kPROD); + if (mul_layer == nullptr) { + MS_LOG(ERROR) << "add mul failed for " << op_name_; + return nullptr; + } + mul_layer->setName((op_name_ + "_scale").c_str()); + layer_ = mul_layer; + nvinfer1::ITensor *out_tensor = mul_layer->getOutput(0); + // add shift + if (in_tensors_.size() >= INPUT_SIZE3) { + auto shift_tensor = ConvertConstantTensorWithDims(ctx, in_tensors_[SHIFT_INDEX], in_tensors_[0].Shape(), op_name_); + if (shift_tensor == nullptr) { + MS_LOG(ERROR) << "ConvertConstantTensorWithDims failed for " << op_name_; + return nullptr; + } + auto shift_layer = ctx->network()->addElementWise(*out_tensor, *shift_tensor, nvinfer1::ElementWiseOperation::kSUM); + if (shift_layer == nullptr) { + MS_LOG(ERROR) << "add bias failed for " << op_name_; + return nullptr; + } + shift_layer->setName((op_name_ + "_shift").c_str()); + out_tensor = shift_layer->getOutput(0); + } + if (in_tensors_.size() == INPUT_SIZE4) { + MS_LOG(WARNING) << op_name_ << " has power"; + return nullptr; + } + return out_tensor; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ScaleFusion, ScaleTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.h new file mode 100644 index 00000000000..463b7813549 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/scale_tensorrt.h @@ -0,0 +1,57 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
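// Illustrative sketch (not part of the patch): IScaleLayer, used by the Scale op
// above, evaluates (input * scale + shift) ^ power element-wise; the ScaleMode only
// decides how many coefficients are supplied (kUNIFORM: one scalar, kCHANNEL: one per
// channel, kELEMENTWISE: one per element). Scalar reference, hypothetical name:
#include <cmath>

inline float ScaleFusionReference(float x, float scale, float shift, float power) {
  return std::pow(x * scale + shift, power);
}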
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +namespace mindspore::lite { +class ScaleTensorRT : public TensorRTOp { + public: + ScaleTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ScaleTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ScaleMode GetScaleMode(int64_t axis); + + nvinfer1::ITensor *PreProcessInputTensor(TensorRTContext *ctx); + + nvinfer1::ITensor *RunAs4DimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor); + + nvinfer1::ITensor *RunAsMutiDimsScale(TensorRTContext *ctx, nvinfer1::ITensor *scale_in_tensor); + + Format out_format_; + + bool out_same_format_{false}; + + nvinfer1::ScaleMode mode_; + + int64_t axis_{0}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SCALE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.cc new file mode 100644 index 00000000000..58d948ab6d2 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.cc @@ -0,0 +1,99 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
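// Illustrative preface (not part of the patch): nvinfer1::ScatterMode::kND, used
// below, starts from a copy of the data tensor and overwrites the elements addressed
// by `indices` with `updates`. A minimal 1-D reference, with hypothetical names:
#include <cstddef>
#include <vector>

inline std::vector<float> ScatterNdUpdate1D(std::vector<float> data,
                                            const std::vector<int> &indices,
                                            const std::vector<float> &updates) {
  for (size_t i = 0; i < indices.size() && i < updates.size(); ++i) {
    data[indices[i]] = updates[i];  // last write wins for duplicate indices
  }
  return data;
}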
+ */ + +#include +#include "src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int ScatterNdTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { +#if TRT_VERSION_GE(8, 2) + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE3) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size() << " : " << op_name_; + return RET_ERROR; + } + + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size() << " : " << op_name_; + return RET_ERROR; + } + return RET_OK; +#else + MS_LOG(WARNING) << "low TensorRT version don't support Scatter op, please upgrade TensorRT version to 8.2 or higher"; + return RET_ERROR; +#endif +} + +int ScatterNdTensorRT::AddInnerOp(TensorRTContext *ctx) { +#if TRT_VERSION_GE(8, 2) + ITensorHelper scatter_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &scatter_input); + if (ret != RET_OK || scatter_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_; + return ret; + } + if (tensorrt_in_tensors_.size() < INPUT_SIZE3) { + auto indices = ConvertConstantTensor(ctx, in_tensors_[1], op_name_ + "_indice"); + if (indices == nullptr) { + MS_LOG(ERROR) << "add const input tensor failed for " << op_name_; + return RET_ERROR; + } + tensorrt_in_tensors_.push_back(ITensorHelper{indices}); + auto updates = ConvertConstantTensor(ctx, in_tensors_[INPUT_SIZE2], op_name_ + "_update"); + if (updates == nullptr) { + MS_LOG(ERROR) << "add const input tensor failed for " << op_name_; + return RET_ERROR; + } + tensorrt_in_tensors_.push_back(ITensorHelper{updates}); + } + ITensorHelper indices_helper; + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[1], &indices_helper); + if (ret != RET_OK || indices_helper.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim indices tensor failed for " << op_name_; + return ret; + } + ITensorHelper updates_helper; + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[INPUT_SIZE2], &updates_helper); + if (ret != RET_OK || updates_helper.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim update tensor failed for " << op_name_; + return ret; + } + + nvinfer1::IScatterLayer *scatter_layer = ctx->network()->addScatter( + *scatter_input.trt_tensor_, *indices_helper.trt_tensor_, *updates_helper.trt_tensor_, nvinfer1::ScatterMode::kND); + if (scatter_layer == nullptr) { + MS_LOG(ERROR) << "addScatter failed for TensorRT."; + return RET_ERROR; + } + + nvinfer1::ITensor *out_tensor = scatter_layer->getOutput(0); + out_tensor->setName((op_name_ + "_0").c_str()); + this->AddInnerOutTensors(ITensorHelper{out_tensor, scatter_input.format_, scatter_input.same_format_}); + this->layer_ = scatter_layer; + return RET_OK; +#else + MS_LOG(WARNING) << "low TensorRT version don't support Scatter op, please upgrade TensorRT version to 8.2 or higher"; + return RET_ERROR; +#endif +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ScatterNdUpdate, ScatterNdTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h new file mode 100644 index 
00000000000..c8954d206aa --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/scatternd_tensorrt.h @@ -0,0 +1,39 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ScatterNdTensorRT : public TensorRTOp { + public: + ScatterNdTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ScatterNdTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_OP_SCATTERND_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.cc new file mode 100644 index 00000000000..99c016a3665 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.cc @@ -0,0 +1,69 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
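// Illustrative preface (not part of the patch): the Shape implementation below
// transposes an NCHW input back to NHWC before addShape(), so the emitted 1-D Int32
// shape tensor is reported in the framework's NHWC order. For a static shape the same
// reordering is simply:
#include <cstdint>
#include <vector>

inline std::vector<int64_t> NchwShapeToNhwc(const std::vector<int64_t> &nchw) {
  return {nchw[0], nchw[2], nchw[3], nchw[1]};  // N, H, W, C
}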
+ */ + +#include "src/runtime/delegate/tensorrt/op/shape_tensorrt.h" + +namespace mindspore::lite { +int ShapeTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +} +int ShapeTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + nvinfer1::ITensor *shape_input = tensorrt_in_tensors_[0].trt_tensor_; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NCHW) { + // transpose: NCHW->NHWC + nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(ctx, *tensorrt_in_tensors_[0].trt_tensor_); + if (transpose_layer_in == nullptr) { + MS_LOG(ERROR) << "transpose: NCHW->NHWC failed for " << op_name_; + return RET_ERROR; + } + transpose_layer_in->setName((op_name_ + "_transpose2NHWC").c_str()); + shape_input = transpose_layer_in->getOutput(0); + this->transpose_layer_ = transpose_layer_in; + } + nvinfer1::IShapeLayer *shape_layer = ctx->network()->addShape(*shape_input); + + if (shape_layer == nullptr) { + MS_LOG(ERROR) << "add shape op failed for TensorRT."; + return RET_ERROR; + } + shape_layer->setName(op_name_.c_str()); + shape_layer->getOutput(0)->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{shape_layer->getOutput(0), Format::NHWC, true}); + this->layer_ = shape_layer; + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Shape, ShapeTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.h new file mode 100644 index 00000000000..f7cce06daa4 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/shape_tensorrt.h @@ -0,0 +1,38 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHAPE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHAPE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class ShapeTensorRT : public TensorRTOp { + public: + ShapeTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ShapeTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHAPE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.cc new file mode 100644 index 00000000000..53886a2d0cb --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.cc @@ -0,0 +1,437 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
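// Illustrative preface (not part of the patch): Squeeze, Unsqueeze, Reshape,
// Transpose, Flatten, ExpandDims and BroadcastTo are all lowered below onto a single
// IShuffleLayer. For Squeeze the target shape is the input shape with the listed
// size-1 axes erased; a sketch assuming the axes are stored in ascending order:
#include <cstdint>
#include <vector>

inline std::vector<int64_t> SqueezeShape(std::vector<int64_t> shape, const std::vector<int> &axes) {
  for (auto it = axes.rbegin(); it != axes.rend(); ++it) {
    shape.erase(shape.begin() + *it);  // erase from the back so earlier indices stay valid
  }
  return shape;
}
// SqueezeShape({1, 32, 1, 8}, {0, 2}) -> {32, 8}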
+ */ + +#include "src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h" +#include +#include +#include + +namespace mindspore::lite { +int ShuffleTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + switch (type_) { + case schema::PrimitiveType_Flatten: + case schema::PrimitiveType_Unsqueeze: { + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported in_tensors size " << in_tensors.size() << " of " + << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + break; + } + case schema::PrimitiveType_Squeeze: { + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported in_tensors size " << in_tensors.size() << " of " + << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + auto squeeze_op = this->op_primitive_->value_as_Squeeze(); + if (squeeze_op == nullptr) { + MS_LOG(ERROR) << "SqueezeOp convert failed"; + return RET_ERROR; + } + param_axis_ = squeeze_op->axis(); + if (param_axis_ == nullptr) { + MS_LOG(WARNING) << op_name_ << " is a full dim squeeze, don't support dynamic input shape."; + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + } + break; + } + case schema::PrimitiveType_Reshape: { + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "PrimitiveType_Transpose Unsupported in_tensors size: " << in_tensors.size(); + return RET_ERROR; + } + dynamic_shape_params_.support_hw_dynamic_ = false; + if (in_tensors[0].Shape()[0] != out_tensors[0].Shape()[0]) { + dynamic_shape_params_.support_dynamic_ = false; + } + break; + } + case schema::PrimitiveType_Transpose: + case schema::PrimitiveType_ExpandDims: + case schema::PrimitiveType_BroadcastTo: { + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "PrimitiveType_Transpose Unsupported in_tensors size: " << in_tensors.size(); + return RET_ERROR; + } + if (in_tensors[1].Data() == nullptr) { + MS_LOG(ERROR) << "Unsupported shape tensor of " << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + break; + } + default: { + MS_LOG(ERROR) << "Unsupported op type:" << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensort size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int ShuffleTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + ctx_ = ctx; + + int ret = InputTensorPreprocess(); + if (ret != RET_OK || shuffler_input_ == nullptr) { + MS_LOG(ERROR) << "InputTensorPreprocess failed for " << op_name_; + return RET_ERROR; + } + + nvinfer1::IShuffleLayer *shuffle_layer = ctx->network()->addShuffle(*shuffler_input_); + if (shuffle_layer == nullptr) { + MS_LOG(ERROR) << "add Shuffle op failed for TensorRT."; + return RET_ERROR; + } + shuffle_layer->setName(op_name_.c_str()); + this->layer_ = shuffle_layer; + + ret = RET_OK; + switch (type_) { + case schema::PrimitiveType_Unsqueeze: { + ret = AddUnsqueezeOp(shuffle_layer); + break; + } + case schema::PrimitiveType_Squeeze: { + ret = AddSqueezeOp(shuffle_layer); + break; + } + case schema::PrimitiveType_Transpose: { + ret = AddTransposeOp(shuffle_layer); + break; + } + case schema::PrimitiveType_Reshape: { + ret = AddReshapeOp(shuffle_layer); + break; + } + case 
schema::PrimitiveType_Flatten: { + ret = AddFlattenOp(shuffle_layer); + break; + } + case schema::PrimitiveType_ExpandDims: { + ret = AddExpandDimsOp(shuffle_layer); + break; + } + case schema::PrimitiveType_BroadcastTo: { + ret = AddBroadcastToOp(shuffle_layer); + break; + } + default: + MS_LOG(ERROR) << "Unsupported op type for " << op_name_; + return RET_ERROR; + } + if (ret != RET_OK) { + MS_LOG(ERROR) << "AddOp failed for " << op_name_; + return ret; + } + + if (shuffler_output_ == nullptr) { + MS_LOG(ERROR) << "output tensor create failed for " << op_name_; + return RET_ERROR; + } + shuffler_output_->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{shuffler_output_, out_format_, true}); + MS_LOG(DEBUG) << "output " << GetTensorFormat(tensorrt_out_tensors_[0]); + return RET_OK; +} + +int ShuffleTensorRT::InputTensorPreprocess() { + shuffler_input_ = tensorrt_in_tensors_[0].trt_tensor_; + MS_LOG(DEBUG) << "before transpose " << GetTensorFormat(tensorrt_in_tensors_[0]); + out_format_ = tensorrt_in_tensors_[0].format_; + if (shuffler_input_->getDimensions().nbDims == DIMENSION_4D && !tensorrt_in_tensors_[0].same_format_) { + // input tensor support NCHW format input + if (tensorrt_in_tensors_[0].format_ == Format::NCHW) { + // for transpose op, if tensor has same dim with ms tensor, keep origin dims + nvinfer1::IShuffleLayer *transpose_layer = NCHW2NHWC(ctx_, *shuffler_input_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + shuffler_input_ = transpose_layer->getOutput(0); + out_format_ = Format::NHWC; + } else if (tensorrt_in_tensors_[0].format_ == Format::NHWC) { + // infer format may error, correct here + nvinfer1::IShuffleLayer *transpose_layer = NHWC2NCHW(ctx_, *shuffler_input_); + if (transpose_layer == nullptr) { + MS_LOG(ERROR) << "create transpose layer failed for " << op_name_; + return RET_ERROR; + } + transpose_layer->setName((op_name_ + "_transpose_in").c_str()); + shuffler_input_ = transpose_layer->getOutput(0); + out_format_ = Format::NCHW; + } + } + MS_LOG(DEBUG) << "after transpose " << GetTensorFormat(shuffler_input_, out_format_, true); + return RET_OK; +} + +int ShuffleTensorRT::AddSqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer) { + // axis + auto squeeze_shape = shuffler_input_->getDimensions(); + std::vector new_shape(squeeze_shape.d, squeeze_shape.d + squeeze_shape.nbDims); + if (param_axis_ == nullptr) { + MS_LOG(WARNING) << op_name_ << " has null axis, output shape is totally depends on ms tensor."; + new_shape = out_tensors_[0].Shape(); + } else { + for (int i = param_axis_->size() - 1; i >= 0; i--) { + if (new_shape[param_axis_->Get(i)] != 1) { + MS_LOG(WARNING) << "squeeze_shape value at " << i << " is " << param_axis_->Get(i) << ", need check " + << op_name_; + } + new_shape.erase(new_shape.begin() + param_axis_->Get(i)); + } + } + + nvinfer1::Dims squeeze_dims = lite::ConvertCudaDims(new_shape); + if (squeeze_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + shuffle_layer->setReshapeDimensions(squeeze_dims); + shuffler_output_ = shuffle_layer->getOutput(0); + return shuffler_output_ == nullptr ? 
RET_ERROR : RET_OK; +} + +int ShuffleTensorRT::AddUnsqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer) { + // Unsqueeze + auto unsqueeze_op = this->op_primitive_->value_as_Unsqueeze(); + if (unsqueeze_op == nullptr) { + MS_LOG(ERROR) << "AddUnsqueezeOp convert failed"; + return RET_ERROR; + } + // axis + param_axis_ = unsqueeze_op->axis(); + if (param_axis_ == nullptr) { + MS_LOG(ERROR) << "axis is invalid for " << op_name_; + return RET_ERROR; + } + if (param_axis_->size() != 1) { + MS_LOG(WARNING) << op_name_ << " has unsqueeze axis size: " << param_axis_->size(); + } + nvinfer1::ITensor *expand_input = shuffler_input_; + for (size_t i = 0; i < param_axis_->size(); i++) { + expand_input = ExpandDim(shuffle_layer, expand_input, param_axis_->Get(i)); + } + shuffler_output_ = expand_input; + return shuffler_output_ == nullptr ? RET_ERROR : RET_OK; +} + +int ShuffleTensorRT::AddTransposeOp(nvinfer1::IShuffleLayer *shuffle_layer) { + if (in_tensors_[0].Shape().size() != in_tensors_[1].ElementNum()) { + MS_LOG(WARNING) << "transpose perm is invalid for input, ignore " << op_name_; + shuffler_output_ = shuffler_input_; + return RET_OK; + } + auto transpose_op = this->op_primitive_->value_as_Transpose(); + if (transpose_op == nullptr) { + MS_LOG(ERROR) << "AddTransposeOp convert failed"; + return RET_ERROR; + } + // perm + mindspore::MSTensor perm_ternsor = in_tensors_[1]; + if (perm_ternsor.Data() == nullptr) { + MS_LOG(ERROR) << "AddTransposeOp perm_ternsor data is invalid: " << op_name_; + return RET_ERROR; + } + int *perm_data = reinterpret_cast(perm_ternsor.MutableData()); + + nvinfer1::Permutation perm{}; + for (int i = 0; i < perm_ternsor.ElementNum(); i++) { + perm.order[i] = *perm_data; + perm_data++; + } + shuffle_layer->setFirstTranspose(perm); + if (perm_ternsor.ElementNum() == DIMENSION_4D) { + if (perm.order[kNCHW_C] == kNHWC_C && perm.order[kNCHW_H] == kNHWC_H && perm.order[kNCHW_W] == kNHWC_W) { + out_format_ = Format::NCHW; + } else if (perm.order[kNHWC_H] == kNCHW_H && perm.order[kNHWC_W] == kNCHW_W && perm.order[kNHWC_C] == kNCHW_C) { + out_format_ = Format::NHWC; + } else { + MS_LOG(INFO) << "input format and perm order is not NHWC or NCHW: " << op_name_; + } + } + shuffler_output_ = shuffle_layer->getOutput(0); + return RET_OK; +} + +int ShuffleTensorRT::AddReshapeOp(nvinfer1::IShuffleLayer *shuffle_layer) { + mindspore::MSTensor &shape_tensor = in_tensors_[1]; + if (shape_tensor.Data() != nullptr) { + // static shuffle layer + shuffle_layer->setReshapeDimensions( + InferReshapeDims(shuffler_input_->getDimensions(), in_tensors_[0].Shape(), out_tensors_[0].Shape())); + } else { + if (tensorrt_in_tensors_.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid shape tensor for reshape " << op_name_; + return RET_ERROR; + } + shuffle_layer->setInput(1, *tensorrt_in_tensors_[1].trt_tensor_); + } + shuffler_output_ = shuffle_layer->getOutput(0); + return RET_OK; +} + +int ShuffleTensorRT::AddFlattenOp(nvinfer1::IShuffleLayer *shuffle_layer) { + nvinfer1::Dims flatten_dims; + const std::vector &input_shape = in_tensors_[0].Shape(); + flatten_dims.nbDims = DIMENSION_2D; + flatten_dims.d[0] = tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0] == -1 + ? 
0 + : tensorrt_in_tensors_[0].trt_tensor_->getDimensions().d[0]; + flatten_dims.d[1] = std::accumulate(input_shape.begin() + 1, input_shape.end(), 1, std::multiplies()); + if (flatten_dims.d[1] <= 0) { + MS_LOG(ERROR) << op_name_ << "infer shape failed"; + } + shuffle_layer->setReshapeDimensions(flatten_dims); + shuffler_output_ = shuffle_layer->getOutput(0); + return RET_OK; +} + +int ShuffleTensorRT::AddExpandDimsOp(nvinfer1::IShuffleLayer *shuffle_layer) { + if (in_tensors_[1].DataType() != DataType::kNumberTypeInt32) { + MS_LOG(WARNING) << op_name_ << " axis tensor data type is " << static_cast(in_tensors_[1].DataType()); + } + auto axis_data = static_cast(in_tensors_[1].Data().get()); + int axis = axis_data[0]; + shuffler_output_ = ExpandDim(shuffle_layer, shuffler_input_, axis); + return shuffler_output_ == nullptr ? RET_ERROR : RET_OK; +} + +int ShuffleTensorRT::AddBroadcastToOp(nvinfer1::IShuffleLayer *shuffle_layer) { + if (out_tensors_[0].ElementNum() != in_tensors_[0].ElementNum() && + out_tensors_[0].Shape().size() == in_tensors_[0].Shape().size()) { + MS_LOG(WARNING) << "broadcast element cnt changes, ignore broadcast for " << op_name_; + shuffle_layer->setReshapeDimensions(shuffler_input_->getDimensions()); + } else if (out_tensors_[0].ElementNum() == in_tensors_[0].ElementNum()) { + nvinfer1::Dims new_dims = ConvertCudaDims(out_tensors_[0].Shape()); + if (new_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + new_dims.d[0] = shuffler_input_->getDimensions().d[0]; + shuffle_layer->setReshapeDimensions(new_dims); + } else { + MS_LOG(ERROR) << "broadcast needs check for " << op_name_; + } + shuffler_output_ = shuffle_layer->getOutput(0); + return shuffler_output_ == nullptr ? RET_ERROR : RET_OK; +} + +nvinfer1::ITensor *ShuffleTensorRT::ExpandDim(nvinfer1::IShuffleLayer *shuffle_layer, nvinfer1::ITensor *input_tensor, + int axis) { + auto input_dims = input_tensor->getDimensions(); + // if expand dim not at last dim and shape is dynamic, change to expanddim at last dim and transpose + bool special_expand = false; + for (int i = 0; i < input_dims.nbDims; i++) { + special_expand = special_expand || input_dims.d[i] == -1; + } + special_expand = special_expand && (axis != -1 && axis != input_dims.nbDims - 1); + + if (special_expand) { + std::vector new_shape; + for (int i = 0; i < input_dims.nbDims; i++) { + new_shape.push_back(input_dims.d[i] == -1 ? 0 : input_dims.d[i]); + } + new_shape.push_back(1); + nvinfer1::Dims new_dims = ConvertCudaDims(new_shape); + if (new_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return nullptr; + } + shuffle_layer->setReshapeDimensions(new_dims); + // transpose + nvinfer1::Permutation perm{}; + for (int i = 0; i < new_dims.nbDims; i++) { + if (i < axis) { + perm.order[i] = i; + } else if (i == axis) { + perm.order[i] = new_dims.nbDims - 1; + } else { + perm.order[i] = i - 1; + } + } + nvinfer1::IShuffleLayer *trans_layer = ctx_->network()->addShuffle(*shuffle_layer->getOutput(0)); + if (trans_layer == nullptr) { + MS_LOG(ERROR) << "add transpose layer failed for special expand dims op " << op_name_; + return nullptr; + } + trans_layer->setFirstTranspose(perm); + return trans_layer->getOutput(0); + } else { + std::vector new_shape; + for (int i = 0; i < input_dims.nbDims; i++) { + if (axis == i) { + new_shape.push_back(1); + } + new_shape.push_back(input_dims.d[i] == -1 ? 
0 : input_dims.d[i]); + } + if (axis == -1 || axis == input_dims.nbDims) { + new_shape.push_back(1); + } + nvinfer1::Dims new_dims = ConvertCudaDims(new_shape); + if (new_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return nullptr; + } + shuffle_layer->setReshapeDimensions(new_dims); + return shuffle_layer->getOutput(0); + } +} + +nvinfer1::Dims ShuffleTensorRT::InferReshapeDims(const nvinfer1::Dims &input_dims, + const std::vector &ms_input_shape, + const std::vector &ms_output_shape) { + // tensorrt support infer shape of 0 and -1 + nvinfer1::Dims reshape_dims = ConvertCudaDims(ms_output_shape); + if (reshape_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return reshape_dims; + } + for (int i = 0; i < reshape_dims.nbDims; i++) { + if (input_dims.d[i] == -1) { + if (ms_input_shape[i] == ms_output_shape[i]) { + reshape_dims.d[i] = 0; + } else { + reshape_dims.d[i] = -1; + } + } + MS_LOG(DEBUG) << "reshape infer_index " << i << " value: " << reshape_dims.d[i]; + } + return reshape_dims; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Unsqueeze, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Squeeze, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Reshape, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Transpose, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Flatten, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ExpandDims, ShuffleTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_BroadcastTo, ShuffleTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h new file mode 100644 index 00000000000..d326c37588a --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/shuffle_tensorrt.h @@ -0,0 +1,58 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHUFFLE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHUFFLE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +class ShuffleTensorRT : public TensorRTOp { + public: + ShuffleTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~ShuffleTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int InputTensorPreprocess(); + int AddSqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddUnsqueezeOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddTransposeOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddReshapeOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddFlattenOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddExpandDimsOp(nvinfer1::IShuffleLayer *shuffle_layer); + int AddBroadcastToOp(nvinfer1::IShuffleLayer *shuffle_layer); + nvinfer1::ITensor *ExpandDim(nvinfer1::IShuffleLayer *shuffle_layer, nvinfer1::ITensor *input_tensor, int axis); + nvinfer1::Dims InferReshapeDims(const nvinfer1::Dims &input_dims, const std::vector &ms_input_shape, + const std::vector &ms_output_shape); + + Format out_format_ = Format::NHWC; + nvinfer1::ITensor *shuffler_input_{nullptr}; + nvinfer1::ITensor *shuffler_output_{nullptr}; + TensorRTContext *ctx_{nullptr}; + const flatbuffers::Vector *param_axis_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SHUFFLE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.cc new file mode 100644 index 00000000000..1908acceea3 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.cc @@ -0,0 +1,281 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/slice_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +namespace { +class StrideSliceTensorRTUtil final : public SliceTensorRTUtil { + public: + StrideSliceTensorRTUtil() = default; + ~StrideSliceTensorRTUtil() = default; + bool IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + if (in_tensors.size() < HAS_AXIS - 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return false; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return false; + } + if (in_tensors.at(BEGINS_INDEX).Data() == nullptr || in_tensors.at(ENDS_INDEX).Data() == nullptr) { + MS_LOG(ERROR) << "invalid input tensor for: " << op_name_; + return false; + } + return true; + } + std::tuple GetSliceParams( + const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + const mindspore::MSTensor &begin = in_tensors.at(BEGINS_INDEX); + const mindspore::MSTensor &stride = in_tensors.back(); + const mindspore::MSTensor &end = in_tensors.at(ENDS_INDEX); + + nvinfer1::Dims start_dims; + nvinfer1::Dims size_dims; + nvinfer1::Dims stride_dims; + + size_t axis_index = in_tensors.size() == HAS_AXIS ? AXIS_INDEX : -1; + auto out_shape = out_tensors.front().Shape(); + if (static_cast(begin.ElementNum()) == in_tensors.at(0).Shape().size()) { + start_dims = lite::ConvertCudaDims(begin.Data().get(), begin.ElementNum()); + if (shrink_axis_ == 0) { + size_dims = lite::ConvertCudaDims(out_shape); + } else { + size_dims.nbDims = start_dims.nbDims; + auto end_dims = lite::ConvertCudaDims(end.Data().get(), end.ElementNum()); + for (int i = 0; i < size_dims.nbDims; i++) { + size_dims.d[i] = end_dims.d[i] - start_dims.d[i]; + } + } + stride_dims = lite::ConvertCudaDims(stride.Data().get(), stride.ElementNum()); + } else { + if (axis_index == -1 || in_tensors.at(axis_index).ElementNum() != 1) { + MS_LOG(ERROR) << "invalid input params for " << op_name_; + return {}; + } + int axis_value = *(static_cast(in_tensors.at(axis_index).Data().get())); + int start_value = *(static_cast(begin.Data().get())); + start_dims.nbDims = in_tensors.at(0).Shape().size(); + for (int i = 0; i < start_dims.nbDims; i++) { + start_dims.d[i] = (i == axis_value) ? 
start_value : 0; + } + + size_dims = lite::ConvertCudaDims(out_shape); + int stride_value = *(static_cast(stride.Data().get())); + stride_dims = nvinfer1::Dims{size_dims.nbDims, {}}; + std::fill(stride_dims.d, stride_dims.d + stride_dims.nbDims, stride_value); + } + return std::make_tuple(start_dims, size_dims, stride_dims); + } + nvinfer1::ITensor *PostProcess(TensorRTContext *ctx, nvinfer1::ITensor *input, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (shrink_axis_ != 0) { + return Reshape(ctx, input, out_tensors.at(0).Shape()); + } + return input; + } + void SetShrinkAxis(int shrink_axis) { shrink_axis_ = shrink_axis; } + + private: + int shrink_axis_; +}; + +class SliceFusionTensorRTUtil final : public SliceTensorRTUtil { + public: + SliceFusionTensorRTUtil() = default; + ~SliceFusionTensorRTUtil() = default; + bool IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + if (in_tensors.size() != SLICE_INPUT_SIZE) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return false; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return false; + } + return true; + } + std::tuple GetSliceParams( + const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + const auto &input = in_tensors.at(0); + const auto &begin = in_tensors.at(1); + const auto &size = in_tensors.at(SIZE_INDEX); + + auto start_dims = lite::ConvertCudaDims(begin.Data().get(), begin.ElementNum()); + auto size_dims = lite::ConvertCudaDims(size.Data().get(), size.ElementNum()); + auto stride_dims = lite::ConvertCudaDims(1, begin.ElementNum()); + + return std::make_tuple(start_dims, size_dims, stride_dims); + } +}; + +class CropTensorRTUtil final : public SliceTensorRTUtil { + public: + CropTensorRTUtil() = default; + ~CropTensorRTUtil() = default; + bool IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override { + if (in_tensors.size() != CROP_INPUT_SIZE) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return false; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return false; + } + auto crop_primitive = primitive->value_as_Crop(); + if (crop_primitive == nullptr) { + MS_LOG(ERROR) << "Cast primitive to crop fail"; + return false; + } + axis_ = static_cast(crop_primitive->axis()); + auto offsets_ptr = crop_primitive->offsets(); + if (offsets_ptr == nullptr) { + MS_LOG(ERROR) << "Crop Op do not have offset attr"; + return false; + } + if (axis_ < 0) { + axis_ += in_tensors.at(0).Shape().size(); + } + if (axis_ < 0 || axis_ + offsets_ptr->size() != in_tensors.at(0).Shape().size()) { + MS_LOG(ERROR) << "axis and offsets not match input tensor shape, axis is " << crop_primitive->axis() + << " , offsets size is " << offsets_ptr->size() << " , input size is " + << in_tensors.at(0).Shape().size(); + return false; + } + if (in_tensors.at(0).Shape().size() != in_tensors.at(1).Shape().size()) { + MS_LOG(ERROR) << "input tensor 0 and 1 size not equal," + << " input 0 size is " << in_tensors.at(0).Shape().size() << " , input tensor 1 size is " + << in_tensors.at(1).Shape().size(); + return false; + } + return true; + } + std::tuple GetSliceParams( + const schema::Primitive *primitive, const std::vector 
&in_tensors, + const std::vector &out_tensors) override { + auto crop_primitive = primitive->value_as_Crop(); + auto offsets_ptr = crop_primitive->offsets(); + + std::vector begin(in_tensors.at(0).Shape().size(), 0); + for (size_t i = 0; i != offsets_ptr->size(); ++i) { + begin[axis_ + i] = offsets_ptr->Get(i); + } + + std::vector size(in_tensors.at(0).Shape().size()); + for (size_t i = 0; i != size.size(); ++i) { + size[i] = in_tensors.at(1).Shape().at(i); + } + + auto start_dims = lite::ConvertCudaDims(&begin[0], begin.size()); + auto size_dims = lite::ConvertCudaDims(&size[0], size.size()); + auto stride_dims = lite::ConvertCudaDims(1, begin.size()); + + return std::make_tuple(start_dims, size_dims, stride_dims); + } + + private: + int axis_; +}; +} // namespace + +SliceTensorRT::SliceTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) { + if (primitive->value_type() == schema::PrimitiveType_StridedSlice) { + auto slice_fusion_util = std::make_unique(); + slice_fusion_util->SetShrinkAxis(primitive->value_as_StridedSlice()->shrink_axis_mask()); + util_ = std::move(slice_fusion_util); + } else if (primitive->value_type() == schema::PrimitiveType_SliceFusion) { + util_ = std::make_unique(); + } else if (primitive->value_type() == schema::PrimitiveType_Crop) { + util_ = std::make_unique(); + } else { + util_ = nullptr; + } + if (util_ != nullptr) { + util_->op_name_ = op_name_; + } +} + +int SliceTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (util_ == nullptr) { + MS_LOG(ERROR) << "Unsupported op_type: " << op_name_; + return RET_ERROR; + } + if (!util_->IsSupport(primitive, in_tensors, out_tensors)) { + return RET_ERROR; + } + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +} + +int SliceTensorRT::AddInnerOp(TensorRTContext *ctx) { + ITensorHelper slice_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &slice_input); + if (ret != RET_OK || slice_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_; + return RET_ERROR; + } + + nvinfer1::Dims start_dims; + nvinfer1::Dims size_dims; + nvinfer1::Dims stride_dims; + std::tie(start_dims, size_dims, stride_dims) = util_->GetSliceParams(op_primitive_, in_tensors_, out_tensors_); + if (start_dims.nbDims == -1 || size_dims.nbDims == -1 || stride_dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name_; + return RET_ERROR; + } + + nvinfer1::ISliceLayer *slice_layer = + ctx->network()->addSlice(*slice_input.trt_tensor_, start_dims, size_dims, stride_dims); + if (slice_layer == nullptr) { + MS_LOG(ERROR) << "add Slice op failed for TensorRT: " << op_name_; + return RET_ERROR; + } + this->layer_ = slice_layer; + slice_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *out_tensor = slice_layer->getOutput(0); + out_tensor = util_->PostProcess(ctx, out_tensor, in_tensors_, out_tensors_); + if (out_tensor == nullptr) { + MS_LOG(ERROR) << "output tensor create failed"; + return RET_ERROR; + } + out_tensor->setName((op_name_ + "_output").c_str()); + + 
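+ // Note: util_->PostProcess is a pass-through for SliceFusion and Crop; only the StridedSlice util
+ // overrides it, reshaping the sliced tensor to the ms output shape when shrink_axis_mask is set so
+ // that the shrunk axes are dropped. The helper registered below keeps the input's format flags, so
+ // downstream ops can tell whether an NHWC/NCHW transpose is still needed.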
this->AddInnerOutTensors(ITensorHelper{out_tensor, slice_input.format_, slice_input.same_format_}); + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_StridedSlice, SliceTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_SliceFusion, SliceTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Crop, SliceTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.h new file mode 100644 index 00000000000..e1f82cbe183 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/slice_tensorrt.h @@ -0,0 +1,66 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SLICE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SLICE_TENSORRT_H_ +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class SliceTensorRTUtil { + public: + SliceTensorRTUtil() = default; + virtual ~SliceTensorRTUtil() = default; + virtual bool IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) = 0; + virtual std::tuple GetSliceParams( + const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) = 0; + virtual nvinfer1::ITensor *PostProcess(TensorRTContext *ctx, nvinfer1::ITensor *input, + const std::vector &in_tensors, + const std::vector &out_tensors) { + return input; + } + std::string op_name_; +}; + +constexpr int BEGINS_INDEX = 1; +constexpr int ENDS_INDEX = 2; +constexpr int SIZE_INDEX = 2; +constexpr int HAS_AXIS = 5; +constexpr int AXIS_INDEX = 3; +constexpr int CROP_INPUT_SIZE = 2; +constexpr int SLICE_INPUT_SIZE = 3; +class SliceTensorRT : public TensorRTOp { + public: + SliceTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type); + + ~SliceTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + std::unique_ptr util_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SLICE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.cc new file mode 100644 index 00000000000..29c68ddd97a --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.cc @@ -0,0 +1,95 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/softmax_tensorrt.h" + +namespace mindspore::lite { +int SoftMaxTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + softmax_op_ = primitive->value_as_Softmax(); + if (softmax_op_ == nullptr) { + MS_LOG(ERROR) << "convert failed"; + return RET_ERROR; + } + + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} +int SoftMaxTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "network is invalid"; + return RET_ERROR; + } + nvinfer1::ISoftMaxLayer *softmax_layer_ = AddSoftMaxOp(ctx); + if (softmax_layer_ == nullptr) { + MS_LOG(ERROR) << "add softmax op failed for TensorRT."; + return RET_ERROR; + } + softmax_layer_->setName((op_name_ + "_softmax").c_str()); + this->layer_ = softmax_layer_; + + nvinfer1::ITensor *out_tensor = softmax_layer_->getOutput(0); + if (out_tensor == nullptr) { + MS_LOG(ERROR) << "softmax output tensor create failed for TensorRT."; + return RET_ERROR; + } + out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} + +nvinfer1::ISoftMaxLayer *SoftMaxTensorRT::AddSoftMaxOp(TensorRTContext *ctx) { + nvinfer1::ISoftMaxLayer *current_layer_ = ctx->network()->addSoftMax(*tensorrt_in_tensors_[0].trt_tensor_); + if (current_layer_ == nullptr) { + MS_LOG(ERROR) << "add softmax op failed for TensorRT."; + return nullptr; + } + auto axis = softmax_op_->axis(); + if (axis == nullptr || axis->size() != 1) { + MS_LOG(ERROR) << "axis needs check"; + return nullptr; + } + auto axis_val = std::vector(axis->begin(), axis->end()); + if (axis_val[0] >= tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims) { + MS_LOG(ERROR) << "axis is larger than input tensor dims."; + return nullptr; + } + int64_t axis_format_value = + (axis_val[0] == -1) ? 
tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims - 1 : axis_val[0]; + if (tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + tensorrt_in_tensors_[0].format_ == Format::NCHW) { + // transpose axis to NCHW + axis_format_value = ConvertAxisFromNHWC2NCHW(axis_format_value); + } + uint32_t axis_bit = 1 << axis_format_value; + MS_LOG(DEBUG) << op_name_ << " axis_value is " << axis_format_value << ", set axis to " << axis_bit; + current_layer_->setAxes(axis_bit); + return current_layer_; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Softmax, SoftMaxTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.h new file mode 100644 index 00000000000..a31d0f8b5a6 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/softmax_tensorrt.h @@ -0,0 +1,43 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SOFTMAX_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SOFTMAX_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class SoftMaxTensorRT : public TensorRTOp { + public: + SoftMaxTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~SoftMaxTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ISoftMaxLayer *AddSoftMaxOp(TensorRTContext *ctx); + + const schema::Softmax *softmax_op_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SOFTMAX_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.cc new file mode 100644 index 00000000000..c4638bb2e49 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.cc @@ -0,0 +1,160 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include "src/runtime/delegate/tensorrt/op/split_tensorrt.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +int SplitTensorRT::IsSupport(const mindspore::schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + int ret = ParseParams(); + if (ret != RET_OK) { + MS_LOG(ERROR) << op_name_ << " parse params failed."; + return ret; + } + + axis_ = axis_ < 0 ? axis_ + in_tensors_[0].Shape().size() : axis_; + + if (out_tensors.size() < 1 || out_tensors.size() != output_num_) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + if (axis_ < 0 || axis_ >= in_tensors_[0].Shape().size()) { + MS_LOG(ERROR) << "invalid axis : " << axis_; + return RET_ERROR; + } + int split_sum = std::accumulate(size_splits_.begin(), size_splits_.end(), 0); + int split_sum_expect = in_tensors_[0].Shape()[axis_]; + + if (size_splits_[size_splits_.size() - 1] == -1) { + size_splits_[size_splits_.size() - 1] = split_sum_expect - split_sum - 1; + split_sum = split_sum_expect; + } + + if (split_sum != split_sum_expect) { + MS_LOG(ERROR) << "Sum of size splits not equal input tensor dim. "; + return RET_ERROR; + } + + dynamic_shape_params_.support_dynamic_ = false; + dynamic_shape_params_.support_hw_dynamic_ = false; + return RET_OK; +} + +int SplitTensorRT::AddInnerOp(TensorRTContext *ctx) { + ITensorHelper split_input; + int ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &split_input); + if (ret != RET_OK || split_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "PreprocessInputs2SameDim input tensor failed for " << op_name_; + return ret; + } + + int axis_dim_index = 0; + nvinfer1::Dims one_dims = lite::ConvertCudaDims(1, in_tensors_[0].Shape().size()); + nvinfer1::ISliceLayer *slice_layer = nullptr; + + for (int i = 0; i != output_num_; ++i) { + nvinfer1::Dims start_dims = lite::ConvertCudaDims(0, in_tensors_[0].Shape().size()); + start_dims.d[axis_] = axis_dim_index; + axis_dim_index += size_splits_[i]; + + nvinfer1::Dims size_dims = lite::ConvertCudaDims(in_tensors_[0].Shape()); + size_dims.d[axis_] = size_splits_[i]; + + slice_layer = ctx->network()->addSlice(*split_input.trt_tensor_, start_dims, size_dims, one_dims); + if (slice_layer == nullptr) { + MS_LOG(ERROR) << "add Slice op failed for TensorRT: " << op_name_; + return RET_ERROR; + } + + nvinfer1::ITensor *out_tensor = slice_layer->getOutput(0); + if (type_ == schema::PrimitiveType_Unstack) { + auto shuffer_layer = ctx->network()->addShuffle(*out_tensor); + auto shuffer_dims_opt = SqueezeDims(out_tensor->getDimensions(), axis_); + if (!shuffer_dims_opt) { + MS_LOG(ERROR) << "SqueezeDims failed."; + return RET_ERROR; + } + shuffer_layer->setReshapeDimensions(shuffer_dims_opt.value()); + out_tensor = shuffer_layer->getOutput(0); + } + out_tensor->setName((op_name_ + "_" + std::to_string(i)).c_str()); + this->AddInnerOutTensors(ITensorHelper{out_tensor, split_input.format_, split_input.same_format_}); + } + this->layer_ = slice_layer; + return RET_OK; +} +int SplitTensorRT::ParseParams() { + switch (type_) { + case schema::PrimitiveType_Split: { + auto split_op = 
op_primitive_->value_as_Split(); + CHECK_NULL_RETURN(split_op); + axis_ = split_op->axis(); + output_num_ = split_op->output_num(); + auto size_splits_ptr = split_op->size_splits(); + if (size_splits_ptr != nullptr) { + size_splits_.resize(size_splits_ptr->size()); + std::copy(size_splits_ptr->begin(), size_splits_ptr->end(), size_splits_.begin()); + } else if (in_tensors_.size() == INPUT_SIZE2 && in_tensors_[1].Data() != nullptr && + in_tensors_[1].DataType() == DataType::kNumberTypeInt32) { + size_splits_.resize(in_tensors_[1].ElementNum()); + auto split_out_ptr = static_cast(in_tensors_[1].Data().get()); + for (int i = 0; i < in_tensors_[1].ElementNum(); i++) { + size_splits_[i] = split_out_ptr[i]; + } + } else { + MS_LOG(ERROR) << op_name_ << " has invalid input size and size_splits: " << in_tensors_.size(); + return RET_ERROR; + } + break; + } + case schema::PrimitiveType_Unstack: { + auto unstack_op = op_primitive_->value_as_Unstack(); + CHECK_NULL_RETURN(unstack_op); + axis_ = unstack_op->axis(); + output_num_ = out_tensors_.size(); + break; + } + default: { + MS_LOG(ERROR) << op_name_ << " has invalid type for split"; + return RET_ERROR; + } + } + if (size_splits_.empty()) { + if (output_num_ == 0 || in_tensors_[0].Shape().at(axis_) % output_num_ != 0) { + MS_LOG(ERROR) << "axis dim can not be split into same subdim"; + return RET_ERROR; + } + int split_width = in_tensors_[0].Shape().at(axis_) / output_num_; + size_splits_.resize(output_num_); + std::fill(size_splits_.begin(), size_splits_.end(), split_width); + } + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Split, SplitTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Unstack, SplitTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.h new file mode 100644 index 00000000000..df5b1c21533 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/split_tensorrt.h @@ -0,0 +1,45 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SPLIT_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SPLIT_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class SplitTensorRT : public TensorRTOp { + public: + SplitTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~SplitTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int ParseParams(); + int64_t axis_; + int64_t output_num_; + std::vector size_splits_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_SPLIT_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.cc new file mode 100644 index 00000000000..d35712924ea --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.cc @@ -0,0 +1,132 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" +#include + +namespace mindspore::lite { +const schema::Primitive *TensorRTOp::GetPrimitive() { return this->op_primitive_; } + +void TensorRTOp::AddInnerInTensors(ITensorHelper tensor) { this->tensorrt_in_tensors_.push_back(tensor); } + +void TensorRTOp::AddInnerOutTensors(ITensorHelper tensor) { this->tensorrt_out_tensors_.push_back(tensor); } + +std::vector &TensorRTOp::GetInnerOutTensor() { return this->tensorrt_out_tensors_; } + +std::vector &TensorRTOp::GetInnerInTensors() { return this->tensorrt_in_tensors_; } + +std::string TensorRTOp::GetOpName() { return this->op_name_; } + +std::vector &TensorRTOp::inputs() { return this->in_tensors_; } + +std::vector &TensorRTOp::outputs() { return this->out_tensors_; } + +schema::PrimitiveType TensorRTOp::type() const { return this->type_; } + +schema::QuantType TensorRTOp::GetQuantType() const { return this->quant_type_; } + +void TensorRTOp::set_in_ops(const std::vector &in_ops) { this->in_ops_ = in_ops; } + +void TensorRTOp::set_out_ops(const std::vector &out_ops) { this->out_ops_ = out_ops; } + +const std::vector &TensorRTOp::in_ops() const { return this->in_ops_; } + +const std::vector &TensorRTOp::out_ops() const { return this->out_ops_; } + +void TensorRTOp::SetRuntime(TensorRTRuntime *runtime) { + this->runtime_ = runtime; + device_id_ = runtime_->GetDeviceID(); +} + +bool TensorRTOp::IsShapeKnown() { + if (this->in_tensors_.size() == 1 && this->in_tensors_[0].Shape().size() == 0) { + return false; + } + return true; +} + +int TensorRTOp::Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine) { + if (op_binding_tensor_.size() != 0) { + MS_LOG(ERROR) << "need special op Prepare for " << op_name_; + return RET_ERROR; + } + return RET_OK; +} + +DynamicShapeParams TensorRTOp::GetDynamicShapeParams() const { return this->dynamic_shape_params_; } + +int TensorRTOp::SetInt8DynamicRange() { + // setting param layer_ forcely + if (this->layer_ == nullptr) { + MS_LOG(ERROR) << op_name_ << " layer is nullptr."; + return RET_ERROR; + } + if (in_tensors_.empty() || out_tensors_.empty()) { + MS_LOG(ERROR) << "input or output tensor empty."; + return RET_ERROR; + } + if (quant_type_ != schema::QuantType_QUANT_ALL) { + MS_LOG(DEBUG) << "op " << op_name_ << " not quantized."; + return RET_OK; + } + + if (in_tensors_[0].QuantParams().empty() || out_tensors_[0].QuantParams().empty()) { + MS_LOG(WARNING) << op_name_ << " quant param is empty."; + MS_LOG(WARNING) << "in_tensor quant param size: " << in_tensors_[0].QuantParams().size() + << " ,out_tensor quant param size: " << out_tensors_[0].QuantParams().size(); + } + for (size_t i = 0; i < in_tensors_.size(); i++) { + auto tensor = in_tensors_.at(i); + if (!tensor.IsConst()) { + tensorrt_in_tensors_.at(i).trt_tensor_->setDynamicRange(tensor.QuantParams().at(0).min, + tensor.QuantParams().at(0).max); + // Don't set the presion on non-computation layers as they don't support int8. 
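+ // Since no calibration cache is used under QuantType_QUANT_ALL, TensorRT derives the int8 scales
+ // from the min/max ranges handed to setDynamicRange() above. Layers of type Constant, Concatenation
+ // and Shape are skipped below because forcing kINT8 precision on them is not supported.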
+ if (this->layer_->getType() != nvinfer1::LayerType::kCONSTANT && + this->layer_->getType() != nvinfer1::LayerType::kCONCATENATION && + this->layer_->getType() != nvinfer1::LayerType::kSHAPE) { + this->layer_->setPrecision(nvinfer1::DataType::kINT8); + } + } + } + for (size_t i = 0; i < out_tensors_.size(); i++) { + auto tensor = out_tensors_.at(0); + tensorrt_out_tensors_.at(i).trt_tensor_->setDynamicRange(tensor.QuantParams().at(0).min, + tensor.QuantParams().at(0).max); + // set output type of execution tensors. + if (this->layer_->getOutput(i)->isExecutionTensor()) { + this->layer_->setOutputType(i, nvinfer1::DataType::kINT8); + } + } + return SetTransposeDynamicRange(); +} + +int TensorRTOp::SetTransposeDynamicRange() { + if (this->transpose_layer_ == nullptr) { + MS_LOG(INFO) << op_name_ << " transpose_layer is nullptr."; + return RET_OK; + } + if (!in_tensors_[0].QuantParams().empty() && !out_tensors_[0].QuantParams().empty()) { + this->transpose_layer_->getInput(0)->setDynamicRange(in_tensors_.front().QuantParams().at(0).min, + in_tensors_.front().QuantParams().at(0).max); + this->transpose_layer_->getOutput(0)->setDynamicRange(in_tensors_.front().QuantParams().at(0).min, + in_tensors_.front().QuantParams().at(0).max); + this->transpose_layer_->setOutputType(0, nvinfer1::DataType::kINT8); + this->transpose_layer_->setPrecision(nvinfer1::DataType::kINT8); + } + return RET_OK; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.h new file mode 100644 index 00000000000..e37b77a051e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_op.h @@ -0,0 +1,175 @@ +/** + * Copyright 2020-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_OP_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_OP_H_ + +#include +#include +#include +#include +#include "include/api/kernel.h" +#include "src/common/log_adapter.h" +#include "include/errorcode.h" +#include "src/runtime/delegate/tensorrt/tensorrt_context.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/auto_registration_factory.h" +#include "src/common/log_util.h" + +namespace mindspore::lite { +constexpr int INPUT_SIZE2 = 2; +constexpr int INPUT_SIZE3 = 3; +constexpr int INPUT_SIZE4 = 4; + +struct BindingHelper { + std::string name_; + void *data_{nullptr}; + nvinfer1::DataType data_type_; + size_t size_; + bool is_input_binding_{false}; +}; + +struct DynamicShapeParams { + bool support_dynamic_{true}; + bool support_hw_dynamic_{true}; +}; + +class TensorRTRuntime; + +class TensorRTOp { + public: + explicit TensorRTOp(const schema::Primitive *primitive, std::vector in_tensors, + std::vector out_tensors, std::string name, schema::QuantType quant_type) + : op_primitive_(primitive), + in_tensors_(std::move(in_tensors)), + out_tensors_(std::move(out_tensors)), + op_name_(std::move(name)), + quant_type_(quant_type) { + if (primitive != nullptr) { + this->type_ = primitive->value_type(); + } + } + + virtual ~TensorRTOp() = default; + + virtual int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) = 0; + + virtual int AddInnerOp(TensorRTContext *ctx) = 0; + + virtual int SetInt8DynamicRange(); + + virtual int Prepare(void **network_tensor_bindings, nvinfer1::ICudaEngine *engine); + + const schema::Primitive *GetPrimitive(); + + void AddInnerInTensors(ITensorHelper tensor); + + void AddInnerOutTensors(ITensorHelper tensor); + + std::vector &GetInnerOutTensor(); + + std::vector &GetInnerInTensors(); + + std::string GetOpName(); + + std::vector &inputs(); + + std::vector &outputs(); + + schema::PrimitiveType type() const; + + schema::QuantType GetQuantType() const; + + void set_in_ops(const std::vector &in_ops); + + void set_out_ops(const std::vector &out_ops); + + const std::vector &in_ops() const; + + const std::vector &out_ops() const; + + void SetRuntime(TensorRTRuntime *runtime); + + DynamicShapeParams GetDynamicShapeParams() const; + + nvinfer1::ILayer *layer() { return layer_; } + + private: + int SetTransposeDynamicRange(); + + protected: + bool IsShapeKnown(); + + nvinfer1::ILayer *layer_ = nullptr; + + nvinfer1::IShuffleLayer *transpose_layer_ = nullptr; + + const schema::Primitive *op_primitive_{nullptr}; + + std::vector in_tensors_; + + std::vector out_tensors_; + + std::vector tensorrt_in_tensors_; + + std::vector tensorrt_out_tensors_; + + std::vector in_ops_; + + std::vector out_ops_; + + std::string op_name_; + + schema::PrimitiveType type_ = schema::PrimitiveType_NONE; + + schema::QuantType quant_type_ = schema::QuantType_QUANT_NONE; + + std::vector op_binding_tensor_; + + TensorRTRuntime *runtime_{nullptr}; + + DynamicShapeParams dynamic_shape_params_; + + uint32_t device_id_{0}; +}; + +template +TensorRTOp *GetTensorRTOp(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) { + auto *op = new (std::nothrow) T(primitive, in_tensors, out_tensors, name, quant_type); + if (op == nullptr) { + MS_LOG(WARNING) << "TensorRT is nullptr."; + return nullptr; + } + + auto ret = 
op->IsSupport(primitive, in_tensors, out_tensors); + if (ret != RET_OK) { + MS_LOG(WARNING) << "TensorRT op is not supported: " << name; + delete op; + return nullptr; + } + return op; +} +typedef TensorRTOp *(*TensorRTGetOp)(const schema::Primitive *primitive, + const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type); + +#define REGISTER_TENSORRT_CREATOR(KEY, TENSORRT_OP) \ + REGISTER_CLASS_CREATOR(schema::PrimitiveType, KEY, TensorRTGetOp, GetTensorRTOp); +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_OP_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.cc new file mode 100644 index 00000000000..1ecaa90167e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.cc @@ -0,0 +1,81 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" + +namespace mindspore::lite { +void SerializeValue(void **buffer, const void *value, size_t cpy_size) { + memcpy(*buffer, value, cpy_size); + *buffer = static_cast(*buffer) + cpy_size; +} + +void DeserializeValue(void const **buffer, size_t *buffer_size, void *value, size_t cpy_size) { + if (cpy_size > *buffer_size) { + MS_LOG(ERROR) << "invalid desirialize size, buffer size: " << *buffer_size << ", value size: " << cpy_size; + return; + } + memcpy(value, *buffer, cpy_size); + *buffer = static_cast(*buffer) + cpy_size; + *buffer_size -= cpy_size; +} + +nvinfer1::DimsExprs TensorRTPlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, + int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept { + return inputs[0]; +} + +bool TensorRTPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept { + return true; +} + +void TensorRTPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept {} + +size_t TensorRTPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { + return 0; +} + +nvinfer1::DataType TensorRTPlugin::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, + int nbInputs) const noexcept { + return inputTypes[0]; +} + +const char *TensorRTPlugin::getPluginType() const noexcept { return plugin_name_.c_str(); } + +const char *TensorRTPlugin::getPluginVersion() const noexcept { return plugin_version_.c_str(); } + +int TensorRTPlugin::getNbOutputs() const noexcept { return 1; } + +int TensorRTPlugin::initialize() noexcept { return 0; } + +void TensorRTPlugin::terminate() noexcept {} + +size_t TensorRTPlugin::getSerializationSize() const noexcept { return 0; } + +void 
TensorRTPlugin::serialize(void *buffer) const noexcept {} + +void TensorRTPlugin::destroy() noexcept { + // This gets called when the network containing plugin is destroyed + delete this; +} + +void TensorRTPlugin::setPluginNamespace(const char *libNamespace) noexcept { name_space_ = libNamespace; } + +const char *TensorRTPlugin::getPluginNamespace() const noexcept { return name_space_.c_str(); } +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.h new file mode 100644 index 00000000000..d2fadb85828 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tensorrt_plugin.h @@ -0,0 +1,106 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_PLUGIN_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_PLUGIN_H_ +#include +#include +#include "src/common/log_adapter.h" +#include "include/errorcode.h" +#include "NvInferRuntimeCommon.h" +#include + +namespace mindspore::lite { +void SerializeValue(void **buffer, const void *value, size_t cpy_size); +void DeserializeValue(void const **buffer, size_t *buffer_size, void *value, size_t cpy_size); +class TensorRTPlugin : public nvinfer1::IPluginV2DynamicExt { + public: + TensorRTPlugin(const std::string &layer_name, const std::string &plugin_name, uint32_t device_id = 0) + : layer_name_(layer_name), plugin_name_(plugin_name), device_id_(device_id) {} + + // It doesn't make sense to make GeluPluginDynamic without arguments, so we delete + // default constructor. 
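+ // A concrete plugin is therefore always built with a layer name, a plugin name and (optionally) a
+ // CUDA device id; the plugin name is what getPluginType() reports and is the key TensorRT uses to
+ // look up the registered creator when deserializing an engine that contains this plugin.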
+ TensorRTPlugin() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) const + noexcept override; + + // IPluginV2 Methods + const char *getPluginType() const noexcept override; + const char *getPluginVersion() const noexcept override; + int getNbOutputs() const noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + void destroy() noexcept override; + void setPluginNamespace(const char *pluginNamespace) noexcept override; + const char *getPluginNamespace() const noexcept override; + + protected: + std::string layer_name_; + std::string name_space_; + std::string plugin_version_{"1"}; + std::string plugin_name_; + uint32_t device_id_{0}; +}; + +template +class TensorRTPluginCreater : public nvinfer1::IPluginCreator { + public: + explicit TensorRTPluginCreater(const std::string &plugin_name) : plugin_name_(plugin_name) { + // Fill PluginFieldCollection with PluginField arguments metadata + field_collection_.nbFields = fields_.size(); + field_collection_.fields = fields_.data(); + } + + const char *getPluginName() const noexcept override { return plugin_name_.c_str(); } + + const char *getPluginVersion() const noexcept override { return plugin_version_.c_str(); } + + const nvinfer1::PluginFieldCollection *getFieldNames() noexcept override { return &field_collection_; } + + void setPluginNamespace(const char *pluginNamespace) noexcept override { name_space_ = std::string(pluginNamespace); } + + const char *getPluginNamespace() const noexcept override { return name_space_.c_str(); } + + nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) noexcept { + return new (std::nothrow) T(name, fc); + } + + nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, size_t serialLength) noexcept { + return new (std::nothrow) T(name, serialData, serialLength); + } + + protected: + static nvinfer1::PluginFieldCollection field_collection_; + static std::vector fields_; + std::string name_space_; + std::string plugin_version_{"1"}; + std::string plugin_name_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TENSORRT_PLUGIN_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.cc new file mode 100644 index 00000000000..37225ef49b0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.cc @@ -0,0 +1,183 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/tile_tensorrt.h" +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" + +namespace mindspore::lite { +REGISTER_TENSORRT_PLUGIN(TilePluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int TileTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input tensor size: " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "invalid output tensor size: " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int TileTensorRT::AddInnerOp(TensorRTContext *ctx) { + auto repeats_tensor = in_tensors_[1]; + CHECK_NULL_RETURN(repeats_tensor.Data()); + if (repeats_tensor.ElementNum() != tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims) { + MS_LOG(ERROR) << op_name_ << " has input dims: " << tensorrt_in_tensors_[0].trt_tensor_->getDimensions().nbDims + << ", and invalid repeats cnt: " << repeats_tensor.ElementNum(); + return RET_ERROR; + } + int ret = ParseData2Vector(in_tensors_[1], &repeats_); + if (ret != RET_OK || repeats_.size() == 0) { + MS_LOG(ERROR) << op_name_ << " has invalid repeats tensor"; + return ret; + } + ITensorHelper tile_input; + + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], &tile_input); + if (ret != RET_OK || tile_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << op_name_ << " preprocess tensor failed."; + return RET_ERROR; + } + + return RunAsConcat(ctx, tile_input); +} +int TileTensorRT::RunAsConcat(TensorRTContext *ctx, const ITensorHelper &tile_input) { + int axis = -1; + float tile_times = 0.0f; + for (int i = 0; i < repeats_.size(); i++) { + if (repeats_[i] > 1) { + if (axis != -1) { + MS_LOG(ERROR) << op_name_ << " has more than one axis to tile"; + return RET_ERROR; + } + axis = i; + tile_times = repeats_[i]; + } + } + // concat + nvinfer1::ITensor *concat_inputs[1024]; + for (int i = 0; i < tile_times; i++) { + concat_inputs[i] = tile_input.trt_tensor_; + } + nvinfer1::IConcatenationLayer *concat_layer = ctx->network()->addConcatenation(concat_inputs, tile_times); + CHECK_NULL_RETURN(concat_layer); + concat_layer->setAxis(axis); + concat_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *tile_out = concat_layer->getOutput(0); + layer_ = concat_layer; + tile_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{tile_out, tile_input.format_, true}); + return RET_OK; +} +int TileTensorRT::RunAsPlugin(TensorRTContext *ctx, const ITensorHelper &tile_input) { + // Floating point Exception + nvinfer1::ITensor *inputTensors[] = {tile_input.trt_tensor_}; + auto plugin = std::make_shared(op_name_, repeats_, device_id_); + 
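+  // addPluginV2 wraps TilePlugin as a single-input custom layer; the repeat factors stay inside the
+  // plugin object and are written out again by TilePlugin::serialize() if the engine is serialized.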
nvinfer1::IPluginV2Layer *tile_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); + CHECK_NULL_RETURN(tile_layer); + nvinfer1::ITensor *tile_out = tile_layer->getOutput(0); + tile_layer->setName(op_name_.c_str()); + tile_out->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors(ITensorHelper{tile_out, tile_input.format_, true}); + this->layer_ = tile_layer; + return RET_OK; +} +// plugin + +int TilePlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + nvinfer1::Dims output_dims = outputDesc[0].dims; + nvinfer1::Dims input_dims = inputDesc[0].dims; + if (device_input_shape_ == nullptr) { + CUDA_CHECK(cudaMalloc(&device_input_shape_, input_dims.nbDims * sizeof(size_t))); + CHECK_NULL_RETURN(device_input_shape_); + } + if (device_output_shape_ == nullptr) { + CUDA_CHECK(cudaMalloc(&device_output_shape_, output_dims.nbDims * sizeof(size_t))); + CHECK_NULL_RETURN(device_output_shape_); + } + size_t input_shape[nvinfer1::Dims::MAX_DIMS]; + size_t output_shape[nvinfer1::Dims::MAX_DIMS]; + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape[i] = static_cast(input_dims.d[i]); + output_shape[i] = static_cast(output_dims.d[i]); + } + CUDA_CHECK(cudaMemcpy(device_input_shape_, input_shape, input_dims.nbDims * sizeof(size_t), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(device_output_shape_, output_shape, output_dims.nbDims * sizeof(size_t), cudaMemcpyHostToDevice)); + MS_LOG(ERROR) << layer_name_ << " has more axis to concat: " << repeats_.size(); + return RET_ERROR; +} + +nvinfer1::DimsExprs TilePlugin::getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs out_dims{}; + out_dims.nbDims = inputs[0].nbDims; + for (int i = 0; i < out_dims.nbDims; i++) { + auto repeat = exprBuilder.constant(repeats_[i]); + out_dims.d[i] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[0].d[i], *repeat); + } + return out_dims; +} + +nvinfer1::IPluginV2DynamicExt *TilePlugin::clone() const noexcept { + auto *plugin = new TilePlugin(*this); + plugin->setPluginNamespace(name_space_.c_str()); + return plugin; +} + +void TilePlugin::terminate() noexcept { + if (device_input_shape_ != nullptr) { + auto cuda_ret = cudaFree(device_input_shape_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "free cuda memory failed for " << layer_name_; + } + } + if (device_output_shape_ != nullptr) { + auto cuda_ret = cudaFree(device_output_shape_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "free cuda memory failed for " << layer_name_; + } + } +} + +size_t TilePlugin::getSerializationSize() const noexcept { return sizeof(float) * repeats_.size() + sizeof(size_t); } + +void TilePlugin::serialize(void *buffer) const noexcept { + size_t dims = repeats_.size(); + SerializeValue(&buffer, &dims, sizeof(size_t)); + for (float one_repeat : repeats_) { + SerializeValue(&buffer, &one_repeat, sizeof(float)); + } +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_TileFusion, TileTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.h new file mode 100644 index 00000000000..750d09e8bd9 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/tile_tensorrt.h @@ -0,0 +1,94 @@ +/** + * 
Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TILE_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TILE_TENSORRT_H_ +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_plugin.h" +#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/tile_impl.cuh" + +namespace mindspore::lite { +constexpr char *TILE_PLUGIN_NAME{"TilePluginCreater"}; +class TileTensorRT : public TensorRTOp { + public: + TileTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~TileTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int RunAsConcat(TensorRTContext *ctx, const ITensorHelper &tile_input); + int RunAsPlugin(TensorRTContext *ctx, const ITensorHelper &tile_input); + std::vector repeats_; +}; + +class TilePlugin : public TensorRTPlugin { + public: + explicit TilePlugin(const std::string name, const std::vector &repeats, uint32_t device_id) + : TensorRTPlugin(name, std::string(TILE_PLUGIN_NAME), device_id), repeats_(repeats) {} + + TilePlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(TILE_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + size_t dims = static_cast(fields[0].data)[0]; + for (size_t i = 0; i < dims; i++) { + float one_repeat = static_cast(fields[0].data)[i + 1]; + repeats_.push_back(one_repeat); + } + } + + TilePlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(TILE_PLUGIN_NAME)) { + size_t dims; + DeserializeValue(&serialData, &serialLength, &dims, sizeof(size_t)); + for (size_t i = 0; i < dims; i++) { + float one_repeat; + DeserializeValue(&serialData, &serialLength, &one_repeat, sizeof(float)); + repeats_.push_back(one_repeat); + } + } + + TilePlugin() = delete; + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + void terminate() noexcept override; + + private: + std::vector repeats_; + size_t *device_input_shape_{nullptr}; + size_t *device_output_shape_{nullptr}; 
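+  // Device-side copies of the input/output shapes, allocated on demand in enqueue() and released in terminate().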
+}; +class TilePluginCreater : public TensorRTPluginCreater { + public: + TilePluginCreater() : TensorRTPluginCreater(std::string(TILE_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TILE_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.cc new file mode 100644 index 00000000000..71da8be9555 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.cc @@ -0,0 +1,160 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/op/topk_tensorrt.h" + +namespace mindspore::lite { +int TopKTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1 && in_tensors.size() != INPUT_SIZE2) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} + +int TopKTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx->network() == nullptr || this->tensorrt_in_tensors_.size() != 1) { + MS_LOG(ERROR) << "network or input tensor is invalid"; + return RET_ERROR; + } + int ret = ParseParams(ctx); + if (ret != RET_OK) { + MS_LOG(ERROR) << "ParseParams failed for " << op_name_; + return ret; + } + + ITensorHelper topk_input; + ret = PreprocessInputs(ctx, &topk_input); + if (ret != RET_OK || topk_input.trt_tensor_ == nullptr) { + MS_LOG(ERROR) << "preprocess input failed for " << op_name_; + return ret; + } + axis_ = 1 << axis_value_; + MS_LOG(DEBUG) << "addTopK input " << GetTensorFormat(topk_input); + MS_LOG(DEBUG) << op_name_ << " has k: " << top_k_ << ", axis: " << axis_value_; + + nvinfer1::ITopKLayer *topk_layer = ctx->network()->addTopK(*topk_input.trt_tensor_, topk_op_, top_k_, axis_); + CHECK_NULL_RETURN(topk_layer); + this->layer_ = topk_layer; + topk_layer->setName(op_name_.c_str()); + nvinfer1::ITensor *value_out_tensor = topk_layer->getOutput(0); + nvinfer1::ITensor *index_out_tensor = topk_layer->getOutput(1); + // output 0 is data value, output 1 is index + + if (value_out_tensor->getDimensions().nbDims != out_tensors_[0].Shape().size()) { + nvinfer1::Dims out_dims = ConvertCudaDims(out_tensors_[0].Shape()); + out_dims.d[0] = value_out_tensor->getDimensions().d[0]; + value_out_tensor = Reshape(ctx, value_out_tensor, out_dims); + CHECK_NULL_RETURN(value_out_tensor); + value_out_tensor->setName((op_name_ + "_value_output").c_str()); + index_out_tensor = Reshape(ctx, index_out_tensor, out_dims); + CHECK_NULL_RETURN(index_out_tensor); + 
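+    // The indices output is reshaped with the same dims as the values output so both match the expected rank.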
index_out_tensor->setName((op_name_ + "_index_output").c_str()); + } + if (out_tensors_.size() == INPUT_SIZE2) { + AddInnerOutTensors(ITensorHelper{value_out_tensor, topk_input.format_, true}); + } + AddInnerOutTensors(ITensorHelper{index_out_tensor, topk_input.format_, true}); + return RET_OK; +} + +int TopKTensorRT::ParseParams(TensorRTContext *ctx) { + switch (type_) { + case schema::PrimitiveType_ArgMaxFusion: { + topk_op_ = nvinfer1::TopKOperation::kMAX; + auto max_prim = op_primitive_->value_as_ArgMaxFusion(); + CHECK_NULL_RETURN(max_prim); + axis_value_ = max_prim->axis(); + axis_value_ = axis_value_ > 0 ? axis_value_ : in_tensors_[0].Shape().size() + axis_value_; + top_k_ = max_prim->top_k(); + break; + } + case schema::PrimitiveType_ArgMinFusion: { + topk_op_ = nvinfer1::TopKOperation::kMIN; + auto mim_prim = op_primitive_->value_as_ArgMinFusion(); + CHECK_NULL_RETURN(mim_prim); + axis_value_ = mim_prim->axis(); + axis_value_ = axis_value_ > 0 ? axis_value_ : in_tensors_[0].Shape().size() + axis_value_; + top_k_ = mim_prim->top_k(); + break; + } + case schema::PrimitiveType_TopKFusion: { + auto topk_prim = op_primitive_->value_as_TopKFusion(); + CHECK_NULL_RETURN(topk_prim); + topk_op_ = topk_prim->largest() == 1 ? nvinfer1::TopKOperation::kMAX : nvinfer1::TopKOperation::kMIN; + axis_value_ = topk_prim->axis(); + axis_value_ = axis_value_ > 0 ? axis_value_ : in_tensors_[0].Shape().size() + axis_value_; + if (in_tensors_.size() < INPUT_SIZE2) { + MS_LOG(ERROR) << "invalid input size " << in_tensors_.size() << "for " << op_name_; + return RET_ERROR; + } + std::vector tmp(1); + int ret_k = ParseData2Vector(in_tensors_[1], &tmp); + if (ret_k != RET_OK) { + return ret_k; + } + top_k_ = tmp[0]; + break; + } + default: { + MS_LOG(ERROR) << op_name_ << " has more primitive type: " << schema::EnumNamePrimitiveType(type_); + return RET_ERROR; + } + } + // Currently reduceAxes must specify exactly one dimension, and it must be one of the last four dimensions. 
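+  // AddInnerOp later converts this value into the bit mask ITopKLayer expects (axis_ = 1 << axis_value_);
+  // only reduction along the innermost axis is accepted here.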
+ if (axis_value_ != in_tensors_[0].Shape().size() - 1) { + MS_LOG(ERROR) << op_name_ << " has unsupported axis : " << axis_value_; + return RET_ERROR; + } + return RET_OK; +} +int TopKTensorRT::PreprocessInputs(TensorRTContext *ctx, ITensorHelper *topk_input) { + auto input_dim = tensorrt_in_tensors_[0].trt_tensor_->getDimensions(); + int ret = RET_ERROR; + if (input_dim.nbDims == DIMENSION_4D) { + ret = PreprocessInputs2SameDim(ctx, tensorrt_in_tensors_[0], topk_input); + } else if (input_dim.nbDims < DIMENSION_4D) { + // only support 4d + nvinfer1::Dims4 expect_dim; + for (int i = 0; i < DIMENSION_4D; i++) { + if (i < input_dim.nbDims) { + expect_dim.d[DIMENSION_4D - 1 - i] = input_dim.d[input_dim.nbDims - 1 - i]; + } else { + expect_dim.d[DIMENSION_4D - 1 - i] = 1; + } + } + topk_input->trt_tensor_ = Reshape(ctx, tensorrt_in_tensors_[0].trt_tensor_, expect_dim); + CHECK_NULL_RETURN(topk_input->trt_tensor_); + axis_value_ += (DIMENSION_4D - input_dim.nbDims); + return RET_OK; + } else { + MS_LOG(ERROR) << op_name_ << " has invalid input dims: " << input_dim.nbDims; + } + return ret; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ArgMaxFusion, TopKTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ArgMinFusion, TopKTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_TopKFusion, TopKTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.h new file mode 100644 index 00000000000..5344d2fda93 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/topk_tensorrt.h @@ -0,0 +1,49 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TOPK_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TOPK_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class TopKTensorRT : public TensorRTOp { + public: + TopKTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~TopKTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + int ParseParams(TensorRTContext *ctx); + + int PreprocessInputs(TensorRTContext *ctx, ITensorHelper *topk_input); + + nvinfer1::TopKOperation topk_op_{nvinfer1::TopKOperation::kMAX}; + uint32_t axis_{0}; + int axis_value_{0}; + int32_t top_k_{0}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_TOPK_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.cc b/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.cc new file mode 100644 index 00000000000..14300dfe687 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.cc @@ -0,0 +1,84 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/op/unary_tensorrt.h" + +namespace mindspore::lite { +int UnaryTensorRT::IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (!IsShapeKnown()) { + MS_LOG(ERROR) << "Unsupported input tensor unknown shape: " << op_name_; + return RET_ERROR; + } + if (in_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + } + auto it = unary_ops_.find(primitive->value_type()); + if (it != unary_ops_.end()) { + unary_op_ = it->second; + } else { + MS_LOG(ERROR) << "unsupported unary ops type: " << schema::EnumNamePrimitiveType(primitive->value_type()); + return RET_ERROR; + } + return RET_OK; +} + +int UnaryTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr || this->tensorrt_in_tensors_.size() != 1) { + MS_LOG(ERROR) << "network or input tensor is invalid"; + return RET_ERROR; + } + nvinfer1::IUnaryLayer *cal_layer = ctx->network()->addUnary(*tensorrt_in_tensors_[0].trt_tensor_, unary_op_); + if (cal_layer == nullptr) { + MS_LOG(ERROR) << "addUnary failed for: " << op_name_; + return RET_ERROR; + } + cal_layer->setName(op_name_.c_str()); + this->layer_ = cal_layer; + if (type_ == schema::PrimitiveType_ExpFusion) { + auto exp_op = op_primitive_->value_as_ExpFusion(); + CHECK_NULL_RETURN(exp_op); + float scale = exp_op->scale(); + float shift = exp_op->shift(); + float base = exp_op->base(); + if (scale != 1.0f || shift != 0.0f || base != -1.0f) { + MS_LOG(ERROR) << op_name_ << " has fusion to calculate."; + return RET_ERROR; + } + } + + nvinfer1::ITensor *op_out_tensor = cal_layer->getOutput(0); + op_out_tensor->setName((op_name_ + "_output").c_str()); + this->AddInnerOutTensors( + ITensorHelper{op_out_tensor, tensorrt_in_tensors_[0].format_, tensorrt_in_tensors_[0].same_format_}); + return RET_OK; +} +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Sqrt, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Abs, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Neg, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Log, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Sin, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Cos, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Ceil, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_Floor, UnaryTensorRT) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_ExpFusion, UnaryTensorRT) +#if TRT_VERSION_GE(7, 2) +REGISTER_TENSORRT_CREATOR(schema::PrimitiveType_LogicalNot, UnaryTensorRT) +#endif +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.h b/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.h new file mode 100644 index 00000000000..5f7f18f6908 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/op/unary_tensorrt.h @@ -0,0 +1,56 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_UNARY_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_UNARY_TENSORRT_H_ +#include +#include +#include +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" + +namespace mindspore::lite { +class UnaryTensorRT : public TensorRTOp { + public: + UnaryTensorRT(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors, const std::string &name, + const schema::QuantType &quant_type) + : TensorRTOp(primitive, in_tensors, out_tensors, name, quant_type) {} + + ~UnaryTensorRT() override = default; + + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const schema::Primitive *primitive, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + std::map unary_ops_ = { + {schema::PrimitiveType_Sqrt, nvinfer1::UnaryOperation::kSQRT}, + {schema::PrimitiveType_Abs, nvinfer1::UnaryOperation::kABS}, + {schema::PrimitiveType_Neg, nvinfer1::UnaryOperation::kNEG}, + {schema::PrimitiveType_Log, nvinfer1::UnaryOperation::kLOG}, + {schema::PrimitiveType_Sin, nvinfer1::UnaryOperation::kSIN}, + {schema::PrimitiveType_Cos, nvinfer1::UnaryOperation::kCOS}, + {schema::PrimitiveType_Ceil, nvinfer1::UnaryOperation::kCEIL}, + {schema::PrimitiveType_Floor, nvinfer1::UnaryOperation::kFLOOR}, + {schema::PrimitiveType_ExpFusion, nvinfer1::UnaryOperation::kEXP}, +#if TRT_VERSION_GE(7, 2) + {schema::PrimitiveType_LogicalNot, nvinfer1::UnaryOperation::kNOT}, +#endif + }; + nvinfer1::UnaryOperation unary_op_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_OP_UNARY_TENSORRT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.cc new file mode 100644 index 00000000000..ab40a64b4b8 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.cc @@ -0,0 +1,150 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/tensorrt_allocator.h" +#include +#include +#include "src/common/log_adapter.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/cuda_impl/cast.cuh" + +namespace mindspore::lite { +void *TensorRTAllocator::MallocDeviceMem(const mindspore::MSTensor &host_tensor, size_t size) { + if (host_tensor == NULL) { + return nullptr; + } + return MallocDeviceMem(host_tensor.Name(), size, ConvertDataType(host_tensor.DataType())); +} + +void *TensorRTAllocator::MallocDeviceMem(const std::string &name, size_t size, nvinfer1::DataType data_type) { + if (cuda_tensor_map_.find(name) != cuda_tensor_map_.end() && size <= cuda_tensor_map_[name].size) { + MS_LOG(DEBUG) << "tensor :" << name << " has already in cuda Allocator pool."; + return cuda_tensor_map_[name].data; + } + void *device_ptr = nullptr; + auto cuda_ret = cudaMalloc(&device_ptr, size); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda Malloc failed for size:" << size; + return nullptr; + } + MS_LOG(INFO) << "cudaMalloc size: " << size << " for " << name; + if (cuda_tensor_map_[name].data != nullptr) { + cuda_ret = cudaFree(cuda_tensor_map_[name].data); + if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) { + MS_LOG(ERROR) << "free old cuda device_ptr failed for " << cudaGetErrorName(cuda_ret); + cuda_ret = cudaFree(device_ptr); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "free new cuda device_ptr failed for " << cudaGetErrorName(cuda_ret); + return nullptr; + } + return nullptr; + } + } + cuda_tensor_map_[name].data = device_ptr; + cuda_tensor_map_[name].is_valid_mem = false; + cuda_tensor_map_[name].size = size; + return device_ptr; +} + +void TensorRTAllocator::MarkMemValid(const std::string &name, bool isValid) { + cuda_tensor_map_[name].is_valid_mem = isValid; + return; +} + +bool TensorRTAllocator::GetMemIsValid(const std::string &name) { + if (cuda_tensor_map_.find(name) == cuda_tensor_map_.end()) { + MS_LOG(WARNING) << "tensor :" << name << " not in cuda Allocator pool."; + return false; + } + return cuda_tensor_map_[name].is_valid_mem; +} + +void *TensorRTAllocator::GetDevicePtr(const std::string &tensor_name) { + if (tensor_name.empty()) { + return nullptr; + } + if (cuda_tensor_map_.find(tensor_name) == cuda_tensor_map_.end()) { + return nullptr; + } + return this->cuda_tensor_map_.find(tensor_name)->second.data; +} + +int TensorRTAllocator::SyncMemInHostAndDevice(mindspore::MSTensor host_tensor, const std::string &device_tensor_name, + bool is_host2device, bool sync) { + if (host_tensor == NULL) { + MS_LOG(ERROR) << "host tensor is null."; + return RET_ERROR; + } +#if TRT_VERSION_GE(7, 2) + if (host_tensor.DataType() == DataType::kNumberTypeBool && !is_host2device) { + CudaTensorParam ¤t_cuda_tensor = cuda_tensor_map_.find(device_tensor_name)->second; + auto device_ptr = current_cuda_tensor.data; + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "device_ptr is null for " << device_tensor_name; + return RET_ERROR; + } + Cast(host_tensor.DataSize(), static_cast(device_ptr), static_cast(device_ptr), + stream_); + } +#endif + return SyncMemInHostAndDevice(host_tensor.MutableData(), device_tensor_name, host_tensor.DataSize(), is_host2device, + sync); +} + +int TensorRTAllocator::SyncMemInHostAndDevice(void *host_data, const std::string &device_tensor_name, size_t data_size, + bool is_host2device, bool sync) { + if (host_data == nullptr || cuda_tensor_map_.find(device_tensor_name) == cuda_tensor_map_.end()) { + 
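+    // Copying requires both a valid host pointer and a device entry already present in cuda_tensor_map_
+    // (normally created by MallocDeviceMem).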
MS_LOG(ERROR) << " host or device ptr is null."; + return RET_ERROR; + } + CudaTensorParam ¤t_cuda_tensor = cuda_tensor_map_.find(device_tensor_name)->second; + // is memcpy from device to host, the host mem is valid, change tag for mem pool. + current_cuda_tensor.is_valid_mem = is_host2device ? current_cuda_tensor.is_valid_mem : true; + if (is_host2device && current_cuda_tensor.is_valid_mem) { + MS_LOG(DEBUG) << "no need memcpy for: " << device_tensor_name; + return RET_OK; + } + auto device_ptr = current_cuda_tensor.data; + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "device_ptr is null for " << device_tensor_name; + return RET_ERROR; + } + + void *src_ptr = is_host2device ? host_data : device_ptr; + void *dst_ptr = is_host2device ? device_ptr : host_data; + cudaMemcpyKind kind = is_host2device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost; + auto cuda_ret = cudaMemcpy(dst_ptr, src_ptr, data_size, kind); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "copy mem failed,ret " << cudaGetErrorName(cuda_ret); + return RET_ERROR; + } + MS_LOG(INFO) << "cuda memcpy success for " << device_tensor_name; + return RET_OK; +} + +int TensorRTAllocator::ClearDeviceMem() { + for (auto &iter : cuda_tensor_map_) { + auto cuda_ret = cudaFree(iter.second.data); + if (cuda_ret != cudaSuccess && cuda_ret != cudaErrorCudartUnloading) { + MS_LOG(WARNING) << "free cuda failed for " << cudaGetErrorName(cuda_ret); + } + iter.second.data = nullptr; + iter.second.is_valid_mem = false; + } + return RET_OK; +} +std::map TensorRTAllocator::GetAllDevicePtr() { return this->cuda_tensor_map_; } +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.h new file mode 100644 index 00000000000..c0c592019ab --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_allocator.h @@ -0,0 +1,64 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_ALLOCATOR_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_ALLOCATOR_H_ +#include "src/runtime/delegate/tensorrt/tensorrt_allocator.h" +#include +#include +#include +#include "include/api/types.h" + +namespace mindspore::lite { +struct CudaTensorParam { + void *data = nullptr; + bool is_valid_mem = false; + size_t size = 0; +}; +class TensorRTAllocator { + public: + TensorRTAllocator() = default; + + ~TensorRTAllocator() = default; + + void *MallocDeviceMem(const mindspore::MSTensor &host_tensor, size_t size); + + void *MallocDeviceMem(const std::string &name, size_t size, nvinfer1::DataType data_type); + + void *GetDevicePtr(const std::string &tensor_name); + + void SetCudaStream(cudaStream_t stream) { stream_ = stream; } + + std::map GetAllDevicePtr(); + + int SyncMemInHostAndDevice(mindspore::MSTensor host_tensor, const std::string &device_tensor_name, + bool is_host2device, bool sync = true); + + int SyncMemInHostAndDevice(void *host_data, const std::string &device_tensor_name, size_t data_size, + bool is_host2device, bool sync = true); + + int ClearDeviceMem(); + + void MarkMemValid(const std::string &name, bool isValid); + + bool GetMemIsValid(const std::string &name); + + private: + std::map cuda_tensor_map_; + cudaStream_t stream_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_ALLOCATOR_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.cc new file mode 100644 index 00000000000..e13b08997ba --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.cc @@ -0,0 +1,56 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/tensorrt_context.h" + +namespace mindspore::lite { +TensorRTContext::~TensorRTContext() { + if (network_ != nullptr) { + network_->destroy(); + network_ = nullptr; + } +} + +bool TensorRTContext::Init() { + network_ = runtime_->GetBuilder()->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + if (network_ == nullptr) { + MS_LOG(ERROR) << "New network init failed."; + return false; + } + return true; +} + +void TensorRTContext::SetRuntime(TensorRTRuntime *runtime) { runtime_ = runtime; } + +nvinfer1::INetworkDefinition *TensorRTContext::network() { return network_; } + +void TensorRTContext::RegisterLayer(nvinfer1::ILayer *layer, const std::string &basename) { + if (layer == nullptr) { + MS_LOG(ERROR) << "Register null layer!"; + return; + } + layer->setName((basename + "_" + std::to_string(counter_++)).c_str()); +} + +void TensorRTContext::RegisterTensor(nvinfer1::ITensor *tensor, const std::string &basename) { + if (tensor == nullptr) { + MS_LOG(ERROR) << "Register null tensor!"; + return; + } + tensor->setName((basename + "_" + std::to_string(counter_++)).c_str()); +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.h new file mode 100644 index 00000000000..bbcba89b223 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_context.h @@ -0,0 +1,40 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_TENSORRT_CONTEXT_H_ +#define MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_TENSORRT_CONTEXT_H_ + +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" + +namespace mindspore::lite { +class TensorRTContext { + public: + TensorRTContext() = default; + ~TensorRTContext(); + bool Init(); + void SetRuntime(TensorRTRuntime *runtime); + nvinfer1::INetworkDefinition *network(); + void RegisterLayer(nvinfer1::ILayer *layer, const std::string &basename); + void RegisterTensor(nvinfer1::ITensor *tensor, const std::string &basename); + + private: + int counter_{0}; + nvinfer1::INetworkDefinition *network_{nullptr}; + TensorRTRuntime *runtime_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_DELEGATE_TENSORRT_TENSORRT_CONTEXT_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.cc new file mode 100644 index 00000000000..1882681a8b6 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.cc @@ -0,0 +1,243 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/tensorrt_delegate.h" +#include +#include +#include +#include +#include "src/runtime/delegate/delegate_utils.h" +#include "src/runtime/delegate/auto_registration_factory.h" + +namespace mindspore::lite { +TensorRTDelegate::~TensorRTDelegate() { + if (runtime_ != nullptr) { + delete runtime_; + } + if (stream_ != nullptr) { + cudaStreamDestroy(stream_); + } +} +bool IsHardwareSupport() { + int driver_version = 0; + int ret = cudaDriverGetVersion(&driver_version); + if (ret != cudaSuccess || driver_version == 0) { + MS_LOG(WARNING) << "No nvidia GPU driver."; + return false; + } + return true; +} + +Status TensorRTDelegate::Init() { + if (!IsHardwareSupport()) { + return mindspore::kLiteNotSupport; + } + std::vector> device_list = context_->MutableDeviceInfo(); + auto iter = std::find_if(device_list.begin(), device_list.end(), [](std::shared_ptr device) { + return device->GetDeviceType() == DeviceType::kGPU; + }); + if (iter == device_list.end()) { + MS_LOG(ERROR) << "no gpu device info found for TensorRT."; + return mindspore::kLiteError; + } + auto gpu_info = (*iter)->Cast(); + if (gpu_info == nullptr) { + MS_LOG(ERROR) << "no gpu device info found for TensorRT."; + return mindspore::kLiteError; + } + device_info_ = gpu_info; + int ret = lite::SetCudaDevice(device_info_); + if (ret != RET_OK) { + return mindspore::kLiteError; + } + if (runtime_ == nullptr) { + runtime_ = new (std::nothrow) TensorRTRuntime(); + if (runtime_ == nullptr) { + MS_LOG(ERROR) << "create TensorRTRuntime failed."; + return mindspore::kLiteError; + } + } + if (runtime_->Init() != RET_OK) { + MS_LOG(ERROR) << "TensorRTRuntime init failed."; + return mindspore::kLiteError; + } + runtime_->SetDeviceID(device_info_->GetDeviceID()); + + auto cuda_ret = cudaStreamCreate(&stream_); + if (cuda_ret != cudaSuccess) { + MS_LOG(ERROR) << "Cuda create stream failed"; + return mindspore::kLiteError; + } + + cache_mgr_ = std::make_shared(); + if (cache_mgr_ == nullptr) { + MS_LOG(ERROR) << "malloc EmbeddingCacheManager failed."; + return kLiteMemoryFailed; + } + auto cache_ret = cache_mgr_->Init(cache_model_path_, vocab_size_, device_cache_size_); + if (cache_ret != mindspore::kSuccess) { + MS_LOG(ERROR) << "cache_mgr_ init failed."; + return cache_ret; + } + + return mindspore::kSuccess; +} + +Status TensorRTDelegate::BuildSubGraph(DelegateModel *model) { + KernelIter from, end; + std::vector tensorrt_ops; + int tensorrt_subgraph_index = 0; + for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) { + kernel::Kernel *kernel = *iter; + auto tensorrt_op = FindTensorRTOp(kernel, model->GetPrimitive(kernel)); + if (tensorrt_op != nullptr) { + if (cache_mgr_->CheckIsCacheKernel(kernel)) { + auto cache_ret = cache_mgr_->InitCacheKernel(kernel, device_info_->GetDeviceID(), &stream_); + if (cache_ret != kSuccess) { + MS_LOG(ERROR) << "InitCacheKernel failed " << kernel->name(); + return cache_ret; + } + } + + // If tensorrt_ops does not equal nullptr, this kernel can be supported by delegate + if (tensorrt_ops.size() == 0) { + from = 
iter; + } + tensorrt_op->SetRuntime(this->runtime_); + tensorrt_ops.push_back(tensorrt_op); + end = iter; + } else { + if (tensorrt_ops.size() > 0) { + auto tensorrt_subgraph = CreateTensorRTGraph(tensorrt_ops, model, from, end, tensorrt_subgraph_index); + if (tensorrt_subgraph == nullptr) { + MS_LOG(ERROR) << "Create TensorRT Graph failed."; + return mindspore::kLiteNullptr; + } + tensorrt_subgraph_index++; + iter = model->Replace(from, end + 1, tensorrt_subgraph); + tensorrt_ops.clear(); + } + } + } + if (tensorrt_ops.size() > 0) { + auto tensorrt_subgraph = CreateTensorRTGraph(tensorrt_ops, model, from, end, tensorrt_subgraph_index); + if (tensorrt_subgraph == nullptr) { + MS_LOG(ERROR) << "Create TensorRT Graph failed."; + return mindspore::kLiteNullptr; + } + model->Replace(from, end + 1, tensorrt_subgraph); + tensorrt_ops.clear(); + } + return mindspore::kSuccess; +} + +Status TensorRTDelegate::Build(DelegateModel *model) { + int ret = lite::SetCudaDevice(device_info_); + if (ret != RET_OK) { + return mindspore::kLiteError; + } + if (cache_model_path_.empty() && vocab_size_ > 0) { + auto cache_ret = cache_mgr_->Init(model, vocab_size_, device_cache_size_); + if (cache_ret != mindspore::kSuccess) { + MS_LOG(ERROR) << "cache_mgr_ init failed."; + return cache_ret; + } + } + + auto build_ret = BuildSubGraph(model); + if (build_ret != kSuccess) { + MS_LOG(INFO) << "BuildSubGraph failed"; + return build_ret; + } + + return mindspore::kSuccess; +} + +TensorRTOp *TensorRTDelegate::FindTensorRTOp(kernel::Kernel *kernel, const schema::Primitive *primitive) { + auto in_tensors = kernel->inputs(); + auto out_tensors = kernel->outputs(); + auto name = kernel->name(); + auto node_type = primitive->value_type(); + auto &plugin_factory = AutoRegistrationFactory::Get(); + if (plugin_factory.HasKey(node_type)) { + TensorRTOp *tensorrt_op = + plugin_factory.GetCreator(node_type)(primitive, in_tensors, out_tensors, name, kernel->quant_type()); + if (tensorrt_op == nullptr) { + return nullptr; + } + if (!support_resize_) { + return tensorrt_op; + } + support_resize_ = tensorrt_op->GetDynamicShapeParams().support_dynamic_ ? support_resize_ : false; + if (!tensorrt_op->GetDynamicShapeParams().support_dynamic_) { + MS_LOG(WARNING) << "TensorRT subgraph don't support dynamic shape resize, because of op " << name; + support_hw_resize_ = false; + return tensorrt_op; + } + if (!support_hw_resize_) { + return tensorrt_op; + } + support_hw_resize_ = tensorrt_op->GetDynamicShapeParams().support_hw_dynamic_ ? support_hw_resize_ : false; + if (!tensorrt_op->GetDynamicShapeParams().support_hw_dynamic_) { + MS_LOG(WARNING) << "TensorRT subgraph don't support dynamic hw dims resize, because of op " << name; + } + return tensorrt_op; + } else { + MS_LOG(WARNING) << "Unsupported op type for TensorRT. 
kernel->name:" << kernel->name() + << " type:" << schema::EnumNamePrimitiveType(primitive->value_type()); + return nullptr; + } +} + +TensorRTSubGraph *TensorRTDelegate::CreateTensorRTGraph(const std::vector &ops, + DelegateModel *model, KernelIter from, + KernelIter end, int index) { + auto in_tensors = GraphInTensors(ops, model, from, end); + auto out_tensors = GraphOutTensors(ops, model, from, end); + auto *tensorrt_graph = new (std::nothrow) TensorRTSubGraph(ops, in_tensors, out_tensors, context_, device_info_, + runtime_, support_resize_, support_hw_resize_); + if (tensorrt_graph == nullptr) { + MS_LOG(ERROR) << "new tensorrt_graph failed."; + return nullptr; + } + tensorrt_graph->SetCacheManager(cache_mgr_); + if (serialize_path_.size() > 0) { + tensorrt_graph->SetSerializePath(serialize_path_ + "_trt" + std::to_string(GetRankID()) + ".bin_" + + std::to_string(index)); + } + + // 1. For every op, find pre and next ops + FindPreNextOps(ops); + + // 2. Init TensorRT SubGraph. + auto ret = tensorrt_graph->Init(stream_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "TensorRTGraph init failed."; + delete tensorrt_graph; + return nullptr; + } + + // 3. Build TensorRT Model. + ret = tensorrt_graph->BuildTensorRTGraph(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "TensorRTGraph build failed."; + delete tensorrt_graph; + return nullptr; + } + + return tensorrt_graph; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.h new file mode 100644 index 00000000000..aa543a669ff --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_delegate.h @@ -0,0 +1,70 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_DELEGATE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_DELEGATE_H_ +#include +#include +#include +#include +#include +#include "include/api/delegate.h" +#include "src/runtime/delegate/tensorrt/tensorrt_subgraph.h" +#include "src/runtime/delegate/parameter_cache/embedding_cache_manager.h" +#include "include/api/kernel.h" +#include "include/errorcode.h" +#include "src/common/log_adapter.h" +#include "include/api/context.h" + +namespace mindspore::lite { +class TensorRTDelegate : public Delegate { + public: + explicit TensorRTDelegate(mindspore::Context *context, const std::string &cache_model_path, size_t vocab_size, + size_t device_cache_size, const std::string &serialize_path) + : context_(context), + cache_model_path_(cache_model_path), + vocab_size_(vocab_size), + device_cache_size_(device_cache_size), + serialize_path_(serialize_path) {} + + ~TensorRTDelegate() override; + + Status Init() override; + + Status Build(DelegateModel *model) override; + + private: + Status BuildSubGraph(DelegateModel *model); + + TensorRTOp *FindTensorRTOp(kernel::Kernel *kernel, const schema::Primitive *primitive); + + TensorRTSubGraph *CreateTensorRTGraph(const std::vector &ops, DelegateModel *model, + KernelIter from, KernelIter end, int index); + + std::unordered_map op_func_lists_; + mindspore::Context *context_{nullptr}; + std::shared_ptr device_info_{nullptr}; + TensorRTRuntime *runtime_{nullptr}; + bool support_hw_resize_{true}; + bool support_resize_{true}; + const std::string cache_model_path_; + size_t vocab_size_{0}; + size_t device_cache_size_{0}; + std::shared_ptr cache_mgr_{nullptr}; + const std::string serialize_path_; + cudaStream_t stream_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_DELEGATE_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.cc new file mode 100644 index 00000000000..73b0dc31287 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.cc @@ -0,0 +1,52 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" +#include +#include + +namespace mindspore::lite { +int TensorRTRuntime::Init() { + if (is_init_) { + return RET_OK; + } + builder_ = nvinfer1::createInferBuilder(this->logger_); + if (builder_ == nullptr) { + MS_LOG(ERROR) << "create infer builder failed."; + return RET_ERROR; + } + builder_->setMaxBatchSize(MAX_BATCH_SIZE); + allocator_ = new (std::nothrow) TensorRTAllocator(); + if (allocator_ == nullptr) { + MS_LOG(ERROR) << "Create allocator failed."; + return RET_ERROR; + } + is_init_ = true; + return RET_OK; +} + +TensorRTRuntime::~TensorRTRuntime() { + if (builder_ != nullptr) { + builder_->destroy(); + builder_ = nullptr; + } + if (allocator_ != nullptr) { + allocator_->ClearDeviceMem(); + delete allocator_; + allocator_ = nullptr; + } +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.h new file mode 100644 index 00000000000..29ccd3f701b --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_runtime.h @@ -0,0 +1,82 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_RUNTIME_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_RUNTIME_H_ +#include +#include "include/errorcode.h" +#include "src/runtime/delegate/tensorrt/tensorrt_allocator.h" +#include "src/common/log_adapter.h" +#define MAX_BATCH_SIZE 64 + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::lite { +class TensorRTLogger : public nvinfer1::ILogger { + void log(Severity severity, const char *msg) noexcept override { + if (severity == Severity::kINTERNAL_ERROR || severity == Severity::kERROR) { + MS_LOG(ERROR) << msg; + } else if (severity == Severity::kWARNING) { + MS_LOG(WARNING) << msg; + } else if (severity == Severity::kINFO) { + MS_LOG(INFO) << msg; + } else { + MS_LOG(DEBUG) << msg; + } + } +}; + +enum RuntimePrecisionMode : int { RuntimePrecisionMode_FP32, RuntimePrecisionMode_FP16 }; + +class TensorRTRuntime { + public: + TensorRTRuntime() = default; + + ~TensorRTRuntime(); + + int Init(); + + nvinfer1::IBuilder *GetBuilder() { return this->builder_; } + + int GetBatchSize() { return batch_size_; } + + void SetBatchSize(int batch_size) { batch_size_ = batch_size; } + + void SetCudaStream(cudaStream_t stream) { allocator_->SetCudaStream(stream); } + + RuntimePrecisionMode GetRuntimePrecisionMode() { return runtime_percision_mode_; } + + void SetRuntimePrecisionMode(RuntimePrecisionMode runtime_percision_mode) { + runtime_percision_mode_ = runtime_percision_mode; + } + + TensorRTAllocator *GetAllocator() { return this->allocator_; } + + void SetDeviceID(uint32_t device_id) { device_id_ = device_id; } + + uint32_t GetDeviceID() { return device_id_; } + + private: + bool is_init_ = false; + nvinfer1::IBuilder 
*builder_{nullptr}; + TensorRTLogger logger_; + TensorRTAllocator *allocator_{nullptr}; + int batch_size_{0}; + uint32_t device_id_{0}; + RuntimePrecisionMode runtime_percision_mode_{RuntimePrecisionMode::RuntimePrecisionMode_FP32}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_RUNTIME_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.cc new file mode 100644 index 00000000000..8047cac6e9e --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.cc @@ -0,0 +1,63 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/delegate/tensorrt/tensorrt_serializer.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" +#include "src/common/file_utils.h" + +namespace mindspore::lite { +nvinfer1::ICudaEngine *TensorRTSerializer::GetSerializedEngine() { + if (serialize_file_path_.size() == 0) { + return nullptr; + } + void *trt_model_stream{nullptr}; + size_t size{0}; + trt_model_stream = ReadFile(serialize_file_path_.c_str(), &size); + if (trt_model_stream == nullptr || size == 0) { + MS_LOG(WARNING) << "read engine file failed : " << serialize_file_path_; + return nullptr; + } + nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger_); + if (runtime == nullptr) { + delete[] trt_model_stream; + MS_LOG(ERROR) << "createInferRuntime failed."; + return nullptr; + } + nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(trt_model_stream, size, nullptr); + delete[] trt_model_stream; + runtime->destroy(); + return engine; +} +void TensorRTSerializer::SaveSerializedEngine(nvinfer1::ICudaEngine *engine) { + if (serialize_file_path_.size() == 0) { + return; + } + nvinfer1::IHostMemory *ptr = engine->serialize(); + if (ptr == nullptr) { + MS_LOG(ERROR) << "serialize engine failed"; + return; + } + + int ret = WriteToBin(serialize_file_path_, ptr->data(), ptr->size()); + if (ret != RET_OK) { + MS_LOG(ERROR) << "save engine failed " << serialize_file_path_; + } else { + MS_LOG(INFO) << "save engine to " << serialize_file_path_; + } + ptr->destroy(); + return; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.h new file mode 100644 index 00000000000..d5ae0b1baf8 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_serializer.h @@ -0,0 +1,45 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
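A short sketch of the intended TensorRTRuntime lifecycle, based on the class declared above: one runtime instance is created per delegate, Init() creates the nvinfer1::IBuilder and the allocator exactly once, and precision, batch size and device id are shared by all subgraphs. The function name and the prefer_fp16 flag below are illustrative.

// Illustrative sketch: one TensorRTRuntime shared by all subgraphs of a delegate.
#include <new>
#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h"

int PrepareRuntime(mindspore::lite::TensorRTRuntime **out_runtime, uint32_t device_id, bool prefer_fp16) {
  auto *runtime = new (std::nothrow) mindspore::lite::TensorRTRuntime();
  if (runtime == nullptr || runtime->Init() != mindspore::lite::RET_OK) {
    delete runtime;
    return mindspore::lite::RET_ERROR;
  }
  runtime->SetDeviceID(device_id);
  if (prefer_fp16) {
    // The subgraph still verifies platformHasFastFp16() before setting the kFP16 builder flag.
    runtime->SetRuntimePrecisionMode(mindspore::lite::RuntimePrecisionMode_FP16);
  }
  *out_runtime = runtime;
  return mindspore::lite::RET_OK;
}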
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SERIALIZER_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SERIALIZER_H_ +#include +#include +#include +#include "include/errorcode.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::lite { +class TensorRTSerializer { + public: + explicit TensorRTSerializer(const std::string &serialize_file_path) + : serialize_file_path_(std::move(serialize_file_path)) {} + + ~TensorRTSerializer() = default; + + nvinfer1::ICudaEngine *GetSerializedEngine(); + + void SaveSerializedEngine(nvinfer1::ICudaEngine *engine); + + private: + std::string serialize_file_path_; + TensorRTLogger logger_; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SERIALIZER_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.cc new file mode 100644 index 00000000000..9fbceab0d3c --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.cc @@ -0,0 +1,681 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
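The serializer declared here provides a simple file-based engine cache: GetSerializedEngine() returns nullptr when no cache file exists (or the path is empty), and SaveSerializedEngine() is best effort and only logs on failure. A sketch of the intended round trip; BuildEngineOnline is a placeholder for the network build done by TensorRTSubGraph.

// Illustrative sketch: engine-cache round trip with TensorRTSerializer.
// BuildEngineOnline() is a placeholder for builder->buildEngineWithConfig(network, config).
#include <string>
#include <NvInfer.h>
#include "src/runtime/delegate/tensorrt/tensorrt_serializer.h"

nvinfer1::ICudaEngine *BuildEngineOnline();  // placeholder, defined elsewhere

nvinfer1::ICudaEngine *LoadOrBuildEngine(const std::string &path) {
  mindspore::lite::TensorRTSerializer serializer(path);
  nvinfer1::ICudaEngine *engine = serializer.GetSerializedEngine();  // nullptr when no cache file exists
  if (engine != nullptr) {
    return engine;  // skip network parsing and builder optimization entirely
  }
  engine = BuildEngineOnline();
  if (engine != nullptr) {
    serializer.SaveSerializedEngine(engine);  // no-op when the path is empty
  }
  return engine;
}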
+ */ + +#include "src/runtime/delegate/tensorrt/tensorrt_subgraph.h" +#include +#include +#include +#include +#include +#include +#include "src/runtime/delegate/delegate_utils.h" + +namespace mindspore::lite { +TensorRTSubGraph::~TensorRTSubGraph() { + if (ctx_ != nullptr) { + delete ctx_; + } + if (config_ != nullptr) { + config_->destroy(); + config_ = nullptr; + } + if (trt_context_ != nullptr) { + trt_context_->destroy(); + trt_context_ = nullptr; + } + if (engine_ != nullptr) { + engine_->destroy(); + engine_ = nullptr; + } + if (tensor_bindings_ != nullptr) { + delete[] tensor_bindings_; + tensor_bindings_ = nullptr; + } + for (auto op : all_ops_) { + delete op; + } +} + +int TensorRTSubGraph::Init(cudaStream_t stream) { + auto ret = GetGraphInOutOps(inputs_, outputs_, &in_ops_, &out_ops_, all_ops_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Get TensorRT subgraph input and output ops failed."; + return RET_ERROR; + } + profile_ = runtime_->GetBuilder()->createOptimizationProfile(); + if (profile_ == nullptr) { + MS_LOG(ERROR) << "createOptimizationProfile failed."; + return RET_ERROR; + } + ctx_ = new (std::nothrow) TensorRTContext(); + if (ctx_ == nullptr) { + MS_LOG(ERROR) << "New TensorRTContext failed."; + return RET_OK; + } + ctx_->SetRuntime(runtime_); + if (!ctx_->Init()) { + MS_LOG(ERROR) << "New TensorRTContext failed."; + return RET_OK; + } + if (SetDeviceConfig(stream) != RET_OK) { + MS_LOG(WARNING) << "set tensorrt config failed."; + } + serializer_ = std::make_shared(serialize_file_path_); + if (serializer_ == nullptr) { + MS_LOG(ERROR) << "create Serializer failed."; + return RET_ERROR; + } + engine_ = serializer_->GetSerializedEngine(); + if (engine_ != nullptr) { + MS_LOG(INFO) << "using serialized engine " << serialize_file_path_; + return RET_OK; + } + for (size_t i = 0; i < inputs_.size(); i++) { + if (inputs_[i].Shape().size() != DIMENSION_4D) { + input_hw_index_ = -1; + } + } + return RET_OK; +} + +int TensorRTSubGraph::BuildEngine() { + // print all network ops + if (this->config_->addOptimizationProfile(profile_) == -1) { + MS_LOG(ERROR) << "addOptimizationProfile failed."; + return RET_ERROR; + } + MS_LOG(INFO) << "build engine for tensorrt network: " << ctx_->network()->getName(); + for (int i = 0; i < ctx_->network()->getNbLayers(); i++) { + MS_LOG(DEBUG) << "tensorrt op: " << ctx_->network()->getLayer(i)->getName(); + } + MS_LOG(DEBUG) << "end of tensorrt network: " << ctx_->network()->getName(); + + this->engine_ = runtime_->GetBuilder()->buildEngineWithConfig(*ctx_->network(), *this->config_); + if (this->engine_ == nullptr) { + MS_LOG(ERROR) << "Create engine failed in TensorRT network"; + return RET_ERROR; + } + if (serialize_file_path_.size() > 0) { + serializer_->SaveSerializedEngine(engine_); + } + return RET_OK; +} + +int TensorRTSubGraph::SetDeviceConfig(cudaStream_t stream) { + if (config_ == nullptr) { + this->config_ = runtime_->GetBuilder()->createBuilderConfig(); + if (this->config_ == nullptr) { + MS_LOG(ERROR) << "create builder config failed."; + return RET_ERROR; + } + } + // set fp16 + if (device_info_->GetEnableFP16() && runtime_->GetBuilder()->platformHasFastFp16()) { + MS_LOG(INFO) << "set fp16 flag successfully for tensorrt."; + config_->setFlag(nvinfer1::BuilderFlag::kFP16); + runtime_->SetRuntimePrecisionMode(RuntimePrecisionMode_FP16); + } + + // set int8 + if (IsInt8Mode() && runtime_->GetBuilder()->platformHasFastInt8()) { + MS_LOG(INFO) << "set int8 flag successfully for tensorrt."; + 
config_->setFlag(nvinfer1::BuilderFlag::kINT8); + // Mark calibrator as null + config_->setInt8Calibrator(nullptr); + input_hw_index_ = -1; + } else { + MS_LOG(INFO) << "inputs no quant params or platform not support int8."; + } + runtime_->SetCudaStream(stream); + config_->setProfileStream(stream); + stream_ = stream; + MS_LOG(INFO) << GetRankID() << " tensorrt subgraph stream: " << stream_; + + // config setMaxWorkspaceSize to 1152 MB for max limit + config_->setMaxWorkspaceSize(1152 * (1 << 20)); + return RET_OK; +} + +bool TensorRTSubGraph::IsInt8Mode() { + for (auto cur_op : all_ops_) { + if (cur_op->GetQuantType() == schema::QuantType_QUANT_ALL) { + return true; + } + } + return false; +} + +nvinfer1::ITensor *TensorRTSubGraph::SetTensorRTNetworkInput(const mindspore::MSTensor &in_tensor) { + for (int i = 0; i < ctx_->network()->getNbInputs(); i++) { + if (in_tensor.Name().compare(ctx_->network()->getInput(i)->getName()) == 0) { + MS_LOG(INFO) << "input tensor is already added in network: " << in_tensor.Name(); + return ctx_->network()->getInput(i); + } + } + + auto cuda_dtype = ConvertDataType(in_tensor.DataType()); + if (static_cast(cuda_dtype) == -1) { + MS_LOG(ERROR) << "Unsupported input data type " << static_cast(in_tensor.DataType()); + return nullptr; + } + nvinfer1::Dims input_dims = ParseInputDimsProfile(in_tensor); + MS_LOG(INFO) << "add network input: " << in_tensor.Name(); + return ctx_->network()->addInput(in_tensor.Name().c_str(), cuda_dtype, input_dims); +} + +nvinfer1::Dims TensorRTSubGraph::ParseInputDimsProfile(const mindspore::MSTensor &in_tensor) { + nvinfer1::Dims input_dims = ConvertCudaDims(in_tensor.Shape()); + if (profile_ == nullptr) { + MS_LOG(ERROR) << "profile is null."; + return input_dims; + } + if (runtime_->GetBatchSize() == 0) { + runtime_->SetBatchSize(input_dims.d[0]); + MS_LOG(INFO) << "batch size init as " << runtime_->GetBatchSize(); + if (input_batchsize_index_ != -1) { + input_dims.d[0] = -1; // dynamic batch size with wildcard N, default batchsize is first dims + input_batchsize_index_ = 0; + } + } else { + if (input_batchsize_index_ != -1) { + for (int n = 0; n < input_dims.nbDims; n++) { + if (input_dims.d[n] == runtime_->GetBatchSize()) { + runtime_->SetBatchSize(std::max(input_dims.d[0], runtime_->GetBatchSize())); + // first dims equals to batchsize + input_dims.d[n] = -1; + input_batchsize_index_ = n; + break; + } + } + } + } + // only support NHWC HW dim resize + if (input_hw_index_ != -1) { + MS_LOG(INFO) << "input tensor format is (NHWC:1, NCHW:0): " << in_tensor.format(); + input_hw_index_ = in_tensor.format() == Format::NHWC ? 
1 : 2; // NCHW is 2 + input_dims.d[input_hw_index_] = -1; + input_dims.d[input_hw_index_ + 1] = -1; + } + // We do not need to check the return of setDimension and addOptimizationProfile here as all dims are explicitly set + nvinfer1::Dims input_dims_min = ConvertCudaDims(in_tensor.Shape()); + if (input_batchsize_index_ != -1) { + input_dims_min.d[input_batchsize_index_] = 1; + if (input_hw_index_ != -1) { + input_dims_min.d[input_hw_index_] = 1; + input_dims_min.d[input_hw_index_ + 1] = 1; + } + } + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMIN, input_dims_min)) { + MS_LOG(ERROR) << "setDimensions of kMIN failed for " << in_tensor.Name(); + return input_dims; + } + nvinfer1::Dims input_dims_opt = ConvertCudaDims(in_tensor.Shape()); + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kOPT, input_dims_opt)) { + MS_LOG(ERROR) << "setDimensions of kOPT failed for " << in_tensor.Name(); + return input_dims; + } + nvinfer1::Dims input_dims_max = ConvertCudaDims(in_tensor.Shape()); + // input_dims_max should be the same with input network dims + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMAX, input_dims_max)) { + MS_LOG(ERROR) << "setDimensions of kMAX failed for " << in_tensor.Name(); + return input_dims; + } + return input_dims; +} + +int TensorRTSubGraph::ParseInputsProfile() { + MS_LOG(INFO) << "using serialied engine."; + for (auto in_tensor : inputs_) { + auto dim = ParseInputDimsProfile(in_tensor); + if (dim.nbDims <= 0) { + MS_LOG(ERROR) << "input dims is invalid."; + return RET_ERROR; + } + } + return RET_OK; +} + +int TensorRTSubGraph::BuildTensorRTGraph() { + MS_ASSERT(!all_ops_.empty()); + int ret; + if (engine_ != nullptr) { + return ParseInputsProfile(); + } + // build engine online + for (auto cur_op : all_ops_) { + cur_op->SetRuntime(runtime_); + for (auto in_tensor : cur_op->inputs()) { + // Data From CPU + if (IsSubGraphInputTensor(this->inputs(), in_tensor)) { + nvinfer1::ITensor *trt_tensor = SetTensorRTNetworkInput(in_tensor); + if (trt_tensor == nullptr) { + MS_LOG(ERROR) << "SetTensorRTNetworkInput failed for " << in_tensor.Name(); + return RET_ERROR; + } +#if TRT_VERSION_GE(7, 2) + // avoid bool input tensor + if (trt_tensor->getType() == nvinfer1::DataType::kBOOL) { + trt_tensor = TRTTensorCast(ctx_, trt_tensor, nvinfer1::DataType::kINT32, in_tensor.Name() + "_cast_int32"); + } +#endif + cur_op->AddInnerInTensors(ITensorHelper{trt_tensor, in_tensor.format(), true}); + continue; + } + + ITensorHelper trt_tensor = FindTensorRTInputs(cur_op, in_tensor); + if (trt_tensor.trt_tensor_ == nullptr) { + // weight tensor + if (IsCached(cur_op, in_tensor) && in_tensor.Data() != nullptr) { + ret = HandleCacheTensor(cur_op, in_tensor); + if (ret != RET_OK) { + MS_LOG(ERROR) << "HandleCacheTensor failed for " << in_tensor.Name(); + return RET_ERROR; + } + } else if (trt_specific_weight_nodes_.find(cur_op->type()) == trt_specific_weight_nodes_.end()) { + if (in_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "Weight Tensor data is nullptr."; + return RET_ERROR; + } + trt_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx_, in_tensor, cur_op->GetOpName()); + trt_tensor.format_ = Format::NHWC; + MS_LOG(INFO) << "auto convert constant tensor for: " << in_tensor.Name(); + cur_op->AddInnerInTensors(trt_tensor); + } + } else { + cur_op->AddInnerInTensors(trt_tensor); + } + } + MS_LOG(DEBUG) << "Parsing TensorRT op for " << cur_op->GetOpName(); + + ret = 
cur_op->AddInnerOp(ctx_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Add op failed in TensorRT network: " << cur_op->GetOpName(); + return RET_ERROR; + } + ret = cur_op->SetInt8DynamicRange(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Set Int8 dynamic range failed in TensorRT network: " << cur_op->GetOpName(); + return RET_ERROR; + } + } + ret = MarkOutputs(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "MarkOutputs failed in TensorRT network"; + return ret; + } + + std::string network_name = "network_" + std::string(ctx_->network()->getInput(0)->getName()) + "_" + + std::string(ctx_->network()->getOutput(0)->getName()); + ctx_->network()->setName(network_name.c_str()); + this->name_ = network_name; + ret = BuildEngine(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Create engine failed in TensorRT network"; + return ret; + } + return RET_OK; +} + +int TensorRTSubGraph::MarkOutputs() { + // Mark NetWork Output Tensor. + for (const auto &out_tensor : outputs_) { + for (auto out_op : this->out_ops_) { + for (size_t index = 0; index < out_op->outputs().size(); index++) { + if (out_op->outputs()[index] == out_tensor) { + MS_LOG(INFO) << "markOutput for: " << out_tensor.Name(); + nvinfer1::ITensor *out_trt_tensor = out_op->GetInnerOutTensor()[index].trt_tensor_; + if (out_op->GetInnerOutTensor()[index].trt_tensor_->getDimensions().nbDims == DIMENSION_4D && + out_op->GetInnerOutTensor()[index].format_ == Format::NCHW && + !SameDims(out_op->GetInnerOutTensor()[index].trt_tensor_->getDimensions(), out_tensor.Shape())) { + // transpose subgraph output from nchw to nhwc + nvinfer1::IShuffleLayer *transpose_layer_out = + NCHW2NHWC(ctx_, *out_op->GetInnerOutTensor()[index].trt_tensor_); + if (transpose_layer_out == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + transpose_layer_out->setName((out_tensor.Name() + "_transpose2NHWC").c_str()); + out_trt_tensor = transpose_layer_out->getOutput(0); + } + + out_trt_tensor->setName(out_tensor.Name().c_str()); + ctx_->network()->markOutput(*out_trt_tensor); + for (int n = 0; n < out_trt_tensor->getDimensions().nbDims; n++) { + if (out_trt_tensor->getDimensions().d[n] == -1) { + output_batchsize_index_ = n; + break; + } + } + } + } + } + } + return RET_OK; +} + +int TensorRTSubGraph::Prepare() { + int ret = lite::SetCudaDevice(device_info_); + if (ret != RET_OK) { + return ret; + } + if (this->engine_ == nullptr) { + MS_LOG(ERROR) << "engine_ is null in this builder_"; + return RET_ERROR; + } + this->trt_context_ = this->engine_->createExecutionContext(); + if (this->trt_context_ == nullptr) { + MS_LOG(ERROR) << "TensorRTSubGraph create context failed."; + return RET_ERROR; + } + int binding_num = this->engine_->getNbBindings(); + tensor_bindings_ = new (std::nothrow) void *[binding_num]; + if (tensor_bindings_ == nullptr) { + MS_LOG(ERROR) << "malloc tensor binding array failed."; + return RET_ERROR; + } + + for (auto tensor : inputs_) { + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor, tensor.DataSize()); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "malloc for inputs tensor device memory failed."; + return RET_ERROR; + } + int index = this->engine_->getBindingIndex(tensor.Name().c_str()); + tensor_bindings_[index] = device_ptr; + trt_in_tensor_name_.push_back(tensor.Name()); + nvinfer1::Dims input_dims = ConvertCudaDims(tensor.Shape()); + for (int od = 0; od < input_dims.nbDims; od++) { + MS_LOG(DEBUG) << "in tensor " << tensor.Name() << " dims at " << od << " is " << input_dims.d[od]; + } + + if 
(!this->trt_context_->setBindingDimensions(index, input_dims)) { + MS_LOG(ERROR) << "invalid input dims of " << tensor.Name(); + return RET_ERROR; + } + } + + // malloc for cache weight tensor + for (auto cache_tensor : cache_const_inputs_) { + size_t data_size = cache_mgr_->GetCacheDataSize(cache_tensor); + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(cache_tensor, data_size); + runtime_->GetAllocator()->MarkMemValid(cache_tensor.Name().c_str(), true); + int index = this->engine_->getBindingIndex(cache_tensor.Name().c_str()); + tensor_bindings_[index] = device_ptr; + auto cache_ret = cache_mgr_->SetDeviceCacheAddr(cache_tensor.Name(), device_ptr, data_size); + if (cache_ret != kSuccess) { + MS_LOG(ERROR) << "SetDeviceCacheAddr failed, cache tensor: " << cache_tensor.Name(); + return RET_ERROR; + } + } + + if (!this->trt_context_->allInputDimensionsSpecified()) { + MS_LOG(ERROR) << "input dims need to be specified."; + return RET_ERROR; + } + for (auto op : all_ops_) { + ret = op->Prepare(tensor_bindings_, engine_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "prepare op failed of " << op->GetOpName(); + return RET_ERROR; + } + } + for (auto tensor : outputs_) { + (void)tensor.MutableData(); + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(tensor, tensor.DataSize()); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "malloc for outputs tensor device memory failed."; + return RET_ERROR; + } + int index = this->engine_->getBindingIndex(tensor.Name().c_str()); + tensor_bindings_[index] = device_ptr; + trt_out_tensor_name_.push_back(tensor.Name()); + } + return RET_OK; +} + +int TensorRTSubGraph::ReSize() { + if (input_batchsize_index_ == -1) { + MS_LOG(ERROR) << "current network don't support resize."; + return RET_ERROR; + } + for (size_t i = 0; i < trt_in_tensor_name_.size(); i++) { + if (ctx_->network() != nullptr) { + for (int j = 0; j < ctx_->network()->getNbInputs(); j++) { + if (trt_in_tensor_name_[i].compare(ctx_->network()->getInput(j)->getName()) != 0) { + continue; + } + nvinfer1::Dims construct_dims = ctx_->network()->getInput(j)->getDimensions(); + bool ret = ValidInputResizeDims(construct_dims, inputs_[i].Shape()); + if (!ret) { + MS_LOG(ERROR) << "input resize shape is invalid."; + return RET_ERROR; + } + } + } + + MS_LOG(INFO) << "resize at input_batch_index " << input_batchsize_index_ << ", update batch size to " + << inputs_[i].Shape()[input_batchsize_index_]; + runtime_->SetBatchSize(inputs_[i].Shape()[input_batchsize_index_]); + + // inputs_ is dupulated by mindrt, name is untustable. 
+ auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(trt_in_tensor_name_[i], inputs_[i].DataSize(), + ConvertDataType(inputs_[i].DataType())); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "realloc for input tensor device memory failed."; + return RET_ERROR; + } + int index = this->engine_->getBindingIndex(trt_in_tensor_name_[i].c_str()); + tensor_bindings_[index] = device_ptr; + // Set actual input size + nvinfer1::Dims input_dims = ConvertCudaDims(inputs_[i].Shape()); + for (int od = 0; od < input_dims.nbDims; od++) { + MS_LOG(DEBUG) << "in tensor " << trt_in_tensor_name_[i] << " dims at " << od << " is " << input_dims.d[od]; + } + + if (!this->trt_context_->setBindingDimensions(index, input_dims)) { + MS_LOG(ERROR) << "invalid input dims of " << inputs_[i].Name(); + return RET_ERROR; + } + } + if (!this->trt_context_->allInputDimensionsSpecified()) { + MS_LOG(ERROR) << "input dims need to be specified."; + return RET_ERROR; + } + + for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) { + int index = this->engine_->getBindingIndex(trt_out_tensor_name_[i].c_str()); + auto device_ptr = runtime_->GetAllocator()->MallocDeviceMem(trt_out_tensor_name_[i], outputs_[i].DataSize(), + ConvertDataType(outputs_[i].DataType())); + if (device_ptr == nullptr) { + MS_LOG(ERROR) << "realloc for outputs tensor device memory failed."; + return RET_ERROR; + } + tensor_bindings_[index] = device_ptr; + } + return RET_OK; +} + +bool TensorRTSubGraph::ValidInputResizeDims(const nvinfer1::Dims &construct_dims, + const std::vector &resize_input_shape) { + if (static_cast(construct_dims.nbDims) != resize_input_shape.size()) { + MS_LOG(ERROR) << "invalid resize input."; + return false; + } + if (input_hw_index_ == -1) { + // only NHWC format support HW resize, otherwise only support batchsize resize + for (int d = 0; d < construct_dims.nbDims; d++) { + if (d != input_batchsize_index_ && construct_dims.d[d] != resize_input_shape[d]) { + MS_LOG(ERROR) << "only support dynamic batch size resize input."; + return false; + } + } + } else if ((input_hw_index_ == 1 && construct_dims.d[DIMENSION_3D] != resize_input_shape[DIMENSION_3D]) || + (input_hw_index_ == DIMENSION_2D && construct_dims.d[1] != resize_input_shape[1])) { + // input may be nhwc || nchw + MS_LOG(ERROR) << "don't support dynamic channel resize input."; + return false; + } + return true; +} + +int TensorRTSubGraph::Execute() { + int ret = lite::SetCudaDevice(device_info_); + if (ret != RET_OK) { + return ret; + } + for (size_t i = 0; i < inputs_.size(); i++) { + if (runtime_->GetAllocator()->GetMemIsValid(trt_in_tensor_name_[i])) { + MS_LOG(INFO) << "no need memcpy to cuda for input tensor: " << trt_in_tensor_name_[i]; + continue; + } + + auto iter = model_input_to_cache_tensors_.find(trt_in_tensor_name_[i]); + if (iter != model_input_to_cache_tensors_.end()) { + for (auto &cache_tensor : iter->second) { + ret = cache_mgr_->CacheHandle(cache_tensor.Name(), inputs_[i], + runtime_->GetAllocator()->GetDevicePtr(trt_in_tensor_name_[i])); + if (ret != RET_OK) { + MS_LOG(ERROR) << "handle cache failed " << trt_in_tensor_name_[i]; + return RET_ERROR; + } + runtime_->GetAllocator()->MarkMemValid(trt_in_tensor_name_[i], true); + MS_LOG(DEBUG) << cache_tensor.Name() << " CacheHandle succ " << trt_in_tensor_name_[i]; + } + continue; + } + + ret = runtime_->GetAllocator()->SyncMemInHostAndDevice(inputs_[i], trt_in_tensor_name_[i], true); + if (ret != RET_OK) { + MS_LOG(ERROR) << "sync mem from host to device failed for " << trt_in_tensor_name_[i]; + 
return ret; + } + runtime_->GetAllocator()->MarkMemValid(trt_in_tensor_name_[i], true); + } + + if (!this->trt_context_->executeV2(tensor_bindings_)) { + MS_LOG(ERROR) << "TensorRT execute failed."; + return RET_ERROR; + } + + for (size_t i = 0; i < trt_out_tensor_name_.size(); i++) { + int index = this->engine_->getBindingIndex(trt_out_tensor_name_[i].c_str()); + // actual output tensor dims + auto out_dims = this->trt_context_->getBindingDimensions(index); + std::vector new_shape = lite::ConvertMSShape(out_dims); + // batchsize resize need set new batch size + if (input_batchsize_index_ != -1) { + if (runtime_->GetBatchSize() != new_shape[output_batchsize_index_]) { + new_shape[output_batchsize_index_] = runtime_->GetBatchSize(); + } + } + for (int od = 0; od < out_dims.nbDims; od++) { + MS_LOG(DEBUG) << "out tensor " << trt_out_tensor_name_[i] << " dims at " << od << " is " << new_shape[od]; + } + outputs_[i].SetShape(new_shape); + + if (outputs_[i].MutableData() == nullptr) { + MS_LOG(ERROR) << "realloc for outputs tensor failed."; + return RET_ERROR; + } + runtime_->GetAllocator()->MarkMemValid(trt_out_tensor_name_[i], true); + int sync_ret = runtime_->GetAllocator()->SyncMemInHostAndDevice(outputs_[i], trt_out_tensor_name_[i], false); + if (sync_ret != RET_OK) { + MS_LOG(ERROR) << "sync mem from device to host failed for " << trt_out_tensor_name_[i]; + return sync_ret; + } + runtime_->GetAllocator()->MarkMemValid(trt_out_tensor_name_[i], false); + } + // make mem invalid, prepare for next execute + for (size_t i = 0; i < inputs_.size(); i++) { + runtime_->GetAllocator()->MarkMemValid(trt_in_tensor_name_[i], false); + } + return RET_OK; +} + +ITensorHelper TensorRTSubGraph::FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor) { + for (auto input_op : cur_op->in_ops()) { + for (size_t i = 0; i < input_op->outputs().size(); i++) { + auto out_tensor = input_op->outputs().at(i); + if (in_tensor.Name().compare(out_tensor.Name()) == 0) { + return input_op->GetInnerOutTensor().at(i); + } + } + } + return ITensorHelper{}; +} +bool TensorRTSubGraph::IsCached(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor) { + return cache_mgr_ != nullptr && cache_mgr_->IsCacheTensor(in_tensor); +} + +void TensorRTSubGraph::FindCacheTensorInfo(TensorRTOp *cur_op, mindspore::MSTensor device_cache_tensor) { + auto iter = network_cache_tensor_info_.find(cur_op->GetOpName()); + if (iter != network_cache_tensor_info_.end()) { + return; + } + std::queue front_ops; + front_ops.push(cur_op); + network_cache_tensor_info_[cur_op->GetOpName()].front_op_can_cache_ = true; + iter = network_cache_tensor_info_.find(cur_op->GetOpName()); + while (!front_ops.empty()) { + auto front_op = front_ops.front(); + iter->second.front_op_can_cache_ = CanOpCache(front_op) ? 
iter->second.front_op_can_cache_ : false; + for (auto in_tensor : front_op->inputs()) { + if (IsSubGraphInputTensor(this->inputs(), in_tensor)) { + iter->second.network_input_tensor_.push_back(in_tensor); + model_input_to_cache_tensors_[in_tensor.Name()].push_back(device_cache_tensor); + MS_LOG(DEBUG) << cur_op->GetOpName() << "'s network input tensor name is " << in_tensor.Name() + << ", can cache: " << iter->second.front_op_can_cache_; + } + } + for (auto fronts_op : front_op->in_ops()) { + front_ops.push(fronts_op); + } + front_ops.pop(); + } +} + +bool TensorRTSubGraph::CanOpCache(TensorRTOp *cur_op) { return true; } + +int TensorRTSubGraph::HandleCacheTensor(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor) { + FindCacheTensorInfo(cur_op, in_tensor); + // cache kernel weight tensor + cache_const_inputs_.push_back(in_tensor); + auto shape = cache_mgr_->GetCacheShape(in_tensor); + MS_LOG(INFO) << "auto add cache constant tensor for: " << in_tensor.Name(); + auto cuda_dtype = ConvertDataType(in_tensor.DataType()); + nvinfer1::Dims input_dims = ConvertCudaDims(shape); + nvinfer1::ITensor *cache_input = ctx_->network()->addInput(in_tensor.Name().c_str(), cuda_dtype, input_dims); + if (cache_input == nullptr) { + MS_LOG(ERROR) << "add cache Weight Tensor data is nullptr."; + return RET_ERROR; + } + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMIN, input_dims)) { + MS_LOG(ERROR) << "setDimensions of kMIN failed for " << in_tensor.Name(); + return RET_ERROR; + } + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kOPT, input_dims)) { + MS_LOG(ERROR) << "setDimensions of kOPT failed for " << in_tensor.Name(); + return RET_ERROR; + } + if (!profile_->setDimensions(in_tensor.Name().c_str(), nvinfer1::OptProfileSelector::kMAX, input_dims)) { + MS_LOG(ERROR) << "setDimensions of kMAX failed for " << in_tensor.Name(); + return RET_ERROR; + } + ITensorHelper trt_tensor{cache_input, Format::NHWC, true}; + cur_op->AddInnerInTensors(trt_tensor); + return RET_OK; +} +} // namespace mindspore::lite diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.h new file mode 100644 index 00000000000..7134f450e37 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_subgraph.h @@ -0,0 +1,159 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SUBGRAPH_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SUBGRAPH_H_ +#include +#include +#include +#include +#include +#include +#include "include/api/kernel.h" +#include "src/runtime/delegate/tensorrt/tensorrt_runtime.h" +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/tensorrt_serializer.h" +#include "src/runtime/delegate/tensorrt/op/tensorrt_op.h" +#include "src/runtime/delegate/parameter_cache/embedding_cache_manager.h" +#include "include/api/context.h" + +namespace mindspore::lite { +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +struct CacheTensorInfo { + std::vector network_input_tensor_; + bool front_op_can_cache_; +}; + +class TensorRTSubGraph : public kernel::Kernel { + public: + TensorRTSubGraph(std::vector ops, const std::vector &inputs, + const std::vector &outputs, const mindspore::Context *ctx, + std::shared_ptr device_info, TensorRTRuntime *runtime, bool support_resize, + bool support_hw_resize) + : kernel::Kernel(inputs, outputs, nullptr, ctx), + all_ops_(std::move(ops)), + device_info_(device_info), + runtime_(runtime) { + trt_specific_weight_nodes_ = { + schema::PrimitiveType_Conv2DFusion, schema::PrimitiveType_ReduceFusion, schema::PrimitiveType_Transpose, + schema::PrimitiveType_Gather, schema::PrimitiveType_Reshape, schema::PrimitiveType_PowFusion, + schema::PrimitiveType_AddFusion, schema::PrimitiveType_DivFusion, schema::PrimitiveType_SubFusion, + schema::PrimitiveType_MatMulFusion, schema::PrimitiveType_PowFusion, schema::PrimitiveType_Eltwise, + schema::PrimitiveType_ScaleFusion, schema::PrimitiveType_MulFusion, schema::PrimitiveType_Minimum, + schema::PrimitiveType_StridedSlice, schema::PrimitiveType_PadFusion, schema::PrimitiveType_FullConnection, + schema::PrimitiveType_Cast, schema::PrimitiveType_ExpandDims, schema::PrimitiveType_Resize, + schema::PrimitiveType_Maximum, schema::PrimitiveType_BiasAdd, schema::PrimitiveType_LSTM, + schema::PrimitiveType_RealDiv, schema::PrimitiveType_LayerNormFusion, schema::PrimitiveType_Greater, + schema::PrimitiveType_Less, schema::PrimitiveType_TopKFusion, schema::PrimitiveType_TileFusion, + schema::PrimitiveType_Equal}; + if (!support_resize) { + input_batchsize_index_ = -1; + input_hw_index_ = -1; + } + if (!support_hw_resize) { + input_hw_index_ = -1; + } + } + + ~TensorRTSubGraph() override; + + int Prepare() override; + + int Execute() override; + + int ReSize(); + + int BuildTensorRTGraph(); + + int Init(cudaStream_t stream); + + void SetCacheManager(const std::shared_ptr &cache_mgr) { cache_mgr_ = cache_mgr; } + + void SetSerializePath(const std::string &path) { serialize_file_path_ = std::move(path); } + + private: + int BuildEngine(); + + int SetDeviceConfig(cudaStream_t stream); + + bool IsInt8Mode(); + + bool SupportFP16(); + + nvinfer1::ITensor *SetTensorRTNetworkInput(const mindspore::MSTensor &in_tensor); + + ITensorHelper FindTensorRTInputs(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor); + + int MarkOutputs(); + + bool IsCached(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor); + + void FindCacheTensorInfo(TensorRTOp *cur_op, mindspore::MSTensor device_cache_tensor); + + bool CanOpCache(TensorRTOp *cur_op); + + int HandleCacheTensor(TensorRTOp *cur_op, const mindspore::MSTensor &in_tensor); + + nvinfer1::Dims ParseInputDimsProfile(const mindspore::MSTensor &in_tensor); + int ParseInputsProfile(); + + bool ValidInputResizeDims(const 
nvinfer1::Dims &construct_dims, const std::vector &resize_input_shape); + + std::vector all_ops_{}; + // subgraph input nodes. + std::vector in_ops_{}; + // subgraph output nodes. + std::vector out_ops_{}; + + void **tensor_bindings_{nullptr}; + + std::shared_ptr device_info_{nullptr}; + + TensorRTRuntime *runtime_{nullptr}; // all subgraph in one delegate share a runtime_ + + std::set trt_specific_weight_nodes_; + + // save in/out tensor name for subgraph isolate. + std::vector trt_in_tensor_name_; + std::vector trt_out_tensor_name_; + + std::vector cache_const_inputs_; + std::map network_cache_tensor_info_; + + nvinfer1::INetworkDefinition *network_{nullptr}; + nvinfer1::IBuilderConfig *config_{nullptr}; + nvinfer1::ICudaEngine *engine_{nullptr}; + nvinfer1::IExecutionContext *trt_context_{nullptr}; + nvinfer1::IOptimizationProfile *profile_{nullptr}; + + TensorRTContext *ctx_; + + // -1 means don't support resize + int input_batchsize_index_{0}; + int output_batchsize_index_{0}; + int input_hw_index_{0}; + + std::map> model_input_to_cache_tensors_; + + std::shared_ptr cache_mgr_{nullptr}; + + std::shared_ptr serializer_{nullptr}; + + std::string serialize_file_path_; + cudaStream_t stream_{nullptr}; +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_SUBGRAPH_H_ diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.cc b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.cc new file mode 100644 index 00000000000..1e43b4b5fd0 --- /dev/null +++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.cc @@ -0,0 +1,721 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include "src/runtime/delegate/tensorrt/tensorrt_utils.h" +#include "src/runtime/delegate/tensorrt/op/cast_plugin.h" +#include "src/runtime/delegate/tensorrt/distribution/distribution_collective.h" + +namespace mindspore::lite { +nvinfer1::Dims ConvertCudaDims(int data, size_t size) { + nvinfer1::Dims dims{}; + dims.nbDims = -1; + if (size > static_cast(dims.MAX_DIMS)) { + MS_LOG(ERROR) << "invalid shape size: " << size; + return dims; + } + dims.nbDims = size; + for (size_t i = 0; i < size; i++) { + dims.d[i] = data; + } + return dims; +} + +nvinfer1::Dims ConvertCudaDims(const void *data, int64_t size) { + nvinfer1::Dims dims{}; + dims.nbDims = -1; + if (size > static_cast(dims.MAX_DIMS)) { + MS_LOG(ERROR) << "invalid shape size: " << size; + return dims; + } + dims.nbDims = size; + const int *dims_data = static_cast(data); + for (int i = 0; i < size; i++) { + dims.d[i] = *(dims_data + i); + } + return dims; +} + +bool SameDims(nvinfer1::Dims dims, const std::vector &shape) { + if (dims.nbDims != static_cast(shape.size())) { + return false; + } + // dynamic dim, only channel dim know + for (int i = 0; i < dims.nbDims; i++) { + if (dims.d[i] == -1) { + continue; + } + if (dims.d[i] != shape[i]) { + return false; + } + } + return true; +} + +std::vector ConvertMSShape(const nvinfer1::Dims dims) { + std::vector shape; + for (int i = 0; i < dims.nbDims; i++) { + shape.push_back(dims.d[i]); + } + return shape; +} + +std::vector NHWC2NCHW(std::vector nhwc_shape) { + std::vector nchw_shape; + if (nhwc_shape.size() != DIMENSION_4D) { + return nhwc_shape; + } + nchw_shape.push_back(nhwc_shape[kNHWC_N]); + nchw_shape.push_back(nhwc_shape[kNHWC_C]); + nchw_shape.push_back(nhwc_shape[kNHWC_H]); + nchw_shape.push_back(nhwc_shape[kNHWC_W]); + return nchw_shape; +} + +nvinfer1::IShuffleLayer *SetTranspose(TensorRTContext *ctx, const nvinfer1::ITensor &input, + nvinfer1::Permutation permutation) { + nvinfer1::IShuffleLayer *layer = ctx->network()->addShuffle(const_cast(input)); + if (layer == nullptr) { + MS_LOG(ERROR) << "failed to create ShuffleLayer when create transpose op."; + return nullptr; + } + layer->setFirstTranspose(permutation); + return layer; +} + +nvinfer1::DataType ConvertDataType(DataType type_id) { + std::map data_type_map = { +#if TRT_VERSION_GE(7, 2) + {DataType::kNumberTypeBool, nvinfer1::DataType::kBOOL}, +#endif + {DataType::kNumberTypeInt8, nvinfer1::DataType::kINT8}, + {DataType::kNumberTypeInt32, nvinfer1::DataType::kINT32}, + {DataType::kNumberTypeFloat32, nvinfer1::DataType::kFLOAT}, + {DataType::kNumberTypeFloat16, nvinfer1::DataType::kHALF}, + }; + auto iter = data_type_map.find(type_id); + nvinfer1::DataType data_type; + if (iter != data_type_map.end()) { + data_type = iter->second; + } else { + data_type = nvinfer1::DataType::kFLOAT; + MS_LOG(WARNING) << "invalid data_type for TensorRT, need check: " << static_cast(type_id); + } + return data_type; +} + +cudaDataType ConvertDataType(nvinfer1::DataType type_id) { + std::map data_type_map = { + {nvinfer1::DataType::kINT8, CUDA_R_8I}, + {nvinfer1::DataType::kINT32, CUDA_R_32I}, + {nvinfer1::DataType::kFLOAT, CUDA_R_32F}, + {nvinfer1::DataType::kHALF, CUDA_R_16F}, + }; + auto iter = data_type_map.find(type_id); + cudaDataType data_type; + if (iter != data_type_map.end()) { + data_type = iter->second; + } else { + data_type = CUDA_R_32F; + MS_LOG(WARNING) << "invalid data_type for TensorRT, need check: " << static_cast(type_id); + } + return data_type; +} + 
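SetTranspose above only records a permutation on an IShuffleLayer; the shape effect is the usual gather of input dims by the permutation order, where output dim i takes input dim order[i]. A small standalone check of the two permutations used by the layout helpers here ({0, 3, 1, 2} for NHWC to NCHW and {0, 2, 3, 1} back); plain C++, illustrative only.

// Standalone check of the NHWC<->NCHW permutations used by the shuffle helpers.
#include <array>
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<int> Permute(const std::vector<int> &shape, const std::array<int, 4> &order) {
  std::vector<int> out(shape.size());
  for (size_t i = 0; i < shape.size(); ++i) {
    out[i] = shape[order[i]];  // output dim i takes input dim order[i]
  }
  return out;
}

int main() {
  const std::vector<int> nhwc{1, 224, 224, 3};
  const auto nchw = Permute(nhwc, {0, 3, 1, 2});  // NHWC -> NCHW
  assert((nchw == std::vector<int>{1, 3, 224, 224}));
  const auto back = Permute(nchw, {0, 2, 3, 1});  // NCHW -> NHWC
  assert(back == nhwc);
  return 0;
}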
+nvinfer1::IShuffleLayer *NHWC2NCHW(TensorRTContext *ctx, const nvinfer1::ITensor &input) { + // NHWC 0123 NCHW 0312 + nvinfer1::Permutation perm{{0, 3, 1, 2}}; + return SetTranspose(ctx, input, perm); +} + +nvinfer1::IShuffleLayer *NCHW2NHWC(TensorRTContext *ctx, const nvinfer1::ITensor &input) { + // NCHW 0123 NHWC 0231 + nvinfer1::Permutation perm{{0, 2, 3, 1}}; + return SetTranspose(ctx, input, perm); +} + +nvinfer1::ITensor *ConvertConstantTensor(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor, + const std::string &op_name) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; + return nullptr; + } + nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); + if (dims.nbDims == -1) { + MS_LOG(WARNING) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; + dims.nbDims = 1; + dims.d[0] = 1; + } + nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); + if (ms_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); + return nullptr; + } + nvinfer1::Weights weights{data_type, ms_tensor.Data().get(), ms_tensor.ElementNum()}; + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); + return constant_tensor->getOutput(0); +} + +nvinfer1::ITensor *ConvertScalarToITensor(TensorRTContext *ctx, size_t shape_size, const void *value, + const DataType data_type, const std::string &op_name) { + nvinfer1::Dims dims = ConvertCudaDims(1, shape_size); + if (dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name; + return nullptr; + } + nvinfer1::Weights weights{ConvertDataType(data_type), value, 1}; + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, op_name + "_constant"); + return constant_tensor->getOutput(0); +} + +std::experimental::optional TryConvertActivationType(schema::ActivationType activation_type) { + std::map action_map = { + {schema::ActivationType_RELU, ActivationParams{nvinfer1::ActivationType::kRELU, false, 0, false, 0}}, + {schema::ActivationType_SIGMOID, ActivationParams{nvinfer1::ActivationType::kSIGMOID, false, 0, false, 0}}, + {schema::ActivationType_TANH, ActivationParams{nvinfer1::ActivationType::kTANH, false, 0, false, 0}}, + {schema::ActivationType_LEAKY_RELU, ActivationParams{nvinfer1::ActivationType::kLEAKY_RELU, true, 0, false, 0}}, + {schema::ActivationType_ELU, ActivationParams{nvinfer1::ActivationType::kELU, true, 0, false, 0}}, + {schema::ActivationType_SELU, ActivationParams{nvinfer1::ActivationType::kSELU, true, 0, true, 0}}, + {schema::ActivationType_SOFTSIGN, ActivationParams{nvinfer1::ActivationType::kSOFTSIGN, false, 0, false, 0}}, + {schema::ActivationType_SOFTPLUS, ActivationParams{nvinfer1::ActivationType::kSOFTPLUS, true, 0, true, 0}}, + {schema::ActivationType_THRESHOLDRELU, + ActivationParams{nvinfer1::ActivationType::kTHRESHOLDED_RELU, true, 0, false, 0}}, + {schema::ActivationType_RELU6, ActivationParams{nvinfer1::ActivationType::kCLIP, true, 0, true, 6}}, + {schema::ActivationType_RELU1, ActivationParams{nvinfer1::ActivationType::kCLIP, true, 0, 
true, 1}}, + {schema::ActivationType_HARD_TANH, ActivationParams{nvinfer1::ActivationType::kCLIP, true, -1, true, 1}}, + // using plugin + {schema::ActivationType_GELU, ActivationParams{nvinfer1::ActivationType::kTHRESHOLDED_RELU, false, 0, false, 0}}, + {schema::ActivationType_SWISH, ActivationParams{nvinfer1::ActivationType::kTHRESHOLDED_RELU, false, 0, false, 0}}}; + return action_map.find(activation_type) != action_map.end() + ? std::experimental::optional(action_map[activation_type]) + : std::experimental::nullopt; +} + +void AlignShapeRank(std::vector *in_shape_ptr, const std::vector &out_shape) { + const size_t last_dim = in_shape_ptr->size() - 1; + const int in_rank = in_shape_ptr->size(); + int index = out_shape.size() - 1; + for (; index >= 0; index--) { + if (out_shape[index] == in_shape_ptr->at(last_dim)) { + break; + } + } + const int align_rank = index + 1; + if (index <= 0 || align_rank == in_rank) return; + for (int i = 0; i < index + 1 - in_rank; i++) { + in_shape_ptr->insert(in_shape_ptr->begin(), 1); + } +} + +nvinfer1::ITensor *ConvertTensorWithExpandDims(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor, + const std::vector &expect_shape, const std::string &op_name) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "network is null for ConvertTensorWithExpandDims"; + return nullptr; + } + auto origin_shape = ms_tensor.Shape(); + std::vector convert_shape(expect_shape); + AlignShapeRank(&origin_shape, convert_shape); + size_t origin_index = 0; + for (size_t i = 0; i < convert_shape.size(); ++i) { + if (origin_index >= origin_shape.size()) { + convert_shape[i] = 1; + continue; + } + if (origin_shape[origin_index] != convert_shape[i]) { + convert_shape[i] = origin_shape[origin_index]; + } + origin_index++; + } + if (ms_tensor.ElementNum() != + std::accumulate(convert_shape.begin(), convert_shape.end(), 1, std::multiplies())) { + MS_LOG(ERROR) << "ExpandDims failed for " << op_name; + return nullptr; + } + nvinfer1::Dims dims = ConvertCudaDims(convert_shape); + if (dims.nbDims == -1) { + MS_LOG(ERROR) << "ConvertCudaDims failed for " << op_name; + return nullptr; + } + nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); + if (ms_tensor.Data() == nullptr) { + MS_LOG(ERROR) << "ConvertTensorWithExpandDims from a MSTensor with nullptr data"; + return nullptr; + } + nvinfer1::Weights weights{data_type, ms_tensor.Data().get(), ms_tensor.ElementNum()}; + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); + return constant_tensor->getOutput(0); +} + +nvinfer1::ITensor *ConvertConstantTensorWithDims(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor, + const std::vector &expect_shape, const std::string &op_name) { + nvinfer1::ITensor *constant_input{nullptr}; + std::string tensor_name = op_name + "_" + ms_tensor.Name(); + if (ms_tensor.Shape().size() == 0 || ms_tensor.ElementNum() == 1) { + constant_input = + lite::ConvertScalarToITensor(ctx, expect_shape.size(), ms_tensor.Data().get(), ms_tensor.DataType(), tensor_name); + if (constant_input == nullptr) { + MS_LOG(ERROR) << "create Itensor from scalar tensor failed: " << tensor_name; + return nullptr; + } + } else if (ms_tensor.Shape().size() == expect_shape.size()) { + constant_input = lite::ConvertConstantTensor(ctx, ms_tensor, tensor_name); + if 
(constant_input == nullptr) { + MS_LOG(ERROR) << "create Itensor from constant tensor failed: " << tensor_name; + return nullptr; + } + } else if (ms_tensor.ElementNum() >= 1) { + constant_input = ConvertTensorWithExpandDims(ctx, ms_tensor, expect_shape, tensor_name); + if (constant_input == nullptr) { + MS_LOG(ERROR) << "create Itensor from ConvertTensorWithExpandDims failed: " << tensor_name; + return nullptr; + } + } else { + MS_LOG(ERROR) << "const tensor value needs check: " << tensor_name; + } + return constant_input; +} + +nvinfer1::Weights TransposeWeight4D(const mindspore::MSTensor &ms_tensor, void **pack_weight) { + // usage notice: malloc addr saved to pack_weight, save pack_weight ptr and free it when deconstruct + nvinfer1::Weights weights{}; + weights.count = ms_tensor.ElementNum(); + auto weight_shape = ms_tensor.Shape(); + if (weight_shape.size() != DIMENSION_4D) { + MS_LOG(ERROR) << ms_tensor.Name() << " dims is " << weight_shape.size(); + return weights; + } + if (ms_tensor.Data() == nullptr) { + MS_LOG(ERROR) << ms_tensor.Name() << " has null data"; + return weights; + } + void *pack_weight_tmp = malloc(ms_tensor.DataSize()); + if (pack_weight_tmp == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return weights; + } + *pack_weight = pack_weight_tmp; + weights.values = pack_weight_tmp; + + switch (ms_tensor.DataType()) { + case DataType::kNumberTypeFloat16: { + weights.type = nvinfer1::DataType::kHALF; + PackNHWCToNCHWFp16(ms_tensor.Data().get(), pack_weight_tmp, weight_shape[0], weight_shape[1] * weight_shape[2], + weight_shape[3], 0, 0); + break; + } + case DataType::kNumberTypeFloat32: { + weights.type = nvinfer1::DataType::kFLOAT; + PackNHWCToNCHWFp32(ms_tensor.Data().get(), pack_weight_tmp, weight_shape[0], weight_shape[1] * weight_shape[2], + weight_shape[3], 0, 0); + break; + } + default: { + MS_LOG(ERROR) << ms_tensor.Name() << " has unsupported tensor datatype for transpose data : " + << static_cast(ms_tensor.DataType()); + } + } + return weights; +} + +nvinfer1::Weights TransposeWeight2D(const mindspore::MSTensor &ms_tensor, void **pack_weight) { + // usage notice: malloc addr saved to pack_weight, save pack_weight ptr and free it when deconstruct + nvinfer1::Weights weights{}; + weights.count = ms_tensor.ElementNum(); + auto weight_shape = ms_tensor.Shape(); + if (weight_shape.size() != DIMENSION_2D) { + MS_LOG(ERROR) << ms_tensor.Name() << " dims is " << weight_shape.size(); + return weights; + } + if (ms_tensor.Data() == nullptr) { + MS_LOG(ERROR) << ms_tensor.Name() << " has null data"; + return weights; + } + void *pack_weight_tmp = malloc(ms_tensor.DataSize()); + if (pack_weight_tmp == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return weights; + } + *pack_weight = pack_weight_tmp; + weights.values = pack_weight_tmp; + + int row = weight_shape[0]; + int col = weight_shape[1]; + + switch (ms_tensor.DataType()) { + case DataType::kNumberTypeFloat16: { + weights.type = nvinfer1::DataType::kHALF; + auto src = static_cast(ms_tensor.Data().get()); + auto dst = static_cast(pack_weight_tmp); + for (int r = 0; r < row; ++r) { + for (int c = 0; c < col; ++c) { + dst[c * row + r] = src[r * col + c]; + } + } + break; + } + case DataType::kNumberTypeFloat32: { + weights.type = nvinfer1::DataType::kFLOAT; + auto dst = static_cast(pack_weight_tmp); + auto src = static_cast(ms_tensor.Data().get()); + for (int r = 0; r < row; ++r) { + for (int c = 0; c < col; ++c) { + dst[c * row + r] = src[r * col + c]; + } + } + break; + } + default: { + 
MS_LOG(ERROR) << ms_tensor.Name() << " has unsupported tensor datatype for transpose data : " + << static_cast(ms_tensor.DataType()); + } + } + return weights; +} + +nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor) { + nvinfer1::Weights weights{}; + weights.type = ConvertDataType(ms_tensor.DataType()); + weights.values = ms_tensor.Data().get(); + weights.count = ms_tensor.ElementNum(); + if (weights.values == nullptr) { + MS_LOG(ERROR) << "ConvertWeight from a MSTensor with nullptr data"; + } + return weights; +} + +nvinfer1::ITensor *TRTTensorCast(TensorRTContext *ctx, nvinfer1::ITensor *trt_tensor, nvinfer1::DataType data_type, + const std::string &name) { +#if TRT_VERSION_GE(7, 2) + data_type == nvinfer1::DataType::kBOOL ? nvinfer1::DataType::kINT32 : data_type; + auto cast_layer = ctx->network()->addIdentity(*trt_tensor); +#else + auto plugin = std::make_shared(name, trt_tensor->getType(), data_type); + nvinfer1::ITensor *inputTensors[] = {trt_tensor}; + nvinfer1::IPluginV2Layer *cast_layer = ctx->network()->addPluginV2(inputTensors, 1, *plugin); +#endif + if (cast_layer == nullptr) { + MS_LOG(ERROR) << "create cast layer failed for: " << name; + return nullptr; + } +#if TRT_VERSION_GE(7, 2) + cast_layer->setOutputType(0, data_type); +#endif + cast_layer->setName(name.c_str()); + nvinfer1::ITensor *cast_out = cast_layer->getOutput(0); + cast_out->setName((name + "_output").c_str()); + return cast_out; +} + +int SetCudaDevice(std::shared_ptr device_info_) { + return SetCudaDevice(static_cast(device_info_->GetDeviceID())); +} + +int SetCudaDevice(int device_id) { + int device = 0; + auto ret = cudaGetDevice(&device); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaGetDevice failed, device is untrustable. error code: " << ret; + return RET_ERROR; + } + int set_device_id = device_id; + int deviceCnt = 0; + + ret = cudaGetDeviceCount(&deviceCnt); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaGetDeviceCount failed."; + return RET_ERROR; + } + + if (set_device_id > deviceCnt - 1) { + MS_LOG(ERROR) << "invalid input device id as " << set_device_id << " for current device count " << deviceCnt; + return RET_ERROR; + } + if (device != set_device_id) { + ret = cudaSetDevice(set_device_id); + if (ret != cudaSuccess) { + MS_LOG(ERROR) << "cudaSetDevice failed, error code: " << ret; + return RET_ERROR; + } + } + if (cudaGetDevice(&device) != cudaSuccess) { + MS_LOG(ERROR) << "cudaGetDevice failed, device is untrustable."; + return RET_ERROR; + } + MS_LOG(DEBUG) << "cuda is running on device: " << device; + return RET_OK; +} + +Format GetOutputFormat(Format input_format, nvinfer1::Permutation perm) { + if (input_format == Format::NHWC) { + if (perm.order[kNHWC_N] == kNHWC_N && perm.order[kNHWC_H] == kNHWC_C && perm.order[kNHWC_W] == kNHWC_W && + perm.order[kNHWC_C] == kNHWC_H) { + return Format::NCHW; + } + } else if (input_format == Format::NCHW) { + if (perm.order[kNCHW_N] == kNCHW_N && perm.order[kNCHW_C] == kNCHW_H && perm.order[kNCHW_H] == kNCHW_W && + perm.order[kNCHW_W] == kNCHW_C) { + return Format::NHWC; + } + } + MS_LOG(WARNING) << "transpose out format needs to check for " << input_format; + return input_format; +} +int ConvertAxisFromNHWC2NCHW(int nhwc_axis) { + // N0H1W2C3->N0C1H2W3 + if (nhwc_axis > kNHWC_C) { + return nhwc_axis; + } + switch (nhwc_axis) { + case kNHWC_N: + return kNCHW_N; + case kNHWC_H: + return kNCHW_H; + case kNHWC_W: + return kNCHW_W; + case kNHWC_C: + return kNCHW_C; + default: + MS_LOG(ERROR) << "invalid input axis for nhwc: " << 
nhwc_axis; + } + return nhwc_axis; +} + +void PackNHWCToNCHWFp16(const void *src, void *dst, size_t batches, size_t plane, size_t channel, size_t task_id, + size_t thread_count) { + size_t hw8 = plane / C8NUM; + size_t task_start = 0; + size_t task_end = plane; + if (thread_count > 0) { + size_t offset_hw = UP_DIV(hw8, thread_count) * C8NUM; + task_start = offset_hw * task_id; + size_t count = plane - task_start; + if (count == 0) { + return; + } + task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw); + hw8 = task_start + ((task_end - task_start) >= offset_hw ? offset_hw : 0); + } else { + hw8 *= C8NUM; + } + size_t c8 = channel / C8NUM * C8NUM; + size_t batch = plane * channel; + for (size_t n = 0; n < batches; n++) { + const uint16_t *src_batch = static_cast(src) + n * batch; + uint16_t *dst_batch = static_cast(dst) + n * batch; + size_t hw = task_start; + for (; hw < hw8; hw += C8NUM) { + size_t c = 0; + for (; c < c8; c += C8NUM) { + const uint16_t *src_ptr = src_batch + hw * channel + c; + uint16_t *dst_ptr = dst_batch + c * plane + hw; + for (size_t tr = 0; tr < C8NUM; tr++) { + for (size_t tc = 0; tc < C8NUM; tc++) { + dst_ptr[tc * plane + tr] = src_ptr[tr * channel + tc]; + } + } + } + for (; c < channel; c++) { + const uint16_t *src_ptr = src_batch + hw * channel + c; + uint16_t *dst_ptr = dst_batch + c * plane + hw; + for (size_t i = 0; i < C8NUM; i++) { + dst_ptr[i] = src_ptr[i * channel]; + } + } + } + for (; hw < task_end; hw++) { + const uint16_t *src_ptr = src_batch + hw * channel; + uint16_t *dst_ptr = dst_batch + hw; + for (size_t i = 0; i < channel; i++) { + dst_ptr[i * plane] = src_ptr[i]; + } + } + } +} +std::string GetTensorFormat(nvinfer1::ITensor *trt_tensor, mindspore::Format format, bool is_same) { + nvinfer1::Dims dims = trt_tensor->getDimensions(); + std::string is_same_string = is_same ? " is same with ms tensor " : " is different from ms tensor "; + std::string out_string = "tensor " + std::string(trt_tensor->getName()) + ": format (NHWC:1, NCHW:0) is " + + std::to_string(static_cast(format)) + is_same_string + ", dims is "; + std::string dim_string = "["; + for (int i = 0; i < dims.nbDims; i++) { + dim_string += std::to_string(dims.d[i]); + if (i != dims.nbDims - 1) { + dim_string += ", "; + } + } + dim_string += "]"; + out_string += dim_string; + return out_string; +} + +std::string GetTensorFormat(ITensorHelper tensor_helper) { + return GetTensorFormat(tensor_helper.trt_tensor_, tensor_helper.format_, tensor_helper.same_format_); +} + +std::string GetTensorFormat(nvinfer1::ITensor *trt_tensor) { return GetTensorFormat(trt_tensor, Format::NHWC, true); } + +std::experimental::optional TryConvertTRTReduceMode(schema::ReduceMode mode) { + std::map reduce_ops_ = { + {schema::ReduceMode::ReduceMode_ReduceMean, nvinfer1::ReduceOperation::kAVG}, + {schema::ReduceMode::ReduceMode_ReduceMax, nvinfer1::ReduceOperation::kMAX}, + {schema::ReduceMode::ReduceMode_ReduceMin, nvinfer1::ReduceOperation::kMIN}, + {schema::ReduceMode::ReduceMode_ReduceProd, nvinfer1::ReduceOperation::kPROD}, + {schema::ReduceMode::ReduceMode_ReduceL2, nvinfer1::ReduceOperation::kSUM}, + {schema::ReduceMode::ReduceMode_ReduceSum, nvinfer1::ReduceOperation::kSUM}, + }; + return reduce_ops_.find(mode) != reduce_ops_.end() + ? 
+           ? std::experimental::optional<nvinfer1::ReduceOperation>(reduce_ops_[mode])
+           : std::experimental::nullopt;
+}
+int PreprocessInputs2SameDim(TensorRTContext *ctx, const ITensorHelper &input_tensor_helper,
+                             ITensorHelper *out_tensor_helper) {
+  out_tensor_helper->trt_tensor_ = input_tensor_helper.trt_tensor_;
+  out_tensor_helper->format_ = input_tensor_helper.format_;
+  out_tensor_helper->same_format_ = true;
+  if (input_tensor_helper.trt_tensor_->getDimensions().nbDims == DIMENSION_4D && !input_tensor_helper.same_format_) {
+    if (input_tensor_helper.format_ == Format::NCHW) {
+      // transpose: NCHW->NHWC
+      nvinfer1::IShuffleLayer *transpose_layer_in = NCHW2NHWC(ctx, *input_tensor_helper.trt_tensor_);
+      if (transpose_layer_in == nullptr) {
+        MS_LOG(ERROR) << "op action convert failed";
+        return RET_ERROR;
+      }
+      transpose_layer_in->setName(
+        (std::string(input_tensor_helper.trt_tensor_->getName()) + "_input_transpose2NHWC").c_str());
+      out_tensor_helper->trt_tensor_ = transpose_layer_in->getOutput(0);
+      out_tensor_helper->format_ = Format::NHWC;
+    } else {
+      // transpose: NHWC->NCHW
+      nvinfer1::IShuffleLayer *transpose_layer_in = NHWC2NCHW(ctx, *input_tensor_helper.trt_tensor_);
+      if (transpose_layer_in == nullptr) {
+        MS_LOG(ERROR) << "op action convert failed";
+        return RET_ERROR;
+      }
+      transpose_layer_in->setName(
+        (std::string(input_tensor_helper.trt_tensor_->getName()) + "_input_transpose2NCHW").c_str());
+      out_tensor_helper->trt_tensor_ = transpose_layer_in->getOutput(0);
+      out_tensor_helper->format_ = Format::NCHW;
+    }
+  }
+  return RET_OK;
+}
+
+int GetDimsVolume(const nvinfer1::Dims &dims) {
+  if (dims.nbDims <= 0) {
+    return 0;
+  }
+  return std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int64_t>());
+}
+
+int GetDimsVolume(const std::vector<int64_t> &shape) {
+  if (shape.size() == 0) {
+    return 0;
+  }
+  return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
+}
+
+std::experimental::optional<nvinfer1::Dims> SqueezeDims(const nvinfer1::Dims &in_dims, int pos) {
+  if (in_dims.nbDims <= 1) {
+    MS_LOG(ERROR) << "invalid shape size: " << in_dims.nbDims << " for squeeze.";
+    return {};
+  }
+  nvinfer1::Dims out_dims;
+  int i = 0;
+  for (int j = 0; j <= in_dims.nbDims; ++j) {
+    if (j != pos) {
+      out_dims.d[i++] = in_dims.d[j];
+    }
+  }
+  out_dims.nbDims = in_dims.nbDims - 1;
+  return std::experimental::optional<nvinfer1::Dims>(out_dims);
+}
+
+std::experimental::optional<nvinfer1::Dims> UnsqueezeDims(const nvinfer1::Dims &in_dims, int pos, int val) {
+  if (in_dims.nbDims >= static_cast<int>(in_dims.MAX_DIMS)) {
+    MS_LOG(ERROR) << "invalid shape size: " << in_dims.nbDims << " for unsqueeze.";
+    return {};
+  }
+  nvinfer1::Dims out_dims;
+  int i = 0;
+  for (int j = 0; j <= in_dims.nbDims; ++j) {
+    if (j == pos) {
+      out_dims.d[j] = val;
+    } else {
+      out_dims.d[j] = in_dims.d[i++];
+    }
+  }
+  out_dims.nbDims = in_dims.nbDims + 1;
+  return std::experimental::optional<nvinfer1::Dims>(out_dims);
+}
+
+int ParseData2Vector(const mindspore::MSTensor &ms_tensor, std::vector<float> *dst) {
+  if (ms_tensor.Data() == nullptr) {
+    MS_LOG(ERROR) << "ignore tensor: " << ms_tensor.Name();
+    return RET_ERROR;
+  }
+  dst->clear();
+  dst->resize(ms_tensor.ElementNum());
+  switch (ms_tensor.DataType()) {
+    case DataType::kNumberTypeInt64: {
+      Data2Vector<int64_t>(dst, ms_tensor.Data().get());
+      break;
+    }
+    case DataType::kNumberTypeInt32: {
+      Data2Vector<int>(dst, ms_tensor.Data().get());
+      break;
+    }
+    default: {
+      MS_LOG(ERROR) << ms_tensor.Name() << " has more datatype to parse";
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
+nvinfer1::ITensor *Reshape(TensorRTContext *ctx, nvinfer1::ITensor *input, const std::vector<int64_t> &shape) {
+  return Reshape(ctx, input, ConvertCudaDims(shape));
+}
+
+nvinfer1::ITensor *Reshape(TensorRTContext *ctx, nvinfer1::ITensor *input, const nvinfer1::Dims &shape) {
+  auto reshape_layer = ctx->network()->addShuffle(*input);
+  if (reshape_layer == nullptr) {
+    MS_LOG(ERROR) << "add reshape_layer failed";
+    return nullptr;
+  }
+  reshape_layer->setReshapeDimensions(shape);
+  return reshape_layer->getOutput(0);
+}
+} // namespace mindspore::lite
diff --git a/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.h b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.h
new file mode 100644
index 00000000000..ad2bd3d3889
--- /dev/null
+++ b/mindspore/lite/src/runtime/delegate/tensorrt/tensorrt_utils.h
@@ -0,0 +1,184 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_UTILS_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_TENSORRT_UTILS_H_
+#include
+#include
+#include
+#include
+#include
+#include
+#include "src/runtime/delegate/tensorrt/tensorrt_context.h"
+#include "src/runtime/delegate/tensorrt/cuda_impl/cublas_utils.h"
+#include "mindspore/core/ir/dtype/type_id.h"
+#include "schema/ops_generated.h"
+#include "nnacl/pack.h"
+#include "include/api/context.h"
+
+#define kNCHW_N 0
+#define kNCHW_C 1
+#define kNCHW_H 2
+#define kNCHW_W 3
+#define kNHWC_N 0
+#define kNHWC_H 1
+#define kNHWC_W 2
+#define kNHWC_C 3
+
+namespace mindspore::lite {
+#define TRT_VERSION_GE(major, minor) \
+  (NV_TENSORRT_MAJOR > major) || ((NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR >= minor))
+#define TRT_VERSION_LS(major, minor) \
+  (NV_TENSORRT_MAJOR < major) || ((NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR < minor))
+struct ITensorHelper {
+  nvinfer1::ITensor *trt_tensor_{nullptr};
+  mindspore::Format format_{Format::NHWC};
+  bool same_format_{true};
+};
+struct ActivationParams {
+  nvinfer1::ActivationType activation_type;
+  bool has_alpha;
+  float alpha;
+  bool has_beta;
+  float beta;
+};
+
+typedef union float32_bits {
+  unsigned int u;
+  float f;
+} float32_bits;
+
+// Convert Tensor data to Cuda dims.
+nvinfer1::Dims ConvertCudaDims(const void *data, int64_t size);
+
+nvinfer1::Dims ConvertCudaDims(int data, size_t size);
+
+bool SameDims(nvinfer1::Dims dims, const std::vector<int64_t> &shape);
+
+std::vector<int64_t> ConvertMSShape(const nvinfer1::Dims dims);
+
+std::vector<int64_t> NHWC2NCHW(std::vector<int64_t> nhwc_shape);
+
+nvinfer1::DataType ConvertDataType(DataType type_id);
+
+cudaDataType ConvertDataType(nvinfer1::DataType type_id);
+
+nvinfer1::IShuffleLayer *NHWC2NCHW(TensorRTContext *ctx, const nvinfer1::ITensor &input);
+
+nvinfer1::IShuffleLayer *NCHW2NHWC(TensorRTContext *ctx, const nvinfer1::ITensor &input);
+
+std::experimental::optional<ActivationParams> TryConvertActivationType(schema::ActivationType activation_type);
+
+nvinfer1::ITensor *ConvertConstantTensor(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor,
+                                         const std::string &op_name);
+
+nvinfer1::ITensor *ConvertTensorWithExpandDims(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor,
+                                               const std::vector<int64_t> &expect_shape, const std::string &op_name);
+
+nvinfer1::ITensor *ConvertScalarToITensor(TensorRTContext *ctx, size_t shape_size, const void *value,
+                                          const DataType data_type, const std::string &op_name);
+
+nvinfer1::ITensor *ConvertConstantTensorWithDims(TensorRTContext *ctx, const mindspore::MSTensor &ms_tensor,
+                                                 const std::vector<int64_t> &expect_shape, const std::string &op_name);
+
+nvinfer1::Weights TransposeWeight4D(const mindspore::MSTensor &ms_tensor, void **pack_weight);
+
+nvinfer1::Weights TransposeWeight2D(const mindspore::MSTensor &ms_tensor, void **pack_weight);
+
+nvinfer1::Weights ConvertWeight(const mindspore::MSTensor &ms_tensor);
+
+nvinfer1::ITensor *TRTTensorCast(TensorRTContext *ctx, nvinfer1::ITensor *tensor, nvinfer1::DataType data_type,
+                                 const std::string &name);
+
+int SetCudaDevice(std::shared_ptr<GPUDeviceInfo> device_info_);
+
+int SetCudaDevice(int device_id);
+
+Format GetOutputFormat(Format input_format, nvinfer1::Permutation perm);
+
+int ConvertAxisFromNHWC2NCHW(int nhwc_axis);
+
+void PackNHWCToNCHWFp16(const void *src, void *dst, size_t batch, size_t plane, size_t channel, size_t task_id,
+                        size_t thread_count);
+
+std::string GetTensorFormat(nvinfer1::ITensor *trt_tensor, mindspore::Format format, bool is_same);
+
+std::string GetTensorFormat(ITensorHelper tensor_helper);
+
+std::string GetTensorFormat(nvinfer1::ITensor *trt_tensors);
+
+std::experimental::optional<nvinfer1::ReduceOperation> TryConvertTRTReduceMode(schema::ReduceMode mode);
+
+int PreprocessInputs2SameDim(TensorRTContext *ctx, const ITensorHelper &input_tensor_helper,
+                             ITensorHelper *out_tensor_helper);
+
+int GetDimsVolume(const nvinfer1::Dims &dims);
+
+int GetDimsVolume(const std::vector<int64_t> &shape);
+
+std::experimental::optional<nvinfer1::Dims> SqueezeDims(const nvinfer1::Dims &in_dims, int pos);
+
+std::experimental::optional<nvinfer1::Dims> UnsqueezeDims(const nvinfer1::Dims &in_dims, int pos, int val);
+
+nvinfer1::ITensor *Reshape(TensorRTContext *ctx, nvinfer1::ITensor *input, const std::vector<int64_t> &shape);
+
+nvinfer1::ITensor *Reshape(TensorRTContext *ctx, nvinfer1::ITensor *input, const nvinfer1::Dims &shape);
+
+int ParseData2Vector(const mindspore::MSTensor &ms_tensor, std::vector<float> *dst);
+
+template <typename T>
+bool SameDims(const std::vector<T> &shape1, const std::vector<T> &shape2) {
+  if (shape1.size() != shape2.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < shape1.size(); i++) {
+    if (std::abs(shape1[i] - shape2[i]) > 1e-6) {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+nvinfer1::Dims ConvertCudaDims(const std::vector<T> &shape) {
+  nvinfer1::Dims dims{};
+  dims.nbDims = -1;
+  if (!shape.empty() && shape.size() <= static_cast<size_t>(dims.MAX_DIMS)) {
+    dims.nbDims = shape.size();
+    for (int i = 0; i < dims.nbDims; i++) {
+      dims.d[i] = static_cast<int>(shape[i]);
+    }
+  } else {
+    MS_LOG(WARNING) << "ms shape is invalid or empty.";
+  }
+  return dims;
+}
+
+inline size_t IntToSize(int u) {
+  if (u < 0) {
+    MS_LOG(WARNING) << "The int value(" << u << ") is less than 0.";
+    return SIZE_MAX;
+  }
+  return static_cast<size_t>(u);
+}
+template <typename T>
+void Data2Vector(std::vector<float> *dst, const void *src) {
+  auto src_ptr = static_cast<const T *>(src);
+  for (int i = 0; i < dst->size(); i++) {
+    dst->at(i) = static_cast<float>(src_ptr[i]);
+  }
+}
+} // namespace mindspore::lite
+#endif // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_TENSORRT_UTILS_H_
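
For reviewers, a minimal usage sketch of the shape helpers declared in tensorrt_utils.h follows. It is illustrative only and not part of the patch; it assumes a TensorRT build environment where the header above is on the include path, and ShapeUtilsExample is a hypothetical function introduced purely for this example.

// Illustrative sketch only (not part of the patch). Function names come from
// tensorrt_utils.h above; ShapeUtilsExample itself is hypothetical.
#include <cstdint>
#include <vector>
#include "src/runtime/delegate/tensorrt/tensorrt_utils.h"

namespace mindspore::lite {
void ShapeUtilsExample() {
  // Build a 4-D nvinfer1::Dims from an int64 shape vector {1, 224, 224, 3}.
  std::vector<int64_t> shape = {1, 224, 224, 3};
  nvinfer1::Dims dims = ConvertCudaDims(shape);

  // Total element count: 1 * 224 * 224 * 3 = 150528.
  int volume = GetDimsVolume(dims);
  (void)volume;

  // Drop the batch dimension at position 0, then re-insert it with size 1.
  auto squeezed = SqueezeDims(dims, 0);  // {224, 224, 3} on success
  if (squeezed) {
    auto unsqueezed = UnsqueezeDims(squeezed.value(), 0, 1);  // back to {1, 224, 224, 3}
    (void)unsqueezed;
  }
}
}  // namespace mindspore::lite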