From aac8073a2e6759978bd4302951ceae83df836e84 Mon Sep 17 00:00:00 2001 From: yoni Date: Mon, 8 Aug 2022 12:18:58 +0300 Subject: [PATCH] Add Fast transformers to mindspore third party --- cmake/external_libs/fast_transformers.cmake | 13 + cmake/mind_expression.cmake | 1 + .../001-fast_transformer.patch | 3172 +++++++++++++++++ 3 files changed, 3186 insertions(+) create mode 100644 cmake/external_libs/fast_transformers.cmake create mode 100644 third_party/patch/fast_transformer/001-fast_transformer.patch diff --git a/cmake/external_libs/fast_transformers.cmake b/cmake/external_libs/fast_transformers.cmake new file mode 100644 index 00000000000..2eb2b53da34 --- /dev/null +++ b/cmake/external_libs/fast_transformers.cmake @@ -0,0 +1,13 @@ +set(REQ_URL "https://github.com/NVIDIA/FasterTransformer/archive/refs/tags/release/v5.0_tag.tar.gz") +set(MD5 "f2e06ec43f3b5b83017bd87b0427524f") +set(ft_libs "transformer-shared") + + +mindspore_add_pkg(fast_transformers + VER 0.5.0 + URL ${REQ_URL} + MD5 ${MD5} + LIBS ${ft_libs} + LIB_PATH output/lib + PATCHES ${MINDSPORE_PROJECT_DIR}/third_party/patch/fast_transformer/001-fast_transformer.patch + CMAKE_OPTION -DCMAKE_BUILD_TYPE=Release -DEXAMPLES=off) \ No newline at end of file diff --git a/cmake/mind_expression.cmake b/cmake/mind_expression.cmake index 92444f4accf..d7dd4d2b0de 100644 --- a/cmake/mind_expression.cmake +++ b/cmake/mind_expression.cmake @@ -49,6 +49,7 @@ endif() if(ENABLE_GPU) include(${CMAKE_SOURCE_DIR}/cmake/external_libs/cub.cmake) + include(${CMAKE_SOURCE_DIR}/cmake/external_libs/fast_transformers.cmake) if(ENABLE_MPI) include(${CMAKE_SOURCE_DIR}/cmake/external_libs/nccl.cmake) endif() diff --git a/third_party/patch/fast_transformer/001-fast_transformer.patch b/third_party/patch/fast_transformer/001-fast_transformer.patch new file mode 100644 index 00000000000..7711c076a79 --- /dev/null +++ b/third_party/patch/fast_transformer/001-fast_transformer.patch @@ -0,0 +1,3172 @@ +diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml +new file mode 100644 +index 0000000..18054db +--- /dev/null ++++ b/.github/ISSUE_TEMPLATE/bug_report.yml +@@ -0,0 +1,32 @@ ++name: "Bug Report" ++description: Submit a bug report ++labels: [ "bug" ] ++body: ++ - type: textarea ++ id: description ++ attributes: ++ label: Description ++ description: Please share your system info with us. ++ render: shell ++ placeholder: branch, docker version, GPU type ++ validations: ++ required: true ++ ++ - type: textarea ++ id: reproduced-steps ++ attributes: ++ label: Reproduced Steps ++ description: Please provide the step to reproduce the bugs ++ render: shell ++ placeholder: | ++ Steps to reproduce your bugs: ++ ++ 1. docker run -ti --gpus all nvcr.io/nvidia/pytorch:22.03-py3 bash ++ 2. git clone https://github.com/NVIDIA/FasterTransformer.git ++ 3. cd FasterTransformer mkdir build && cd build ++ 4. cmake -DSM=80 -DCMAKE_BUILD_TYPE=Release .. && make -j12 ++ 5. ./bin/bert_example 32 12 32 12 64 0 0 ++ 6. What error you see. 
++ ++ validations: ++ required: true +diff --git a/.vscode/settings.json b/.vscode/settings.json +deleted file mode 100644 +index 6f535da..0000000 +--- a/.vscode/settings.json ++++ /dev/null +@@ -1,72 +0,0 @@ +-{ +- "files.associations": { +- "*.cuh": "cpp", +- "stdexcept": "cpp", +- "chrono": "cpp", +- "cmath": "cpp", +- "type_traits": "cpp", +- "cctype": "cpp", +- "clocale": "cpp", +- "cstdarg": "cpp", +- "cstddef": "cpp", +- "cstdio": "cpp", +- "cstdlib": "cpp", +- "cstring": "cpp", +- "ctime": "cpp", +- "cwchar": "cpp", +- "cwctype": "cpp", +- "array": "cpp", +- "atomic": "cpp", +- "*.tcc": "cpp", +- "condition_variable": "cpp", +- "cstdint": "cpp", +- "deque": "cpp", +- "unordered_map": "cpp", +- "vector": "cpp", +- "exception": "cpp", +- "algorithm": "cpp", +- "functional": "cpp", +- "iterator": "cpp", +- "map": "cpp", +- "memory": "cpp", +- "memory_resource": "cpp", +- "numeric": "cpp", +- "optional": "cpp", +- "random": "cpp", +- "ratio": "cpp", +- "set": "cpp", +- "string": "cpp", +- "string_view": "cpp", +- "system_error": "cpp", +- "tuple": "cpp", +- "utility": "cpp", +- "fstream": "cpp", +- "initializer_list": "cpp", +- "iomanip": "cpp", +- "iosfwd": "cpp", +- "iostream": "cpp", +- "istream": "cpp", +- "limits": "cpp", +- "mutex": "cpp", +- "new": "cpp", +- "ostream": "cpp", +- "sstream": "cpp", +- "streambuf": "cpp", +- "thread": "cpp", +- "cinttypes": "cpp", +- "typeinfo": "cpp", +- "bitset": "cpp", +- "hash_map": "cpp", +- "hash_set": "cpp", +- "slist": "cpp", +- "regex": "cpp", +- "strstream": "cpp", +- "complex": "cpp", +- "forward_list": "cpp", +- "list": "cpp", +- "unordered_set": "cpp", +- "future": "cpp", +- "cfenv": "cpp", +- "typeindex": "cpp" +- } +-} +\ No newline at end of file +diff --git a/3rdparty/trt_fused_multihead_attention/CMakeLists.txt b/3rdparty/trt_fused_multihead_attention/CMakeLists.txt +index 8707220..aea35e6 100644 +--- a/3rdparty/trt_fused_multihead_attention/CMakeLists.txt ++++ b/3rdparty/trt_fused_multihead_attention/CMakeLists.txt +@@ -21,7 +21,6 @@ set(trt_fused_multi_head_attention_files + ) + + file(GLOB trt_fused_multi_head_attention_files ${trt_fused_multi_head_attention_files} *.sm*.cpp) +- + add_library(trt_fused_multi_head_attention STATIC ${trt_fused_multi_head_attention_files}) + target_link_libraries(trt_fused_multi_head_attention PUBLIC -lcublas -lcudart) + set_property(TARGET trt_fused_multi_head_attention PROPERTY POSITION_INDEPENDENT_CODE ON) +diff --git a/CMakeLists.txt b/CMakeLists.txt +index ea21014..3098d8a 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -14,7 +14,9 @@ + cmake_minimum_required(VERSION 3.8 FATAL_ERROR) # for PyTorch extensions, version should be greater than 3.13 + project(FasterTransformer LANGUAGES CXX CUDA) + +-find_package(CUDA 10.2 REQUIRED) ++find_package(CUDA 10.1 REQUIRED) ++ ++option(EXAMPLES "build examples" on) + + if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11") + add_definitions("-DENABLE_BF16") +@@ -125,8 +127,6 @@ if(NOT (FIND_SM STREQUAL True)) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ + -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ + -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ +- -gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \ +- -gencode=arch=compute_86,code=\\\"sm_86,compute_86\\\" \ + ") + # -rdc=true") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") +@@ -136,7 +136,11 @@ if(NOT (FIND_SM STREQUAL True)) + set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5;8.0;8.6") + endif() + set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86) +- message("-- Assign GPU architecture 
(sm=70,75,80,86)") ++if(${CUDA_VERSION_STRING} VERSION_LESS_EQUAL "10.1" ) ++ message("${CUDA_VERSION_STRING} removing unsupported sm 80 & 86") ++ list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES 80 86) ++endif() ++ message("-- Assign GPU architecture (sm=${CMAKE_CUDA_ARCHITECTURES})") + endif() + + if(BUILD_PYT) +@@ -152,8 +156,9 @@ set(CMAKE_CXX_STANDARD "${CXX_STD}") + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}") +- ++if(${CUDA_VERSION_STRING} VERSION_GREATER "10.1.105" ) ++ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}") ++endif() + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + # set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose") + set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3") +@@ -230,8 +235,10 @@ link_directories( + + add_subdirectory(3rdparty) + add_subdirectory(src) +-add_subdirectory(examples) +-add_subdirectory(tests) ++if(EXAMPLES) ++ add_subdirectory(examples) ++ add_subdirectory(tests) ++endif() + + ######################################## + +@@ -313,6 +320,7 @@ add_library(transformer-static STATIC + set_property(TARGET transformer-static PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET transformer-static PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(transformer-static PUBLIC -lcudart -lnccl -lmpi -lcublas -lcublasLt -lcurand) ++endif() + + add_library(transformer-shared SHARED + $ +@@ -324,29 +332,9 @@ add_library(transformer-shared SHARED + $ + $ + $ +- $ +- $ +- $ + $ +- $ + $ + $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ +- $ + $ + $ + $ +@@ -373,7 +361,6 @@ add_library(transformer-shared SHARED + $ + $ + $ +- $ + $ + $ + $ +@@ -387,14 +374,17 @@ add_library(transformer-shared SHARED + $ + $ + $) ++ + set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON) + set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) + set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX) +-target_link_libraries(transformer-shared PUBLIC -lcudart -lnccl -lmpi -lcublas -lcublasLt -lcurand) ++target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand) + +-include(GNUInstallDirs) ++#include(GNUInstallDirs) + set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/FasterTransformer) + ++ ++ + include(CMakePackageConfigHelpers) + configure_package_config_file( + ${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in +@@ -402,52 +392,23 @@ configure_package_config_file( + INSTALL_DESTINATION ${INSTALL_CONFIGDIR} + ) + +-install( +- FILES +- ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake +- DESTINATION ${INSTALL_CONFIGDIR} +-) + + install( + TARGETS + transformer-shared + EXPORT + transformer-shared-targets +- LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer +- ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer +-) +- +-install( +- EXPORT +- transformer-shared-targets +- FILE +- FasterTransformerTargets.cmake +- DESTINATION +- ${INSTALL_CONFIGDIR} ++ LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/output/lib ++ ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/output/lib + ) + + file(GLOB_RECURSE HEADER_FILES "*.h" "*.hpp" "*.cuh") + foreach ( file ${HEADER_FILES} ) + file( RELATIVE_PATH rfile 
${CMAKE_CURRENT_SOURCE_DIR} ${file} ) + get_filename_component( dir ${rfile} DIRECTORY ) +- install( FILES ${file} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/${dir} ) ++ install( FILES ${file} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/output/include ) + endforeach() + + +-################################################################################ +-# add_executable(gpt sample/cpp/gpt_sample.cc ) +-# target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi transformer-static) +-# target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi decoder decoding) +- +-export( +- EXPORT +- transformer-shared-targets +- FILE +- ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerTargets.cmake +- NAMESPACE +- TritonCore:: +-) + +-export(PACKAGE FasterTransformer) + +-endif() # BUILD_MULTI_GPU +diff --git a/README.md b/README.md +index a60983c..45b5374 100644 +--- a/README.md ++++ b/README.md +@@ -52,7 +52,7 @@ FasterTransformer is built on top of CUDA, cuBLAS, cuBLASLt and C++. We provide + | Swin Transformer | PyTorch | Yes | Yes | - | - | - | + | Swin Transformer | TensorRT | Yes | Yes | - | - | - | + | ViT | PyTorch | Yes | Yes | - | - | - | +-| ViT | TensorRT | Yes | Yes | - | - | - | ++| ViT | TensorRT | Yes | - | - | - | - | + + * Note that the FasterTransformer supports the models above on C++ because all source codes are built on C++. + +diff --git a/deploy.sh b/deploy.sh +new file mode 100755 +index 0000000..ba7f644 +--- /dev/null ++++ b/deploy.sh +@@ -0,0 +1,25 @@ ++#copy cuda folder (once) ++base=`git rev-parse --show-toplevel` ++server=10.10.10.174 ++while getopts "d" opt ++do ++case "${opt}" in ++ "d" ) ++ debug=1 ++ shift ++ ;; ++esac ++done ++file=`realpath $1` ++shift ++rsync -v ${file} ${server}:${file} ++echo "file=${file}" ++rsync -v ${base}/../mindspore/trc/transformer/*.fp32 ${server}:${base}/build/bin ++# echo "cd ${base}/build/bin/" ++command=$(cat <<-ENDM ++ CUDA_VISIBLE_DEVICES=0 \ ++ ${file} $@ ++ENDM ++) ++echo "command=${command}" ++ssh ${server} "cd ${base}/build/bin ;${command}" +diff --git a/docs/gpt_guide.md b/docs/gpt_guide.md +index afcba9a..71c4fab 100644 +--- a/docs/gpt_guide.md ++++ b/docs/gpt_guide.md +@@ -312,7 +312,7 @@ python tools/checkpoint_util.py --model-type GPT --loader megatron --saver faste + To convert the Megatron GPT model to binary, FasterTransformer provides a tool `examples/onnx/multi_gpu_gpt/onnx_ckpt_convert.py` to convert the checkpoint. + + ```bash +-wget https://github.com/onnx/models/raw/master/text/machine_comprehension/gpt-2/model/gpt2-10.onnx ++wget https://github.com/onnx/models/raw/main/text/machine_comprehension/gpt-2/model/gpt2-10.onnx + python ../examples/onnx/multi_gpu_gpt/onnx_ckpt_convert.py -i gpt2-10.onnx -o ../models/onnx-models/c-model/124m/ -i_g 1 + python ../examples/onnx/multi_gpu_gpt/onnx_ckpt_convert.py -i gpt2-10.onnx -o ../models/onnx-models/c-model/124m/ -i_g 4 + ``` +diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt +index b67cd01..3cc4155 100644 +--- a/examples/cpp/CMakeLists.txt ++++ b/examples/cpp/CMakeLists.txt +@@ -13,6 +13,7 @@ + # limitations under the License. + + add_subdirectory(bert) ++add_subdirectory(ms) + add_subdirectory(bert_int8) + add_subdirectory(decoding) + add_subdirectory(gpt) +diff --git a/examples/cpp/ms/CMakeLists.txt b/examples/cpp/ms/CMakeLists.txt +new file mode 100644 +index 0000000..09920ff +--- /dev/null ++++ b/examples/cpp/ms/CMakeLists.txt +@@ -0,0 +1,20 @@ ++# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
All rights reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++add_executable(ms_benchmark ms.cc) ++if (SPARSITY_SUPPORT) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt transformer-shared) ++else() ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart transformer-shared) ++endif() +diff --git a/examples/cpp/ms/initialize.h b/examples/cpp/ms/initialize.h +new file mode 100644 +index 0000000..b607656 +--- /dev/null ++++ b/examples/cpp/ms/initialize.h +@@ -0,0 +1,275 @@ ++#pragma once ++ ++#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" ++#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" ++ ++using namespace fastertransformer; ++struct opt_arg { ++ size_t batch_size; ++ size_t num_layers; ++ size_t seq_len; // source seq len ++ size_t tgt_seq_len; ++ size_t head_num; ++ size_t hidden_size; ++ size_t size_per_head; ++ bool is_remove_padding; ++ std::string model_name; ++ std::string compute_type; ++ std::string w_compute_type; ++ std::string s_compute_type; ++}; ++template ++struct DecriptorTest{ ++ std::vector input_tensors; ++ std::vector output_tensors; ++ std::vector output_python_tensors; ++ std::vector w_tensors; ++ BaseAttentionLayer* Attn; ++}; ++ ++typedef enum { ++ MHA_X1 = 1, // AttnIn + AttnMask ++ MHA_X2, // AttnIn + EncOut -- same seq size + AttnMask ++ MHA_CROSS, // AttnIn + EncOut + AttnMAsk ++}MODEL_TEST_ID_E; ++ ++int ModelNum(std::string model_name) { ++ if (model_name == "mha_x1") { ++ return MHA_X1; ++ } else if (model_name == "mha_x2") { ++ return MHA_X2; ++ } else if (model_name == "mha_cross") { ++ return MHA_CROSS; ++ } else { ++ return -1; ++ } ++} ++ ++template ++void InitializeAttn(opt_arg* opt_a, ++ DecriptorTest &desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ Allocator* allocator) { ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ ++ desc.Attn = new MSMHALayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ ++ desc.input_tensors.push_back(Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, ++ 0}); ++ desc.input_tensors.push_back(Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, ++ 0}); ++ // GPU RESULTS ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->size_per_head}, 0}); ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->size_per_head}, 0}); ++ ++ 
desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->size_per_head}, 0}); ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->size_per_head}, 0}); ++ ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, 3 * hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{3 * hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units}, 0}); ++} ++template ++void InitializeAttnX2(opt_arg* opt_a, ++ DecriptorTest &desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ Allocator* allocator) { ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ ++ desc.Attn = new MSMHALayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ ++ desc.input_tensors.push_back(Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, ++ 0}); ++ ++ desc.input_tensors.push_back(Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, ++ 0}); ++ ++ desc.input_tensors.push_back(Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, (size_t)(opt_a->seq_len)}, ++ 0}); ++ ++ // GPU RESULTS ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->size_per_head}, 0}); ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->size_per_head}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->size_per_head}, 0}); ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->size_per_head}, 0}); ++ ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{3 * hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, 2 * hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units}, 0}); ++} ++ ++template ++void InitializeAttnCross(opt_arg* opt_a, ++ DecriptorTest &desc, ++ cudaStream_t stream, ++ cublasMMWrapper* 
cublas_wrapper, ++ Allocator* allocator) { ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ ++ desc.Attn = new MSMHALayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ ++ desc.input_tensors.push_back(Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, ++ 0}); ++ ++ desc.input_tensors.push_back(Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, ++ 0}); ++ ++ desc.input_tensors.push_back(Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ // GPU RESULTS ++ ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ ++ ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{3 * hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, 2 * hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units}, 0}); ++} ++ ++template ++void Init(opt_arg* opt_a, ++ DecriptorTest &desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ Allocator* allocator) ++{ ++ int model_num = ModelNum(opt_a->model_name); ++ switch (model_num) { ++ case MHA_X1: ++ InitializeAttn(opt_a, ++ desc, ++ stream, ++ cublas_wrapper, ++ allocator); ++ break; ++ case MHA_X2: ++ InitializeAttnX2(opt_a, ++ desc, ++ stream, ++ cublas_wrapper, ++ allocator); ++ break; ++ case MHA_CROSS: ++ InitializeAttnCross(opt_a, ++ desc, ++ stream, ++ cublas_wrapper, ++ allocator); ++ break; ++ default: ++ break; ++ } ++} ++template ++void InitWeight(opt_arg *opt_a, AttentionWeight &attn_weights, std::vector w_tensors) { ++ int modelId = ModelNum(opt_a->model_name); ++ if (modelId == MHA_X1) { ++ attn_weights.query_weight.kernel = (const T*)w_tensors[0].data; ++ attn_weights.query_weight.bias = (const T*)w_tensors[1].data; ++ attn_weights.attention_output_weight.kernel = (const T*)w_tensors[2].data; ++ attn_weights.attention_output_weight.bias = (const T*)w_tensors[3].data; ++ } else if (modelId == MHA_X2 || modelId == MHA_CROSS) { ++ attn_weights.query_weight.kernel = (const 
T*)w_tensors[0].data; ++ attn_weights.query_weight.bias = (const T*)w_tensors[1].data; ++ attn_weights.key_weight.kernel = (const T*)w_tensors[2].data; ++ attn_weights.attention_output_weight.kernel = (const T*)w_tensors[3].data; ++ attn_weights.attention_output_weight.bias = (const T*)w_tensors[4].data; ++ } else { ++ // return ERROR illegal model ! ++ } ++} +diff --git a/examples/cpp/ms/ms.cc b/examples/cpp/ms/ms.cc +new file mode 100644 +index 0000000..3121200 +--- /dev/null ++++ b/examples/cpp/ms/ms.cc +@@ -0,0 +1,434 @@ ++/* ++ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" ++#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" ++#include "src/fastertransformer/utils/logger.h" ++#include "examples/cpp/ms/initialize.h" ++#include ++#include ++#include ++using namespace fastertransformer; ++ ++template ++int MsExample(opt_arg* opt_a); ++void usage() { ++ std::cout << "Usage: ms_benchmark -b -l " ++ << "-s -H -S -p " ++ << "-T -W -F " ++ << "-m \n"; ++} ++ ++bool read_args(int argc, char* argv[], opt_arg* opt_a) { ++ int opt; ++ while ((opt = getopt(argc, argv, "b:l:s:t:H:S:p:m:T:W:F:i:w:")) != -1) { ++ switch (opt) { ++ case 'b': ++ opt_a->batch_size = atoi(optarg); ++ break; ++ case 'l': ++ opt_a->num_layers = atoi(optarg); ++ break; ++ case 's': ++ opt_a->seq_len = atoi(optarg); ++ break; ++ case 't': ++ opt_a->tgt_seq_len = atoi(optarg); ++ break; ++ case 'H': ++ opt_a->head_num = atoi(optarg); ++ break; ++ case 'S': ++ opt_a->hidden_size = atoi(optarg); ++ break; ++ case 'p': ++ opt_a->is_remove_padding = static_cast(atoi(optarg)); ++ break; ++ case 'm': ++ opt_a->model_name = std::string(optarg); ++ break; ++ case 'T': ++ opt_a->compute_type = std::string(optarg); ++ break; ++ case 'W': ++ opt_a->w_compute_type = std::string(optarg); ++ break; ++ case 'F': ++ opt_a->s_compute_type = std::string(optarg); ++ break; ++ case 'i': ++ case 'w': ++ break; ++ case 'h': ++ default: ++ usage(); ++ return false; ++ } ++ } ++ opt_a->size_per_head = opt_a->hidden_size / opt_a->head_num; ++ opt_a->tgt_seq_len = (opt_a->tgt_seq_len == -1) ? 
opt_a->seq_len : opt_a->tgt_seq_len; ++ return true; ++} ++ ++ ++ ++int main(int argc, char** argv) { ++ opt_arg opt_a; ++ opt_a.batch_size = 1; ++ opt_a.num_layers = 1; ++ opt_a.seq_len = 1; ++ opt_a.tgt_seq_len = -1; ++ opt_a.head_num = 1; ++ opt_a.hidden_size = 1; ++ opt_a.size_per_head = 1; ++ opt_a.is_remove_padding = false; ++ opt_a.model_name = ""; ++ opt_a.compute_type = "fp32"; ++ opt_a.w_compute_type = "fp32"; ++ opt_a.s_compute_type = "fp32"; ++ ++ ++ if (read_args(argc, argv, &opt_a)) { ++ bool c_type_fp32 = (opt_a.compute_type.compare("fp32") == 0); ++ bool w_type_fp32 = (opt_a.w_compute_type.compare("fp32") == 0); ++ bool s_type_fp32 = (opt_a.s_compute_type.compare("fp32") == 0); ++ ++ s_type_fp32 = c_type_fp32; // Do softmax compute type as compute type ++ if (c_type_fp32 && w_type_fp32 && s_type_fp32) { ++ return MsExample(&opt_a); ++ } else if (c_type_fp32 && w_type_fp32 && !s_type_fp32) { ++ return MsExample(&opt_a); ++ } else if (c_type_fp32 && !w_type_fp32 && s_type_fp32) { ++ return MsExample(&opt_a); ++ } else if (c_type_fp32 && !w_type_fp32 && !s_type_fp32) { ++ return MsExample(&opt_a); ++ } else if (!c_type_fp32 && w_type_fp32 && s_type_fp32) { ++ return MsExample(&opt_a); ++ } else if (!c_type_fp32 && w_type_fp32 && !s_type_fp32) { ++ return MsExample(&opt_a); ++ } else if (!c_type_fp32 && !w_type_fp32 && s_type_fp32) { ++ return MsExample(&opt_a); ++ } else { // (!c_type_fp32 && !w_type_fp32 && !s_type_fp32) ++ return MsExample(&opt_a); ++ } ++ } ++} ++ ++template ++int ReadFileBuf(const std::string file, T* buf, size_t size_buff) { ++ if (file.empty()) { ++ FT_LOG_ERROR("file is nullptr\n"); ++ return -1; ++ } ++ ++ std::ifstream ifs(file); ++ if (!ifs.good()) { ++ FT_LOG_ERROR("file: %s does not exist\n", file.c_str()); ++ return -1; ++ } ++ ++ if (!ifs.is_open()) { ++ FT_LOG_ERROR("file: open failed\n"); ++ return -1; ++ } ++ ++ ifs.seekg(0, std::ios::end); ++ size_t file_size = ifs.tellg(); ++ if (file_size != size_buff) { ++ ifs.close(); ++ FT_LOG_ERROR("file: %s size is %d desc size is %d\n", file.c_str(), file_size, size_buff); ++ return -1; ++ } ++ // return 0; ++ ifs.seekg(0, std::ios::beg); ++ ifs.read(reinterpret_cast(buf), size_buff); ++ ifs.close(); ++ return 0; ++} ++ ++template ++int CalcTensorsSize(std::vector& tensors) { ++ int total = 0; ++ for (size_t i = 0; i < tensors.size(); i++) { ++ float size = 1; ++ for (size_t j = 0; j < tensors[i].shape.size(); j++) { ++ size *= tensors[i].shape[j]; ++ } ++ total += size; ++ } ++ ++ return total * sizeof(T); ++} ++ ++template ++int ReadTensors(std::vector& tensors, std::string post, opt_arg* opt_a, bool cpy = true) { ++ for (size_t i = 0; i < tensors.size(); i++) { ++ // if (tensors[i].type != TYPE_FP32) { ++ // FT_LOG_ERROR("Type not supported, exiting "); ++ // return -1; ++ // } ++ float size = 1; ++ for (size_t j = 0; j < tensors[i].shape.size(); j++) { ++ size *= tensors[i].shape[j]; ++ } ++ std::string suffix = post.compare("weight") == 0 ? opt_a->w_compute_type : opt_a->compute_type; ++ std::string fn = opt_a->model_name + "_" + post + std::to_string(i + 1) + "." 
+ suffix; ++ T* input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ int res = ReadFileBuf(fn, input_host, size * sizeof(T)); ++ FT_CHECK(!res); ++ if (tensors[i].where == MEMORY_GPU) { ++ deviceMalloc(&input, size, false); ++ if (cpy) ++ cudaH2Dcpy(input, input_host, size); ++ else ++ deviceMemSetZero(input, size); ++ tensors[i].data = input; ++ free(input_host); ++ input_host = 0; ++ } ++ else if (tensors[i].where == MEMORY_CPU) { ++ tensors[i].data = input_host; ++ } ++ } ++ return 0; ++} ++template ++static float CompareData(const T* refOutput, int size, const T* msTensorData) { ++ constexpr float relativeTolerance = 1e-5; ++ constexpr float absoluteTolerance = 1e-8; ++ size_t errorCount = 0; ++ float meanError = 0; ++ std::cout << "Out tensor size is: " << size << std::endl; ++ std::cout << "Data of model output: "; ++ for (int j = 0; j < std::min(50, size); j++) { ++ std::cout << static_cast(msTensorData[j]) << " "; ++ } ++ std::cout << std::endl; ++ std::cout << "Data of Ref output : "; ++ for (int j = 0; j < std::min(50, size); j++) { ++ std::cout << static_cast(refOutput[j]) << " "; ++ } ++ std::cout << std::endl; ++ for (int j = 0; j < size; j++) { ++ if (std::isnan(msTensorData[j]) || std::isinf(msTensorData[j])) { ++ std::cerr << "Output tensor has nan or inf data, compare fail" << std::endl; ++ FT_LOG_ERROR("Output tensor has nan or inf data, compare fail\n"); ++ // return RET_ERROR; ++ return -1; ++ } ++ ++ auto tolerance = absoluteTolerance + relativeTolerance * fabs(refOutput[j]); ++ auto absoluteError = std::fabs(static_cast(msTensorData[j]) - static_cast(refOutput[j])); ++ if (absoluteError > tolerance) { ++ if (fabs(refOutput[j]) == 0) { ++ if (absoluteError > 1e-5) { ++ meanError += absoluteError; ++ errorCount++; ++ } ++ else { ++ continue; ++ } ++ } ++ else { ++ // just assume that atol = rtol ++ meanError += absoluteError / (fabs(refOutput[j]) + FLT_MIN); ++ errorCount++; ++ } ++ } ++ } ++ if (meanError > 0.0f) { ++ meanError /= errorCount; ++ } ++ if (meanError <= 0.0000001) { ++ std::cout << "Mean bias of tensor: 0%" << std::endl; ++ } ++ else { ++ std::cout << "Mean bias of tensor: " << meanError * 100 << "%" << std::endl; ++ } ++ std::cout << std::endl; ++ return meanError; ++} ++template ++int CompareOutput(std::vector output_python_tensors, std::vector output_tensors) { ++ float total_bias = 0; ++ int total_size = 0; ++ float accuracy_threshold_ = 0.5f; ++ bool has_error = false; ++ for (size_t i = 0; i < output_tensors.size(); i++) { ++ float size = 1; ++ for (size_t j = 0; j < output_tensors[i].shape.size(); j++) { ++ size *= output_tensors[i].shape[j]; ++ } ++ T* output_device = (T*)output_tensors[i].data; ++ T* output_host = (T*)malloc(size * sizeof(T)); ++ cudaD2Hcpy(output_host, output_device, size); ++ float bias = CompareData((T*)output_python_tensors[i].data, size, output_host); ++ free(output_host); ++ if (bias >= 0) { ++ total_bias += bias; ++ total_size++; ++ } else { ++ has_error = true; ++ break; ++ } ++ } ++ if (!has_error) { ++ float mean_bias; ++ if (total_size != 0) { ++ mean_bias = total_bias / total_size * 100; ++ } else { ++ mean_bias = 0; ++ } ++ ++ std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" ++ << " threshold is:" <accuracy_threshold_) { ++ FT_LOG_INFO("Mean bias of all nodes/tensors is too big: %f %",mean_bias); ++ std::cout << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl; ++ return -9; ++ } else { ++ return 0; ++ } ++ } else { ++ FT_LOG_ERROR("Error in CompareData"); ++ 
std::cerr << "Error in CompareData" << std::endl; ++ std::cout << "=======================================================" << std::endl << std::endl; ++ return -1; ++ } ++} ++ ++void FreeDesc(std::vector &desc){ ++ for (size_t i = 0; i < desc.size(); i++) ++ { ++ if (desc[i].where == MEMORY_GPU) { ++ cudaFree((float*)desc[i].data); ++ } ++ else if(desc[i].where == MEMORY_CPU){ ++ free((float*)desc[i].data); ++ } ++ } ++} ++ ++ ++template ++int MsExample(opt_arg* opt_a) { ++ printf("[INFO] Device: %s \n", getDeviceName().c_str()); ++ ++ cudaStream_t stream; ++ cublasHandle_t cublas_handle; ++ cublasLtHandle_t cublaslt_handle; ++ cudaStreamCreate(&stream); ++ cublasCreate(&cublas_handle); ++ cublasLtCreate(&cublaslt_handle); ++#ifdef SPARSITY_ENABLED ++ cusparseLtHandle_t cusparselt_handle; ++ CHECK_CUSPARSE(cusparseLtInit(&cusparselt_handle)); ++#endif ++ cublasSetStream(cublas_handle, stream); ++ cublasAlgoMap* cublas_algo_map = new cublasAlgoMap("gemm_config.in", ""); ++ ++ Allocator allocator(getDevice()); ++ ++ std::mutex* cublas_wrapper_mutex = new std::mutex(); ++#ifdef SPARSITY_ENABLED ++ cublasMMWrapper cublas_wrapper = cublasMMWrapper( ++ cublas_handle, cublaslt_handle, cusparselt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, &allocator); ++#else ++ cublasMMWrapper cublas_wrapper = ++ cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, cublas_wrapper_mutex, &allocator); ++#endif ++ if (std::is_same::value) { ++ if (std::is_same::value) { ++ cublas_wrapper.setFP16MixedGemmConfig(); ++ } else { ++ cublas_wrapper.setFP16GemmConfig(); ++ } ++ } else if (std::is_same::value) { ++ if (std::is_same::value) { ++ cublas_wrapper.setFP32MixedGemmConfig(); ++ } else { ++ cublas_wrapper.setFP32GemmConfig(); ++ } ++ } ++ ++ DecriptorTest desc; ++ Init(opt_a, ++ desc, ++ stream, ++ &cublas_wrapper, ++ &allocator); ++ int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.output_tensors, std::string("output"), opt_a, false); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.output_python_tensors, std::string("output"), opt_a); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.w_tensors, std::string("weight"), opt_a); ++ FT_CHECK(!res); ++ ++ std::cout << "inputs size: " << CalcTensorsSize(desc.input_tensors) << std::endl; ++ std::cout << "weights size: " << CalcTensorsSize(desc.w_tensors) << std::endl; ++ std::cout << "ouputs size: " << CalcTensorsSize(desc.output_tensors) << std::endl; ++ ++ AttentionWeight attn_weights; ++ InitWeight(opt_a, attn_weights, desc.w_tensors); ++ ++ // test for BE !! 
++ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++ CompareOutput(desc.output_python_tensors, desc.output_tensors); ++ ++#define DO_TIME ++#ifdef DO_TIME ++ // warmup ++ for (int i = 0; i < 10; i++) { ++ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++ } ++ ++ // profile time ++ const int ite = 1000; ++ CudaTimer cuda_timer(stream); ++ cuda_timer.start(); ++ for (int i = 0; i < ite; i++) { ++ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++ } ++ float total_time = cuda_timer.stop(); ++ ++ printf("batch_size %ld seq_len %ld layer %ld " ++ "AVG FT-CPP-time %.2f ms (%d iterations) " ++ "Total Time %.2f ms\n", ++ opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->num_layers, ++ total_time / ite, ++ ite, total_time); ++#endif ++ ++#ifdef SPARSITY_ENABLED ++ cusparseLtDestroy(&cusparselt_handle); ++#endif ++ delete cublas_algo_map; ++ delete cublas_wrapper_mutex; ++ FreeDesc(desc.output_tensors); ++ FreeDesc(desc.input_tensors); ++ FreeDesc(desc.output_python_tensors); ++ FreeDesc(desc.w_tensors); ++ return 0; ++} +diff --git a/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer b/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer +new file mode 160000 +index 0000000..cbaa0d8 +--- /dev/null ++++ b/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer +@@ -0,0 +1 @@ ++Subproject commit cbaa0d8707db403d85ad0e13c59f2f71cd6db425 +diff --git a/examples/pytorch/vit/ViT-quantization/ViT-pytorch b/examples/pytorch/vit/ViT-quantization/ViT-pytorch +new file mode 160000 +index 0000000..460a162 +--- /dev/null ++++ b/examples/pytorch/vit/ViT-quantization/ViT-pytorch +@@ -0,0 +1 @@ ++Subproject commit 460a162767de1722a014ed2261463dbbc01196b6 +diff --git a/path.sh b/path.sh +new file mode 100755 +index 0000000..53f5ca6 +--- /dev/null ++++ b/path.sh +@@ -0,0 +1 @@ ++export PATH=/usr/local/cuda-11/bin:/home/yoni/.vscode-server/bin/4af164ea3a06f701fe3e89a2bcbb421d2026b68f/bin/remote-cli:/home/yoni/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +diff --git a/src/fastertransformer/kernels/activation_kernels.cu b/src/fastertransformer/kernels/activation_kernels.cu +index 7ff8e0f..e1be64c 100644 +--- a/src/fastertransformer/kernels/activation_kernels.cu ++++ b/src/fastertransformer/kernels/activation_kernels.cu +@@ -201,12 +201,21 @@ __global__ void add_bias(H_T* out, const B_T* __restrict bias, int m, int n) + } + } + ++template ++__global__ void add_bias_basic(H_T* out, const B_T* __restrict bias, int m, int n) ++{ ++ for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) { ++ out[id] = out[id] + (H_T)ldg(&bias[id % n]); ++ } ++} ++ + template<> + __global__ void add_bias(half* out, const half* __restrict bias, int m, int n) + { + half2* out_ptr = (half2*)out; + const half2* bias_ptr = (half2*)bias; +- for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) { ++ int id = blockIdx.x * blockDim.x + threadIdx.x; ++ for (; id < m * n; id += blockDim.x * gridDim.x) { + out_ptr[id] = out_ptr[id] + __ldg(&bias_ptr[id % n]); + } + } +@@ -228,15 +237,29 @@ void invokeAddBias(H_T* out, const B_T* bias, const int m, const int n, cudaStre + { + const int data_type_factor = 4 / sizeof(H_T); // 1 for fp32, 2 for fp16 and bf16 + dim3 block, grid; +- if (n / 4 / data_type_factor <= 1024) { +- block.x = n / 4 / data_type_factor; +- grid.x = m; +- } +- 
else { +- block.x = 1024; +- grid.x = ceil(m * n / 1024.); ++ ++ bool reminder = (data_type_factor != 1) ? (n % data_type_factor) : false; ++ if (reminder) { ++ if (n / 4 <= 1024) { ++ block.x = n / 4; ++ grid.x = m; ++ } ++ else { ++ block.x = 1024; ++ grid.x = ceil(m * n / 1024.); ++ } ++ add_bias_basic<<>>(out, bias, m, n); ++ } else { ++ if (n / 4 / data_type_factor <= 1024) { ++ block.x = n / 4 / data_type_factor; ++ grid.x = m; ++ } ++ else { ++ block.x = 1024; ++ grid.x = ceil(m * n / 1024.); ++ } ++ add_bias<<>>(out, bias, m, (n / data_type_factor)); + } +- add_bias<<>>(out, bias, m, n / data_type_factor); + } + + template void invokeAddBias(float* out, const float* bias, const int m, const int n, cudaStream_t stream); +diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.cu b/src/fastertransformer/kernels/unfused_attention_kernels.cu +index f951e71..4455879 100644 +--- a/src/fastertransformer/kernels/unfused_attention_kernels.cu ++++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu +@@ -243,6 +243,116 @@ __global__ void softmax_kernel_v4(T* qk_buf_, + } + } + ++template ++__global__ void softmax_cross_kernel_v4(T* qk_buf_, ++ const T_IN* qk_buf_src, ++ const T* attr_mask, ++ const int batch_size, ++ const int head_num, ++ const int seq_len, ++ const int trgt_seq_len, ++ const T scalar) ++{ ++ for (int seq_id = blockIdx.x; seq_id < seq_len; seq_id += gridDim.x) { ++ float data[ITEMS_PER_THREAD]; ++ int qk_offset; ++ __shared__ float s_mean, s_max; ++ float local_max = -1e20f; ++ for (int i = 0; blockDim.x * i + threadIdx.x < trgt_seq_len; i++) { ++ qk_offset = ++ ((blockIdx.y * head_num + blockIdx.z) * seq_len + seq_id) * trgt_seq_len + blockDim.x * i + threadIdx.x; ++ int mask_offset = (blockIdx.y * seq_len + seq_id) * trgt_seq_len + blockDim.x * i + threadIdx.x; ++ ++ float qk = static_cast(qk_buf_src[qk_offset]); ++ float mask_val = static_cast(ldg(&attr_mask[mask_offset])); ++ ++ mask_val = (1.0f - mask_val) * -10000.0f; ++ ++ data[i] = qk * static_cast(scalar) + mask_val; ++ local_max = fmax(local_max, data[i]); ++ } ++ ++ float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax(local_max); ++ if (threadIdx.x == 0) { ++ s_max = max_val; ++ } ++ __syncthreads(); ++ ++ float local_sum = 0; ++ for (int i = 0; blockDim.x * i + threadIdx.x < trgt_seq_len; i++) { ++ data[i] = __expf(data[i] - s_max); ++ local_sum += data[i]; ++ } ++ float sum_val = blockDim.x <= 32 ? 
warpReduceSum(local_sum) : blockReduceSum(local_sum); ++ if (threadIdx.x == 0) { ++ s_mean = sum_val + 1e-6f; ++ s_mean = __fdividef(1.0f, s_mean); ++ } ++ __syncthreads(); ++ ++ for (int i = 0; blockDim.x * i + threadIdx.x < trgt_seq_len; i++) { ++ qk_offset = ++ ((blockIdx.y * head_num + blockIdx.z) * seq_len + seq_id) * trgt_seq_len + blockDim.x * i + threadIdx.x; ++ qk_buf_[qk_offset] = (T)(data[i] * s_mean); ++ } ++ } ++} ++ ++template ++__global__ void softmax_mix_kernel_v4(T* qk_buf_, ++ const T_M* attr_mask, ++ const int batch_size, ++ const int head_num, ++ const int seq_len, ++ const int trgt_seq_len, ++ const T scalar) ++{ ++ T* qk_buf_src = qk_buf_; ++ for (int seq_id = blockIdx.x; seq_id < seq_len; seq_id += gridDim.x) { ++ float data[ITEMS_PER_THREAD]; ++ int qk_offset; ++ __shared__ float s_mean, s_max; ++ float local_max = -1e20f; ++ for (int i = 0; blockDim.x * i + threadIdx.x < trgt_seq_len; i++) { ++ qk_offset = ++ ((blockIdx.y * head_num + blockIdx.z) * seq_len + seq_id) * trgt_seq_len + blockDim.x * i + threadIdx.x; ++ int mask_offset = (blockIdx.y * seq_len + seq_id) * trgt_seq_len + blockDim.x * i + threadIdx.x; ++ ++ float qk = static_cast(qk_buf_src[qk_offset]); ++ float mask_val = static_cast(ldg(&attr_mask[mask_offset])); ++ ++ mask_val = (1.0f - mask_val) * -10000.0f; ++ ++ data[i] = qk * static_cast(scalar) + mask_val; ++ local_max = fmax(local_max, data[i]); ++ } ++ ++ float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax(local_max); ++ if (threadIdx.x == 0) { ++ s_max = max_val; ++ } ++ __syncthreads(); ++ ++ float local_sum = 0; ++ for (int i = 0; blockDim.x * i + threadIdx.x < trgt_seq_len; i++) { ++ data[i] = __expf(data[i] - s_max); ++ local_sum += data[i]; ++ } ++ float sum_val = blockDim.x <= 32 ? 
warpReduceSum(local_sum) : blockReduceSum(local_sum); ++ if (threadIdx.x == 0) { ++ s_mean = sum_val + 1e-6f; ++ s_mean = __fdividef(1.0f, s_mean); ++ } ++ __syncthreads(); ++ ++ for (int i = 0; blockDim.x * i + threadIdx.x < trgt_seq_len; i++) { ++ qk_offset = ++ ((blockIdx.y * head_num + blockIdx.z) * seq_len + seq_id) * trgt_seq_len + blockDim.x * i + threadIdx.x; ++ qk_buf_[qk_offset] = (T)(data[i] * s_mean); ++ } ++ } ++} ++ + template + __global__ void softmax_kernel_v4_half2( + T* qk_buf_, const T* attr_mask, const int batch_size, const int head_num, const int seq_len, const T scalar) +@@ -298,6 +408,61 @@ __global__ void softmax_kernel_v4_half2( + } + } + ++template ++__global__ void softmax_cross_kernel_v4_half2( ++ T* qk_buf_, const T* attr_mask, const int batch_size, const int head_num, const int seq_len, const int trgt_seq_len, const T scalar) ++{ ++ using T2 = typename TypeConverter::Type; ++ T2* qk_buf_half2 = (T2*)qk_buf_; ++ const T2* attr_mask_half2 = (const T2*)attr_mask; ++ ++ for (int seq_id = blockIdx.x; seq_id < seq_len; seq_id += gridDim.x) { ++ T2 data[ITEMS_PER_THREAD]; ++ int qk_offset; ++ __shared__ float s_mean, s_max; ++ float local_max = -1e20f; ++ for (int i = 0; blockDim.x * i + threadIdx.x < (trgt_seq_len / 2) && i < ITEMS_PER_THREAD; i++) { ++ qk_offset = ((blockIdx.y * head_num + blockIdx.z) * seq_len + seq_id) * (trgt_seq_len / 2) + blockDim.x * i ++ + threadIdx.x; ++ int mask_offset = (blockIdx.y * seq_len + seq_id) * (trgt_seq_len / 2) + blockDim.x * i + threadIdx.x; ++ ++ T2 qk = qk_buf_half2[qk_offset]; ++ T2 mask_val = ldg(&attr_mask_half2[mask_offset]); ++ mask_val = hmul2(hsub2(float2type2(1.0f), mask_val), float2type2(-10000.0f)); ++ ++ data[i] = hadd2(hmul2(qk, type2type2(scalar)), mask_val); ++ ++ local_max = fmax(local_max, fmax((float)data[i].x, (float)data[i].y)); ++ } ++ ++ float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax(local_max); ++ if (threadIdx.x == 0) { ++ s_max = max_val; ++ } ++ __syncthreads(); ++ ++ float local_sum = 0; ++ for (int i = 0; blockDim.x * i + threadIdx.x < (trgt_seq_len / 2) && i < ITEMS_PER_THREAD; i++) { ++ data[i] = hexp2(hsub2(data[i], float2type2(s_max))); ++ local_sum += (float)(data[i].x + data[i].y); ++ } ++ ++ float sum_val = blockDim.x <= 32 ? 
warpReduceSum(local_sum) : blockReduceSum(local_sum); ++ ++ if (threadIdx.x == 0) { ++ s_mean = sum_val + 1e-6f; ++ s_mean = __fdividef(1.0f, s_mean); ++ } ++ __syncthreads(); ++ ++ for (int i = 0; blockDim.x * i + threadIdx.x < (trgt_seq_len / 2) && i < ITEMS_PER_THREAD; i++) { ++ qk_offset = ((blockIdx.y * head_num + blockIdx.z) * seq_len + seq_id) * (trgt_seq_len / 2) + blockDim.x * i ++ + threadIdx.x; ++ qk_buf_half2[qk_offset] = hmul2(data[i], float2type2(s_mean)); ++ } ++ } ++} ++ + template + __global__ void softmax_kernel_v5_half2( + T* qk_buf_, const T* attr_mask, const int batch_size, const int head_num, const int seq_len, const T scalar) +@@ -415,6 +580,123 @@ __global__ void softmax_kernel_v5_half2( + } + } + ++template ++__global__ void softmax_cross_kernel_v5_half2( ++ T* qk_buf_, const T* attr_mask, const int batch_size, const int head_num, const int seq_len, const int trgt_seq_len, const T scalar) ++{ ++ using T2 = typename TypeConverter::Type; ++ T2* qk_buf_half2 = (T2*)qk_buf_; ++ const T2* attr_mask_half2 = (const T2*)attr_mask; ++ ++ for (int seq_id = blockIdx.x; seq_id < seq_len; seq_id += gridDim.x * NUM) { ++ T2 data[NUM][ITEMS_PER_THREAD]; ++ ++ int qk_offset[NUM]; ++ ++ __shared__ float s_sum[NUM], s_max[NUM]; ++ float local_max[NUM]; ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ local_max[j] = -1e20f; ++ } ++ ++ for (int i = 0; blockDim.x * i + threadIdx.x < (trgt_seq_len / 2) && i < ITEMS_PER_THREAD; i++) { ++ int mask_offset[NUM]; ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ qk_offset[j] = ((blockIdx.y * head_num + blockIdx.z) * seq_len + seq_id + j * gridDim.x) * (trgt_seq_len / 2) ++ + blockDim.x * i + threadIdx.x; ++ mask_offset[j] = ++ (blockIdx.y * seq_len + seq_id + j * gridDim.x) * (trgt_seq_len / 2) + blockDim.x * i + threadIdx.x; ++ } ++ ++ T2 mask_val[NUM]; ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ mask_val[j] = ldg(&attr_mask_half2[mask_offset[j]]); ++ } ++ ++ T2 qk[NUM]; ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ qk[j] = qk_buf_half2[qk_offset[j]]; ++ } ++ ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ mask_val[j] = hmul2(hsub2(float2type2(1.0f), mask_val[j]), float2type2(-10000.0f)); ++ } ++ ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ data[j][i] = hadd2(hmul2(qk[j], type2type2(scalar)), mask_val[j]); ++ local_max[j] = fmax(local_max[j], fmax((float)data[j][i].x, (float)data[j][i].y)); ++ } ++ } ++ ++ if (blockDim.x <= 32) { ++ warpReduceMaxV2(local_max); ++ } ++ else { ++ blockReduceMaxV2(local_max); ++ } ++ ++ if (threadIdx.x == 0) { ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ s_max[j] = local_max[j]; ++ } ++ } ++ __syncthreads(); ++ ++ float local_sum[NUM]; ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ local_sum[j] = {0.f}; ++ } ++ ++ for (int i = 0; blockDim.x * i + threadIdx.x < (trgt_seq_len / 2) && i < ITEMS_PER_THREAD; i++) { ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ data[j][i] = hexp2(hsub2(data[j][i], float2type2(s_max[j]))); ++ } ++ ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ local_sum[j] += (float)(data[j][i].x + data[j][i].y); ++ } ++ } ++ ++ if (blockDim.x <= 32) { ++ warpReduceSumV2(local_sum); ++ } ++ else { ++ blockReduceSumV2(local_sum); ++ } ++ ++ if (threadIdx.x == 0) { ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ s_sum[j] = __fdividef(1.0f, local_sum[j] + 1e-6f); ++ } ++ } ++ __syncthreads(); ++ ++ for (int i = 0; blockDim.x * i + threadIdx.x < (trgt_seq_len / 2) && i < ITEMS_PER_THREAD; i++) { ++#pragma unroll ++ 
for (int j = 0; j < NUM; j++) { ++ qk_offset[j] = ((blockIdx.y * head_num + blockIdx.z) * seq_len + seq_id + j * gridDim.x) * (trgt_seq_len / 2) ++ + blockDim.x * i + threadIdx.x; ++ } ++ ++#pragma unroll ++ for (int j = 0; j < NUM; j++) { ++ qk_buf_half2[qk_offset[j]] = hmul2(data[j][i], float2type2(s_sum[j])); ++ } ++ } ++ } ++} ++ + #define SOFTMAX_KERNEL(ITEMS_PER_THREAD) \ + block.x /= ITEMS_PER_THREAD; \ + assert(block.x <= 1024); \ +@@ -434,6 +716,50 @@ __global__ void softmax_kernel_v5_half2( + <<>>(buffer, buffer_src, attr_mask, batch_size, head_num, seq_len, scalar); \ + } + ++#define SOFTMAX_CROSS_KERNEL(ITEMS_PER_THREAD) \ ++ block.x /= ITEMS_PER_THREAD; \ ++ assert(block.x <= 1024); \ ++ if (is_half2) { \ ++ if (grid.x % 4 == 0) { \ ++ grid.x /= 4; \ ++ softmax_cross_kernel_v5_half2<<>>( \ ++ (half*)buffer, (const half*)attr_mask, batch_size, head_num, seq_len, trgt_seq_len, \ ++ (const half)scalar); \ ++ } \ ++ else { \ ++ softmax_cross_kernel_v4_half2<<>>( \ ++ (half*)buffer, (const half*)attr_mask, batch_size, head_num, seq_len, trgt_seq_len, \ ++ (const half)scalar); \ ++ } \ ++ } \ ++ else { \ ++ softmax_cross_kernel_v4 \ ++ <<>>(buffer, buffer_src, attr_mask, batch_size, head_num, seq_len, \ ++ trgt_seq_len, scalar); \ ++ } ++ ++#define SOFTMAX_MIX_KERNEL(ITEMS_PER_THREAD) \ ++ block.x /= ITEMS_PER_THREAD; \ ++ assert(block.x <= 1024); \ ++ if (is_half2) { \ ++ if (grid.x % 4 == 0) { \ ++ grid.x /= 4; \ ++ softmax_cross_kernel_v5_half2<<>>( \ ++ (half*)io_buffer, (const half*)attr_mask, batch_size, head_num, seq_len, trgt_seq_len, \ ++ (const half)scalar); \ ++ } \ ++ else { \ ++ softmax_cross_kernel_v4_half2<<>>( \ ++ (half*)io_buffer, (const half*)attr_mask, batch_size, head_num, seq_len, trgt_seq_len, \ ++ (const half)scalar); \ ++ } \ ++ } \ ++ else { \ ++ softmax_mix_kernel_v4 \ ++ <<>>(io_buffer, attr_mask, batch_size, head_num, seq_len, \ ++ trgt_seq_len, scalar); \ ++ } ++ + #ifdef ENABLE_BF16 + #define SOFTMAX_KERNEL_BF16(ITEMS_PER_THREAD) \ + block.x /= ITEMS_PER_THREAD; \ +@@ -501,6 +827,80 @@ void invokeMaskedSoftMax(T* buffer, + } + } + ++template ++void invokeCrossMaskedSoftMax(T* buffer, ++ const T_IN* buffer_src, ++ const T* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int trgt_seq_len, ++ const int head_num, ++ const T scalar, ++ cudaStream_t stream) ++{ ++ ++ dim3 grid(seq_len, batch_size, head_num); ++ if (batch_size * head_num > 360) { ++ grid.x = ceil(float(seq_len) / 32.0f); ++ } ++ ++ bool is_half2 = sizeof(T) == 2 && sizeof(T_IN) == 2 && trgt_seq_len % 2 == 0; ++ dim3 block((trgt_seq_len / (is_half2 ? 2 : 1) + 31) / 32 * 32); ++ ++ if (block.x > 3072 && block.x <= 4096) { ++ SOFTMAX_CROSS_KERNEL(4) ++ } ++ if (block.x > 2048) { ++ SOFTMAX_CROSS_KERNEL(3) ++ } ++ else if (block.x > 1024) { ++ SOFTMAX_CROSS_KERNEL(2) ++ } ++ else if (block.x > 0) { ++ SOFTMAX_CROSS_KERNEL(1) ++ } ++ else { ++ FT_CHECK(trgt_seq_len <= 4096 || seq_len <= 4096); ++ } ++} ++ ++ ++template ++void invokeMixMaskedSoftMax(T* io_buffer, ++ const T_M* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int trgt_seq_len, ++ const int head_num, ++ const T scalar, ++ cudaStream_t stream) ++{ ++ ++ dim3 grid(seq_len, batch_size, head_num); ++ if (batch_size * head_num > 360) { ++ grid.x = ceil(float(seq_len) / 32.0f); ++ } ++ ++ bool is_half2 = sizeof(T) == 2 && sizeof(T_M) == 2 && trgt_seq_len % 2 == 0; ++ dim3 block((trgt_seq_len / (is_half2 ? 
2 : 1) + 31) / 32 * 32); ++ ++ if (block.x > 3072 && block.x <= 4096) { ++ SOFTMAX_MIX_KERNEL(4) ++ } ++ if (block.x > 2048) { ++ SOFTMAX_MIX_KERNEL(3) ++ } ++ else if (block.x > 1024) { ++ SOFTMAX_MIX_KERNEL(2) ++ } ++ else if (block.x > 0) { ++ SOFTMAX_MIX_KERNEL(1) ++ } ++ else { ++ FT_CHECK(trgt_seq_len <= 4096 || seq_len <= 4096); ++ } ++} ++ + #ifdef ENABLE_BF16 + template<> + void invokeMaskedSoftMax(__nv_bfloat16* buffer, +@@ -574,13 +974,78 @@ void invokeMaskedSoftMax(__nv_bfloat16* buffer, + FT_CHECK(seq_len <= 4096); + } + } ++ ++template<> ++void invokeCrossMaskedSoftMax(__nv_bfloat16* buffer, ++ const __nv_bfloat16* buffer_src, ++ const __nv_bfloat16* attr_mask, ++ const int batch_size, ++ const int seq_len, const int trgt_seq_len, ++ const int head_num, ++ const __nv_bfloat16 scalar, ++ cudaStream_t stream) {;} ++ ++template<> ++void invokeCrossMaskedSoftMax(__nv_bfloat16* buffer, ++ const float* buffer_src, ++ const __nv_bfloat16* attr_mask, ++ const int batch_size, ++ const int seq_len, const int trgt_seq_len, ++ const int head_num, ++ const __nv_bfloat16 scalar, ++ cudaStream_t stream) {;} + #endif // ENABLE_BF16 + +-template void invokeMaskedSoftMax(float* buffer, ++template void invokeMixMaskedSoftMax(float* io_buffer, ++ const float* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const float scalar, ++ cudaStream_t stream); ++ ++template void invokeMixMaskedSoftMax(half* io_buffer, ++ const half* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const half scalar, ++ cudaStream_t stream); ++ ++template void invokeMixMaskedSoftMax(float* io_buffer, ++ const half* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const float scalar, ++ cudaStream_t stream); ++ ++template void invokeMixMaskedSoftMax(half* io_buffer, ++ const float* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const half scalar, ++ cudaStream_t stream); ++ ++ template void invokeMaskedSoftMax(float* buffer, ++ const float* buffer_src, ++ const float* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int head_num, ++ const float scalar, ++ cudaStream_t stream); ++ ++template void invokeCrossMaskedSoftMax(float* buffer, + const float* buffer_src, + const float* attr_mask, + const int batch_size, +- const int seq_len, ++ const int seq_len, const int trgt_seq_len, + const int head_num, + const float scalar, + cudaStream_t stream); +@@ -594,6 +1059,15 @@ template void invokeMaskedSoftMax(half* buffer, + const half scalar, + cudaStream_t stream); + ++template void invokeCrossMaskedSoftMax(half* buffer, ++ const float* buffer_src, ++ const half* attr_mask, ++ const int batch_size, ++ const int seq_len, const int trgt_seq_len, ++ const int head_num, ++ const half scalar, ++ cudaStream_t stream); ++ + template void invokeMaskedSoftMax(half* buffer, + const half* buffer_src, + const half* attr_mask, +@@ -603,6 +1077,15 @@ template void invokeMaskedSoftMax(half* buffer, + const half scalar, + cudaStream_t stream); + ++template void invokeCrossMaskedSoftMax(half* buffer, ++ const half* buffer_src, ++ const half* attr_mask, ++ const int batch_size, ++ const int seq_len, const int trgt_seq_len, ++ const int head_num, ++ const half scalar, ++ cudaStream_t stream); ++ + #ifdef ENABLE_BF16 + template void invokeMaskedSoftMax(__nv_bfloat16* buffer, + const 
__nv_bfloat16* buffer_src, +@@ -621,6 +1104,25 @@ template void invokeMaskedSoftMax(__nv_bfloat16* buffer, + const int head_num, + const __nv_bfloat16 scalar, + cudaStream_t stream); ++ ++template void invokeCrossMaskedSoftMax(__nv_bfloat16* buffer, ++ const __nv_bfloat16* buffer_src, ++ const __nv_bfloat16* attr_mask, ++ const int batch_size, ++ const int seq_len, const int trgt_seq_len, ++ const int head_num, ++ const __nv_bfloat16 scalar, ++ cudaStream_t stream); ++ ++template void invokeCrossMaskedSoftMax(__nv_bfloat16* buffer, ++ const float* buffer_src, ++ const __nv_bfloat16* attr_mask, ++ const int batch_size, ++ const int seq_len, const int trgt_seq_len, ++ const int head_num, ++ const __nv_bfloat16 scalar, ++ cudaStream_t stream); ++ + #endif // ENABLE_BF16 + + template +@@ -726,9 +1228,10 @@ void invokeTransposeQKV(T* dst, + seq_per_block *= 2; + } + +- FT_CHECK(grid.x * seq_per_block == batch_size * head_num * seq_len); ++ FT_CHECK((int)(grid.x * seq_per_block) == batch_size * head_num * seq_len); + +- if (seq_per_block * size_per_head % 2 == 0) { ++ // if (seq_per_block * size_per_head % 2 == 0) { ++ if (size_per_head % 2 == 0) { + block.x = seq_per_block * size_per_head / 2; + if (std::is_same::value) { + transpose<<>>( +@@ -1061,12 +1564,12 @@ template void invokeTransposeAttentionOutRemovePadding(half* src, + const int* mask_offset, + cudaStream_t stream); + +-template ++template + __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, + T* k_buf, + T* v_buf, + const T* __restrict QKV, +- const T* __restrict qkv_bias, ++ const U* __restrict qkv_bias, + const int batch_size, + const int seq_len, + const int head_num, +@@ -1081,7 +1584,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, + for (int index = blockDim.x * blockIdx.x + threadIdx.x; index < batch_size * seq_len * 3 * n; + index += gridDim.x * blockDim.x) { + int bias_id = index % (3 * n); +- T val = ldg(&QKV[index]) + ldg(&qkv_bias[bias_id]); ++ T val = ldg(&QKV[index]) + (T)ldg(&qkv_bias[bias_id]); + + int tmp_index = index; + const int target_batch_id = tmp_index / (seq_len * 3 * n); +@@ -1116,12 +1619,12 @@ struct Vec_t<__nv_bfloat16> { + }; + #endif + +-template ++template + __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, + T* k_buf, + T* v_buf, + const T* __restrict QKV, +- const T* __restrict qkv_bias, ++ const U* __restrict qkv_bias, + const int batch_size, + const int seq_len, + const int head_num, +@@ -1170,12 +1673,12 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf, + *reinterpret_cast(&v_buf[dest_idx]) = v; + } + +-template ++template + void invokeAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, +- const T* qkv_bias, ++ const U* qkv_bias, + const int batch_size, + const int seq_len, + const int head_num, +@@ -1200,6 +1703,155 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, + } + } + ++ ++template ++__global__ void invokeCrossAddFusedQKVBiasTransposeQ(T* q_buf, ++ const T* __restrict QKV, ++ const U* __restrict qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int head_num, ++ const int size_per_head) ++{ ++ // QKV: [m, 1, n] ++ // qkv_bias: [1, n] ++ // q_buf: [batch, head_num, seq_len, size_per_head] ++ ++ T* qkv_ptr[1] = {q_buf}; ++ const int n = head_num * size_per_head; ++ for (int index = blockDim.x * blockIdx.x + threadIdx.x; index < batch_size * seq_len * 1 * n; ++ index += gridDim.x * blockDim.x) { ++ int bias_id = index % (1 * n); ++ T val = ldg(&QKV[index]) + (T)ldg(&qkv_bias[bias_id]); ++ ++ int tmp_index = 
index; ++ const int target_batch_id = tmp_index / (seq_len * 1 * n); ++ tmp_index -= target_batch_id * seq_len * 1 * n; ++ const int seq_id = tmp_index / (1 * n); ++ tmp_index -= seq_id * 1 * n; ++ const int qkv_id = tmp_index / n; ++ tmp_index -= qkv_id * n; ++ const int head_id = tmp_index / size_per_head; ++ const int size_id = tmp_index - head_id * size_per_head; ++ ++ qkv_ptr[qkv_id][target_batch_id * head_num * seq_len * size_per_head + head_id * seq_len * size_per_head ++ + seq_id * size_per_head + size_id] = val; ++ } ++} ++ ++template ++__global__ void invokeCrossAddFusedQKVBiasTransposeKV(T* k_buf, T* v_buf, ++ const T* __restrict QKV, ++ const U* __restrict qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int head_num, ++ const int size_per_head) ++{ ++ // QKV: [m, 2, n] ++ // qkv_bias: [2, n] ++ // k_buf, v_buf: [batch, head_num, seq_len, size_per_head] ++ ++ T* qkv_ptr[2] = {k_buf, v_buf}; ++ const int n = head_num * size_per_head; ++ for (int index = blockDim.x * blockIdx.x + threadIdx.x; index < batch_size * seq_len * 2 * n; ++ index += gridDim.x * blockDim.x) { ++ int bias_id = index % (2 * n); ++ T val = ldg(&QKV[index]) + (T)ldg(&qkv_bias[bias_id]); ++ ++ int tmp_index = index; ++ const int target_batch_id = tmp_index / (seq_len * 2 * n); ++ tmp_index -= target_batch_id * seq_len * 2 * n; ++ const int seq_id = tmp_index / (2 * n); ++ tmp_index -= seq_id * 2 * n; ++ const int qkv_id = tmp_index / n; ++ tmp_index -= qkv_id * n; ++ const int head_id = tmp_index / size_per_head; ++ const int size_id = tmp_index - head_id * size_per_head; ++ //printf("%d %d\n", head_id, size_id); ++ qkv_ptr[qkv_id][target_batch_id * head_num * seq_len * size_per_head + head_id * seq_len * size_per_head ++ + seq_id * size_per_head + size_id] = val; ++ } ++} ++ ++template ++void invokeCrossAddFusedQKVBiasTranspose(T* q_buf, ++ T* k_buf, ++ T* v_buf, ++ T* QKV, ++ const U* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const int size_per_head, ++ cudaStream_t stream) ++{ ++ ++ const int m = batch_size * seq_len; ++ const int n = head_num * size_per_head; ++ dim3 block(384); ++ dim3 grid((int)(ceil(1.0 * m * n / 384))); ++ invokeCrossAddFusedQKVBiasTransposeQ<<>>( ++ q_buf, QKV, qkv_bias, batch_size, seq_len, head_num, size_per_head); ++ ++ const int m2 = batch_size * tgt_seq_len; ++ const int n2 = head_num * size_per_head; ++ dim3 block2(384); ++ dim3 grid2((int)(ceil(1.0 * m2 * n2 / 384))); ++ invokeCrossAddFusedQKVBiasTransposeKV<<>>( ++ k_buf, v_buf, QKV + m * n, qkv_bias + n2, batch_size, tgt_seq_len, head_num, size_per_head); ++ ++} ++ ++template void invokeCrossAddFusedQKVBiasTranspose(float* q_buf, ++ float* k_buf, ++ float* v_buf, ++ float* QKV, ++ const float* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const int size_per_head, ++ cudaStream_t stream); ++ ++template void invokeCrossAddFusedQKVBiasTranspose(half* q_buf, ++ half* k_buf, ++ half* v_buf, ++ half* QKV, ++ const half* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const int size_per_head, ++ cudaStream_t stream); ++ ++template void invokeCrossAddFusedQKVBiasTranspose(float* q_buf, ++ float* k_buf, ++ float* v_buf, ++ float* QKV, ++ const half* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const int size_per_head, ++ cudaStream_t stream); ++ ++template void 
invokeCrossAddFusedQKVBiasTranspose(half* q_buf, ++ half* k_buf, ++ half* v_buf, ++ half* QKV, ++ const float* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const int size_per_head, ++ cudaStream_t stream); ++ + template void invokeAddFusedQKVBiasTranspose(float* q_buf, + float* k_buf, + float* v_buf, +@@ -1224,6 +1876,30 @@ template void invokeAddFusedQKVBiasTranspose(half* q_buf, + const int rotary_embedding_dim, + cudaStream_t stream); + ++template void invokeAddFusedQKVBiasTranspose(float* q_buf, ++ float* k_buf, ++ float* v_buf, ++ float* QKV, ++ const half* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int head_num, ++ const int size_per_head, ++ const int rotary_embedding_dim, ++ cudaStream_t stream); ++ ++template void invokeAddFusedQKVBiasTranspose(half* q_buf, ++ half* k_buf, ++ half* v_buf, ++ half* QKV, ++ const float* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int head_num, ++ const int size_per_head, ++ const int rotary_embedding_dim, ++ cudaStream_t stream); ++ + #ifdef ENABLE_BF16 + template void invokeAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, + __nv_bfloat16* k_buf, +@@ -1236,6 +1912,19 @@ template void invokeAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, + const int size_per_head, + const int rotary_embedding_dim, + cudaStream_t stream); ++ ++template void invokeCrossAddFusedQKVBiasTranspose(__nv_bfloat16* q_buf, ++ __nv_bfloat16* k_buf, ++ __nv_bfloat16* v_buf, ++ __nv_bfloat16* QKV, ++ const __nv_bfloat16* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const int size_per_head, ++ cudaStream_t stream); ++ + #endif + + template +diff --git a/src/fastertransformer/kernels/unfused_attention_kernels.h b/src/fastertransformer/kernels/unfused_attention_kernels.h +index be8b178..21d9a62 100644 +--- a/src/fastertransformer/kernels/unfused_attention_kernels.h ++++ b/src/fastertransformer/kernels/unfused_attention_kernels.h +@@ -42,6 +42,26 @@ void invokeMaskedSoftMax(T* buffer, + const int head_num, + const T scalar, + cudaStream_t stream); ++template ++void invokeCrossMaskedSoftMax(T* buffer, ++ const T_IN* buffer_src, ++ const T* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const T scalar, ++ cudaStream_t stream); ++ ++template ++void invokeMixMaskedSoftMax(T* io_buffer, ++ const T_M* attr_mask, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const T scalar, ++ cudaStream_t stream); + + template + void invokeTransposeQKV(T* dst, +@@ -81,12 +101,12 @@ void invokeTransposeAttentionOutRemovePadding(T* src, + const int* mask_offset, + cudaStream_t stream); + +-template ++template + void invokeAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, +- const T* qkv_bias, ++ const U* qkv_bias, + const int batch_size, + const int seq_len, + const int head_num, +@@ -97,12 +117,29 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf, + q_buf, k_buf, v_buf, QKV, qkv_bias, batch_size, seq_len, head_num, size_per_head, 0, stream); + } + +-template ++template ++void invokeCrossAddFusedQKVBiasTranspose(T* q_buf, ++ T* k_buf, ++ T* v_buf, ++ T* QKV, ++ const U* qkv_bias, ++ const int batch_size, ++ const int seq_len, ++ const int tgt_seq_len, ++ const int head_num, ++ const int size_per_head, ++ cudaStream_t stream); ++// { ++// invokeCrossAddFusedQKVBiasTranspose( ++// q_buf, k_buf, v_buf, QKV, 
qkv_bias, batch_size, seq_len, tgt_seq_len, head_num, size_per_head, stream); ++// } ++ ++template + void invokeAddFusedQKVBiasTranspose(T* q_buf, + T* k_buf, + T* v_buf, + T* QKV, +- const T* qkv_bias, ++ const U* qkv_bias, + const int batch_size, + const int seq_len, + const int head_num, +diff --git a/src/fastertransformer/layers/CMakeLists.txt b/src/fastertransformer/layers/CMakeLists.txt +index cbaf4fa..2ab5320 100644 +--- a/src/fastertransformer/layers/CMakeLists.txt ++++ b/src/fastertransformer/layers/CMakeLists.txt +@@ -30,15 +30,18 @@ set_property(TARGET FfnLayerINT8 PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET FfnLayerINT8 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(FfnLayerINT8 PUBLIC -lcublasLt -lcublas -lcudart cublasMMWrapper cublasINT8MMWrapper activation_int8_kernels memory_utils) + ++if(EXAMPLES) + add_library(TensorParallelGeluFfnLayer STATIC TensorParallelGeluFfnLayer.cc) + set_property(TARGET TensorParallelGeluFfnLayer PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET TensorParallelGeluFfnLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(TensorParallelGeluFfnLayer PUBLIC -lcudart FfnLayer nccl_utils) + ++ + add_library(TensorParallelReluFfnLayer STATIC TensorParallelReluFfnLayer.cc) + set_property(TARGET TensorParallelReluFfnLayer PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET TensorParallelReluFfnLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(TensorParallelReluFfnLayer PUBLIC -lcudart FfnLayer nccl_utils) ++endif() + + add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc) + set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON) +diff --git a/src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h b/src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h +index b21e3a7..746cb71 100644 +--- a/src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h ++++ b/src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h +@@ -62,13 +62,13 @@ AttentionType getAttentionTypeINT8( + } + } + +-template ++template + class BaseAttentionLayer: public BaseLayer { + + public: + virtual void forward(std::vector* output_tensors, + const std::vector* input_tensors, +- const AttentionWeight* attention_weights) = 0; ++ const AttentionWeight* attention_weights) = 0; + BaseAttentionLayer(cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + IAllocator* allocator, +diff --git a/src/fastertransformer/layers/attention_layers/CMakeLists.txt b/src/fastertransformer/layers/attention_layers/CMakeLists.txt +index 9cef315..7170af4 100644 +--- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt ++++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt +@@ -42,8 +42,8 @@ target_link_libraries(DecoderSelfAttentionLayer PUBLIC -lcublas -lcudart cublasM + add_library(GptContextAttentionLayer STATIC GptContextAttentionLayer.cc) + set_property(TARGET GptContextAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET GptContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +-target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels) +- ++target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels activation_kernels) ++if(EXAMPLES) + add_library(TensorParallelDecoderSelfAttentionLayer STATIC TensorParallelDecoderSelfAttentionLayer.cc) + 
set_property(TARGET TensorParallelDecoderSelfAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET TensorParallelDecoderSelfAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +@@ -63,6 +63,7 @@ add_library(TensorParallelUnfusedAttentionLayer STATIC TensorParallelUnfusedAtte + set_property(TARGET TensorParallelUnfusedAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET TensorParallelUnfusedAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(TensorParallelUnfusedAttentionLayer PUBLIC -lcudart UnfusedAttentionLayer nccl_utils) ++endif() + + add_library(WindowAttention STATIC WindowAttention.cc) + set_property(TARGET WindowAttention PROPERTY POSITION_INDEPENDENT_CODE ON) +diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc +index bada640..e606bc2 100644 +--- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc ++++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc +@@ -17,6 +17,7 @@ + + #include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" + #include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" + + namespace fastertransformer { + +@@ -49,7 +50,7 @@ void GptContextAttentionLayer::forward(std::vector + T* attention_out = (T*)output_tensors->at(0).data; + const T* attention_input = (const T*)input_tensors->at(0).data; + const T* attention_mask = (const T*)input_tensors->at(1).data; +- const bool is_final = *((bool*)(input_tensors->at(2).data)); ++ const bool is_final = false; // *((bool*)(input_tensors->at(2).data)); + + const int m = input_tensors->at(0).shape[0]; + +@@ -428,4 +429,503 @@ template class GptContextAttentionLayer; + template class GptContextAttentionLayer<__nv_bfloat16>; + #endif + ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++template ++MSMHALayer::MSMHALayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), ++ max_batch_size_(max_batch_size), ++ max_src_seq_len_(max_src_seq_len), ++ max_tgt_seq_len_(max_tgt_seq_len), ++ head_num_(head_num), ++ size_per_head_(size_per_head), ++ hidden_size_(head_num * size_per_head), ++ is_qk_buf_float_(false), // for now set to false ++ optimized_offset_(0) ++{ ++} ++ ++template ++void MSMHALayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const AttentionWeight* attention_weights) ++{ ++ // input_tensors: use 1 gemm -- multi head attention ++ // input_query [batch_size * seq_len, hidden_dimension] ++ // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++ // input_tensors: use 2 gemm -- cross attention ++ // input_query [batch_size * seq_len, hidden_dimension] ++ // enc_output [batch_size * tgt_len, hidden_dimension] ++ // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++ // output_tensors: ++ // attention_out [batch_size * seq_len, hidden_dimension] ++ // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] ++ // value_cache [batch, local_head_num, max_seq_len, size_per_head] ++ ++ ++ // validate in / out tensors ++ int 
in_tensor_number = input_tensors->size(); ++ FT_CHECK(in_tensor_number == 2 || in_tensor_number == 3); ++ FT_CHECK(output_tensors->size() == 3); ++ // FT_CHECK(isValidBatchSize(request_batch_size); ++ // FT_CHECK(isValidSrcSeqLen(request_src_seq_len); ++ // FT_CHECK(isValidTgtSeqLen(request_tgt_seq_len); ++ ++ // setup in tensors id ++ int attn_input_tensor_id = 0; ++ int encoder_out_tensor_id = (in_tensor_number == 3) ? 1 : 0; ++ int attn_mask_tensor_id = in_tensor_number - 1; // always last tensor ++ ++ const int request_batch_size = input_tensors->at(attn_mask_tensor_id).shape[0]; ++ const int request_src_seq_len = input_tensors->at(attn_mask_tensor_id).shape[2]; ++ const int request_tgt_seq_len = input_tensors->at(attn_mask_tensor_id).shape[3]; ++ ++ // alloc buffer according to current in size ++ allocateBuffer(request_batch_size, request_src_seq_len, request_tgt_seq_len); ++ sync_check_cuda_error(); ++ ++ const T* attention_input = (const T*)input_tensors->at(attn_input_tensor_id).data; ++ const T* attention_mask = (const T*)input_tensors->at(attn_mask_tensor_id).data; ++ const bool is_final = false; ++ ++ const cudaDataType_t gemm_data_type = (std::is_same::value) ? CUDA_R_32F : CUDA_R_16F; ++ const cudaDataType_t softmax_data_type = (std::is_same::value) ? CUDA_R_32F : CUDA_R_16F; ++ ++ const int m = input_tensors->at(attn_input_tensor_id).shape[0]; ++ if (input_tensors->size() == 3) { ++ // cross attention ++ Tensor encoder_output_tensor = input_tensors->at(encoder_out_tensor_id); ++ if (request_src_seq_len == request_tgt_seq_len) { ++ // if source and target seq are the same we can boost performance ++ cublas_wrapper_->Gemm(CUBLAS_OP_N, ++ CUBLAS_OP_N, ++ hidden_size_, // n ++ m, ++ hidden_size_, // k ++ attention_weights->query_weight.kernel, ++ hidden_size_, // n ++ attention_input, ++ hidden_size_, // k ++ qkv_buf_, ++ 3 * hidden_size_ /* n */); ++ ++ cublas_wrapper_->Gemm(CUBLAS_OP_N, ++ CUBLAS_OP_N, ++ 2 * hidden_size_, // n ++ encoder_output_tensor.shape[0], ++ hidden_size_, // k ++ attention_weights->key_weight.kernel, ++ 2 * hidden_size_, // n ++ encoder_output_tensor.data, ++ hidden_size_, // k ++ qkv_buf_ + hidden_size_, ++ 3 * hidden_size_ /* n */); ++ } else { ++ cublas_wrapper_->Gemm(CUBLAS_OP_N, ++ CUBLAS_OP_N, ++ hidden_size_, // n ++ m, ++ hidden_size_, // k ++ attention_weights->query_weight.kernel, ++ hidden_size_, // n ++ attention_input, ++ hidden_size_, // k ++ qkv_buf_, ++ hidden_size_ /* n */); ++ ++ cublas_wrapper_->Gemm(CUBLAS_OP_N, ++ CUBLAS_OP_N, ++ 2 * hidden_size_, // n ++ encoder_output_tensor.shape[0], ++ hidden_size_, // k ++ attention_weights->key_weight.kernel, ++ 2 * hidden_size_, // n ++ encoder_output_tensor.data, ++ hidden_size_, // k ++ qkv_buf_ + m * hidden_size_, ++ 2 * hidden_size_ /* n */); ++ } ++ } else { ++ cublas_wrapper_->Gemm(CUBLAS_OP_N, ++ CUBLAS_OP_N, ++ 3 * hidden_size_, // n ++ m, ++ hidden_size_, // k ++ attention_weights->query_weight.kernel, ++ 3 * hidden_size_, // n ++ attention_input, ++ hidden_size_, // k ++ qkv_buf_, ++ 3 * hidden_size_ /* n */); ++ } ++ sync_check_cuda_error(); ++ ++ if (request_src_seq_len == request_tgt_seq_len) { ++ invokeAddFusedQKVBiasTranspose( ++ (T*)q_buf_2_, ++ (T*)output_tensors->at(1).data, //k_buf_2_, ++ (T*)output_tensors->at(2).data, //v_buf_2_, ++ (T*)qkv_buf_, ++ (U*)attention_weights->query_weight.bias, ++ request_batch_size, ++ request_src_seq_len, ++ head_num_, ++ size_per_head_, ++ 0, ++ stream_); ++ } else { ++ invokeCrossAddFusedQKVBiasTranspose( ++ (T*)q_buf_2_, ++ 
(T*)output_tensors->at(1).data, //k_buf_2_, ++ (T*)output_tensors->at(2).data, //v_buf_2_, ++ qkv_buf_, ++ attention_weights->query_weight.bias, ++ request_batch_size, ++ request_src_seq_len, ++ request_tgt_seq_len, ++ head_num_, ++ size_per_head_, ++ stream_); ++ } ++ sync_check_cuda_error(); ++ // Use batch major ++ // put k/v_buf from shape [B, H, L, Dh] ++ // to cache [B, H, Dh/x, L, x] and [B, H, L, Dh/x, x] ++ // invokeTranspose4dBatchMajor((T*)output_tensors->at(1).data, //k_buf_2_, ++ // (T*)output_tensors->at(2).data, //v_buf_2_, ++ // (T*)output_tensors->at(1).data, //k_buf_2_, ++ // (T*)output_tensors->at(2).data, //v_buf_2_, ++ // request_batch_size, ++ // request_tgt_seq_len, //request_seq_len, ++ // request_tgt_seq_len, //max_seq_len, ++ // size_per_head_, ++ // head_num_, ++ // stream_); ++ // sync_check_cuda_error(); ++ ++ if (is_final == false) { ++ const cudaDataType_t gemm_data_type = getCudaDataType(); ++ if (is_qk_buf_float_ == true && gemm_data_type != CUDA_R_32F) { ++ // cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, ++ // CUBLAS_OP_N, ++ // request_seq_len, ++ // request_seq_len, ++ // size_per_head_, ++ // 1.0f, ++ // (T*)output_tensors->at(1).data, // k_buf_2_, ++ // gemm_data_type, ++ // size_per_head_, ++ // request_seq_len * size_per_head_, ++ // q_buf_2_, ++ // gemm_data_type, ++ // size_per_head_, ++ // request_seq_len * size_per_head_, ++ // 0.0f, ++ // qk_buf_float_, ++ // CUDA_R_32F, ++ // request_seq_len, ++ // request_seq_len * request_seq_len, ++ // request_batch_size * head_num_, ++ // CUDA_R_32F); ++ // sync_check_cuda_error(); ++ // T scalar = 1 / sqrtf(size_per_head_ * 1.0f); ++ // invokeMaskedSoftMax(qk_buf_, ++ // qk_buf_float_, ++ // attention_mask, ++ // request_batch_size, ++ // request_seq_len, ++ // head_num_, ++ // scalar, ++ // stream_); ++ // sync_check_cuda_error(); ++ } ++ else { ++ cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T, ++ CUBLAS_OP_N, ++ request_tgt_seq_len, ++ request_src_seq_len, ++ size_per_head_, ++ 1.0f, ++ (T*)output_tensors->at(1).data, // k_buf_2_, ++ gemm_data_type, ++ size_per_head_, ++ request_tgt_seq_len * size_per_head_, ++ q_buf_2_, ++ gemm_data_type, ++ size_per_head_, ++ request_src_seq_len * size_per_head_, ++ 0.0f, ++ qk_buf_, ++ softmax_data_type, ++ request_tgt_seq_len, ++ request_src_seq_len * request_tgt_seq_len, ++ request_batch_size * head_num_, ++ CUDA_R_32F); ++ ++ S scalar = (S) (1.0f / sqrtf(size_per_head_ * 1.0f)); ++ invokeMixMaskedSoftMax(qk_buf_, ++ attention_mask, ++ request_batch_size, ++ request_src_seq_len, ++ request_tgt_seq_len, ++ head_num_, ++ scalar, ++ stream_); ++ ++ // if (request_src_seq_len == request_tgt_seq_len) { ++ // invokeMaskedSoftMax(qk_buf_, ++ // qk_buf_, ++ // attention_mask, ++ // request_batch_size, ++ // request_tgt_seq_len, ++ // head_num_, ++ // scalar, ++ // stream_); ++ // } else { ++ // invokeCrossMaskedSoftMax(qk_buf_, ++ // qk_buf_, ++ // attention_mask, ++ // request_batch_size, ++ // request_src_seq_len, request_tgt_seq_len, ++ // head_num_, ++ // scalar, ++ // stream_); ++ // } ++ sync_check_cuda_error(); ++ } ++ cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N, ++ CUBLAS_OP_N, ++ size_per_head_, ++ request_src_seq_len, ++ request_tgt_seq_len, ++ 1.0f, ++ (T*)output_tensors->at(2).data, // v_buf_2_, ++ gemm_data_type, ++ size_per_head_, ++ request_tgt_seq_len * size_per_head_, ++ qk_buf_, ++ softmax_data_type, ++ request_tgt_seq_len, ++ request_src_seq_len * request_tgt_seq_len, ++ 0.0f, ++ qkv_buf_2_, ++ gemm_data_type, ++ size_per_head_, ++ 
request_src_seq_len * size_per_head_, ++ request_batch_size * head_num_, ++ CUDA_R_32F); ++ sync_check_cuda_error(); ++ ++ ++ invokeTransposeQKV( ++ qkv_buf_3_, ++ qkv_buf_2_, ++ request_batch_size, ++ request_src_seq_len, ++ head_num_, ++ size_per_head_, ++ stream_); ++ sync_check_cuda_error(); ++// #ifdef SPARSITY_ENABLED ++// if (sparse_ && cublas_wrapper_->isUseSparse(1, hidden_size_, m_padded, local_hidden_size_)) { ++// cublas_wrapper_->SpGemm(CUBLAS_OP_N, ++// CUBLAS_OP_N, ++// hidden_size_, ++// m_padded, ++// local_hidden_size_, ++// attention_weights->attention_output_weight.sp_kernel, ++// qkv_buf_3_, ++// attention_out); ++// } ++// else { ++// #endif ++ cublas_wrapper_->Gemm(CUBLAS_OP_N, ++ CUBLAS_OP_N, ++ hidden_size_, ++ m, ++ hidden_size_, ++ attention_weights->attention_output_weight.kernel, ++ hidden_size_, ++ qkv_buf_3_, ++ hidden_size_, ++ (T*)output_tensors->at(0).data, ++ hidden_size_); ++ int len = request_batch_size * request_src_seq_len; ++ sync_check_cuda_error(); ++ invokeAddBias((T*)output_tensors->at(0).data, (const T*)attention_weights->attention_output_weight.bias, len, hidden_size_, stream_); ++ sync_check_cuda_error(); ++// #ifdef SPARSITY_ENABLED ++// } ++// #endif ++ } ++ if (is_free_buffer_after_forward_ == true) { ++ freeBuffer(); ++ } ++ sync_check_cuda_error(); ++} ++ ++template ++MSMHALayer::~MSMHALayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSMHALayer::allocateBuffer() ++{ ++ FT_CHECK(false); ++ // allocate according to max parameters ++ if (is_allocate_buffer_ == false) { ++ #if 1 ++ size_t qkv_len = getQElemNum() + getKElemNum() + getVElemNum(); ++ qkv_buf_ = reinterpret_cast(allocator_->malloc(sizeof(T) * qkv_len, true)); ++ q_buf_2_ = reinterpret_cast(allocator_->malloc(sizeof(T) * getQElemNum(), true)); ++ qk_buf_ = reinterpret_cast(allocator_->malloc(sizeof(S) *getQKElemNum(), true)); ++ qkv_buf_2_ = reinterpret_cast(allocator_->malloc(sizeof(T) * getQKVElemNum(), true)); ++ qkv_buf_3_ = reinterpret_cast(allocator_->malloc(sizeof(T) * getQKVElemNum(), true)); ++ ++ // if (is_qk_buf_float_ == true) { ++ // qk_buf_float_ = (float*)allocator_->malloc( ++ // sizeof(float) * getQKElemNum(), true); ++ // } ++ #else ++ size_t buff_size = getQKElemNum() + getQElemNum(); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ optimized_offset_ = getQKVElemNum(); ++ qkv_buf_ = buf_ + optimized_offset_; ++ q_buf_2_ = buf_; ++ qk_buf_ = buf_ + optimized_offset_; ++ qkv_buf_2_ = buf_; ++ qkv_buf_3_ = buf_ + optimized_offset_; ++ #endif ++ ++ is_allocate_buffer_ = true; ++ } ++} ++ ++template ++void MSMHALayer::allocateBuffer(size_t batch_size, size_t src_seq_len, size_t tgt_seq_len) { ++ FT_LOG_DEBUG(__PRETTY_FUNCTION__); ++ #if 1 ++ size_t qkv_len = getQElemNum(batch_size, src_seq_len) + getKElemNum(batch_size, tgt_seq_len) ++ + getVElemNum(batch_size, tgt_seq_len); ++ qkv_buf_ = reinterpret_cast(allocator_->reMalloc(qkv_buf_, sizeof(T) * qkv_len, true)); ++ q_buf_2_ = reinterpret_cast(allocator_->reMalloc(q_buf_2_, sizeof(T) * getQElemNum(batch_size, src_seq_len), true)); ++ qk_buf_ = reinterpret_cast( ++ allocator_->reMalloc(qk_buf_, sizeof(S) * getQKElemNum(batch_size, src_seq_len, tgt_seq_len), true)); ++ qkv_buf_2_ = reinterpret_cast(allocator_->reMalloc(qkv_buf_2_, sizeof(T) * getQKVElemNum(batch_size, src_seq_len), true)); ++ qkv_buf_3_ = reinterpret_cast(allocator_->reMalloc(qkv_buf_3_, sizeof(T) * getQKVElemNum(batch_size, src_seq_len), true)); ++ ++ // if (is_qk_buf_float_ == 
true) { ++ // qk_buf_float_ = (float*)allocator_->reMalloc( ++ // qk_buf_float_, sizeof(float) * getQKElemNum(batch_size, size_t src_seq_len, size_t tgt_seq_len), true); ++ // } ++ #else ++ size_t buff_size = getQKElemNum(batch_size, src_seq_len, tgt_seq_len) + getQKVElemNum(batch_size, src_seq_len); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ optimized_offset_ = getQKVElemNum(batch_size, src_seq_len); ++ qkv_buf_ = buf_ + optimized_offset_; ++ q_buf_2_ = buf_; ++ qk_buf_ = buf_ + optimized_offset_; ++ qkv_buf_2_ = buf_; ++ qkv_buf_3_ = buf_ + optimized_offset_; ++ #endif ++ is_allocate_buffer_ = true; ++} ++ ++template ++void MSMHALayer::freeBuffer() { ++ if (is_allocate_buffer_) { ++ FT_LOG_DEBUG(__PRETTY_FUNCTION__); ++ allocator_->free(qkv_buf_); ++ allocator_->free(q_buf_2_); ++ allocator_->free(qk_buf_); ++ allocator_->free(qkv_buf_2_); ++ allocator_->free(qkv_buf_3_); ++ // if (is_qk_buf_float_ == true) { ++ // allocator_->free(qk_buf_float_); ++ // } ++ is_allocate_buffer_ = false; ++ } ++} ++ ++template ++bool MSMHALayer::isValidBatchSize(size_t batch_size) { ++ if (batch_size <= max_batch_size_) { ++ return true; ++ } else { ++ freeBuffer(); ++ max_batch_size_ = batch_size * 1.2; ++ return true; ++ } ++} ++ ++template ++bool MSMHALayer::isValidSrcSeqLen(size_t seq_len) { ++ if (seq_len <= max_src_seq_len_) { ++ return true; ++ } else { ++ freeBuffer(); ++ max_src_seq_len_ = seq_len * 1.2; ++ return true; ++ } ++} ++ ++template ++bool MSMHALayer::isValidTgtSeqLen(size_t seq_len) { ++ if (seq_len <= max_tgt_seq_len_) { ++ return true; ++ } else { ++ freeBuffer(); ++ max_tgt_seq_len_ = seq_len * 1.2; ++ return true; ++ } ++} ++ ++ ++template class MSMHALayer; ++template class MSMHALayer; ++template class MSMHALayer; ++template class MSMHALayer; ++template class MSMHALayer; ++template class MSMHALayer; ++template class MSMHALayer; ++template class MSMHALayer; ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + } // namespace fastertransformer +diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h +index 92e2175..df67c9a 100644 +--- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h ++++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h +@@ -107,4 +107,205 @@ public: + const AttentionWeight* attention_weights) override; + }; + ++ ++template ++class MSMHALayer: public BaseAttentionLayer { ++private: ++ // buffer handling ++ size_t max_batch_size_ = 0; ++ ++ size_t max_src_seq_len_ = 0; ++ size_t max_tgt_seq_len_ = 0; ++ ++ ++ // metadata ++ const size_t head_num_; ++ const size_t size_per_head_; ++ const size_t hidden_size_; ++ ++ void allocateBuffer() override; ++ void allocateBuffer(size_t batch_size, size_t src_seq_len, size_t tgt_seq_len); ++ void freeBuffer() override; ++ bool isValidBatchSize(size_t batch_size); ++ bool isValidSrcSeqLen(size_t seq_len); ++ bool isValidTgtSeqLen(size_t seq_len); ++ ++ size_t getQElemNum(size_t b = 0, size_t s = 0) { ++ b = (b == 0) ? max_batch_size_ : b; ++ s = (s == 0) ? max_src_seq_len_ : s; ++ return b * s * hidden_size_; ++ } ++ ++ size_t getKElemNum(size_t b = 0, size_t s = 0) { ++ b = (b == 0) ? max_batch_size_ : b; ++ s = (s == 0) ? max_tgt_seq_len_ : s; ++ return b * s * hidden_size_; ++ } ++ ++ size_t getVElemNum(size_t b = 0, size_t s = 0) { ++ b = (b == 0) ? max_batch_size_ : b; ++ s = (s == 0) ? 
max_tgt_seq_len_ : s; ++ return b * s * hidden_size_; ++ } ++ ++ size_t getQKElemNum(size_t b = 0, size_t s = 0, size_t t = 0) { ++ b = (b == 0) ? max_batch_size_ : b; ++ s = (s == 0) ? max_src_seq_len_ : s; ++ t = (t == 0) ? max_tgt_seq_len_ : t; ++ ++ return b * head_num_ * s * t; ++ } ++ ++ size_t getQKVElemNum(size_t b = 0, size_t s = 0) { ++ b = (b == 0) ? max_batch_size_ : b; ++ s = (s == 0) ? max_src_seq_len_ : s; ++ ++ return b * s * hidden_size_; ++ } ++ ++ ++ using BaseAttentionLayer::is_free_buffer_after_forward_; ++ using BaseAttentionLayer::is_allocate_buffer_; ++ using BaseAttentionLayer::cublas_wrapper_; ++ using BaseAttentionLayer::allocator_; ++ ++ bool is_qk_buf_float_; ++ int optimized_offset_; ++ ++ ++protected: ++ using BaseAttentionLayer::stream_; ++ using BaseAttentionLayer::sparse_; ++ T* qkv_buf_ = nullptr; ++ T* q_buf_2_ = nullptr; ++ //T* k_buf_2_ = nullptr; ++ //T* v_buf_2_ = nullptr; ++ S* qk_buf_ = nullptr; ++ float* qk_buf_float_ = nullptr; ++ T* qkv_buf_2_ = nullptr; ++ T* qkv_buf_3_ = nullptr; ++ T* buf_ = nullptr; ++ ++public: ++ MSMHALayer(size_t batch_size, ++ size_t src_seq_len, ++ size_t tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse = false); ++ ++ MSMHALayer(MSMHALayer const& attention_layer); ++ ++ virtual ~MSMHALayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const AttentionWeight* attention_weights) override; ++}; ++ ++// template ++// class MSMixedMHALayer: public BaseAttentionLayer { ++// private: ++// // buffer handling ++// size_t max_batch_size_ = 0; ++ ++// size_t max_src_seq_len_ = 0; ++// size_t max_tgt_seq_len_ = 0; ++ ++ ++// // metadata ++// const size_t head_num_; ++// const size_t size_per_head_; ++// const size_t hidden_size_; ++ ++// void allocateBuffer() override; ++// void allocateBuffer(size_t batch_size, size_t src_seq_len, size_t tgt_seq_len); ++// void freeBuffer() override; ++// bool isValidBatchSize(size_t batch_size); ++// bool isValidSrcSeqLen(size_t seq_len); ++// bool isValidTgtSeqLen(size_t seq_len); ++ ++// size_t getQElemNum(size_t b = 0, size_t s = 0) { ++// b = (b == 0) ? max_batch_size_ : b; ++// s = (s == 0) ? max_src_seq_len_ : s; ++// return b * s * hidden_size_; ++// } ++ ++// size_t getKElemNum(size_t b = 0, size_t s = 0) { ++// b = (b == 0) ? max_batch_size_ : b; ++// s = (s == 0) ? max_tgt_seq_len_ : s; ++// return b * s * hidden_size_; ++// } ++ ++// size_t getVElemNum(size_t b = 0, size_t s = 0) { ++// b = (b == 0) ? max_batch_size_ : b; ++// s = (s == 0) ? max_tgt_seq_len_ : s; ++// return b * s * hidden_size_; ++// } ++ ++// size_t getQKElemNum(size_t b = 0, size_t s = 0, size_t t = 0) { ++// b = (b == 0) ? max_batch_size_ : b; ++// s = (s == 0) ? max_src_seq_len_ : s; ++// t = (t == 0) ? max_tgt_seq_len_ : t; ++ ++// return b * head_num_ * s * t; ++// } ++ ++// size_t getQKVElemNum(size_t b = 0, size_t s = 0) { ++// b = (b == 0) ? max_batch_size_ : b; ++// s = (s == 0) ? 
max_src_seq_len_ : s; ++ ++// return b * s * hidden_size_; ++// } ++ ++ ++// using BaseAttentionLayer::is_free_buffer_after_forward_; ++// using BaseAttentionLayer::is_allocate_buffer_; ++// using BaseAttentionLayer::cublas_wrapper_; ++// using BaseAttentionLayer::allocator_; ++ ++// bool is_qk_buf_float_; ++// int optimized_offset_; ++ ++ ++// protected: ++// using BaseAttentionLayer::stream_; ++// using BaseAttentionLayer::sparse_; ++// T* qkv_buf_ = nullptr; ++// T* q_buf_2_ = nullptr; ++// //T* k_buf_2_ = nullptr; ++// //T* v_buf_2_ = nullptr; ++// T* qk_buf_ = nullptr; ++// float* qk_buf_float_ = nullptr; ++// T* qkv_buf_2_ = nullptr; ++// T* qkv_buf_3_ = nullptr; ++// T* buf_ = nullptr; ++ ++// public: ++// MSMixedMHALayer(size_t batch_size, ++// size_t src_seq_len, ++// size_t tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool sparse = false); ++ ++// MSMixedMHALayer(MSMixedMHALayer const& attention_layer); ++ ++// virtual ~MSMHALayer(); ++ ++// void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const AttentionWeight* attention_weights) override; ++// }; ++ + } // namespace fastertransformer +diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt +index af33e76..21efb6d 100644 +--- a/src/fastertransformer/models/CMakeLists.txt ++++ b/src/fastertransformer/models/CMakeLists.txt +@@ -21,7 +21,9 @@ add_subdirectory(xlnet) + + add_subdirectory(t5) + add_subdirectory(gptj) +-add_subdirectory(multi_gpu_gpt) ++if(EXAMPLES) ++ add_subdirectory(multi_gpu_gpt) ++endif() + add_subdirectory(swin) + add_subdirectory(swin_int8) + add_subdirectory(vit) +diff --git a/src/fastertransformer/models/gptj/CMakeLists.txt b/src/fastertransformer/models/gptj/CMakeLists.txt +index d7d9d3e..e69a988 100644 +--- a/src/fastertransformer/models/gptj/CMakeLists.txt ++++ b/src/fastertransformer/models/gptj/CMakeLists.txt +@@ -19,6 +19,7 @@ set_property(TARGET GptJDecoderLayerWeight PROPERTY POSITION_INDEPENDENT_CODE O + set_property(TARGET GptJDecoderLayerWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(GptJDecoderLayerWeight PUBLIC memory_utils) + ++if(off) + add_library(GptJDecoder STATIC GptJDecoder.cc) + set_property(TARGET GptJDecoder PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET GptJDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +@@ -40,12 +41,14 @@ target_link_libraries(GptJContextDecoder PUBLIC -lcudart cublasMMWrapper + add_residual_kernels + gpt_kernels + nccl_utils) ++endif() + + add_library(GptJWeight STATIC GptJWeight.cc) + set_property(TARGET GptJWeight PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET GptJWeight PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(GptJWeight PUBLIC GptJDecoderLayerWeight) + ++if(off) + add_library(GptJ STATIC GptJ.cc) + set_property(TARGET GptJ PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET GptJ PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +@@ -58,3 +61,4 @@ target_link_libraries(GptJ PUBLIC -lcudart + BaseBeamSearchLayer + bert_preprocess_kernels + GptJWeight) ++endif() +\ No newline at end of file +diff --git a/src/fastertransformer/models/gptj/GptJ.cc b/src/fastertransformer/models/gptj/GptJ.cc +index 0829e0d..cfd72b8 100644 +--- a/src/fastertransformer/models/gptj/GptJ.cc ++++ b/src/fastertransformer/models/gptj/GptJ.cc +@@ -928,7 
+928,7 @@ void GptJ::forward(std::unordered_map* output_tensors,
+         if (output_tensors->count("output_log_probs") > 0
+             && output_tensors->at("output_log_probs").data != nullptr) {
+             ftNcclSend(output_tensors->at("output_log_probs").getPtr(),
+-                       batch_size * beam_width * input_tensors->at("max_output_seq_len").getVal(),
++                       output_tensors->at("output_log_probs").size(),
+                        0,
+                        pipeline_para_,
+                        stream_);
+@@ -958,7 +958,7 @@ void GptJ::forward(std::unordered_map* output_tensors,
+         if (output_tensors->count("output_log_probs") > 0
+             && output_tensors->at("output_log_probs").data != nullptr) {
+             ftNcclRecv(output_tensors->at("output_log_probs").getPtr(),
+-                       batch_size * beam_width * input_tensors->at("max_output_seq_len").getVal(),
++                       output_tensors->at("output_log_probs").size(),
+                        pipeline_para_.world_size_ - 1,
+                        pipeline_para_,
+                        stream_);
+diff --git a/src/fastertransformer/models/multi_gpu_gpt/CMakeLists.txt b/src/fastertransformer/models/multi_gpu_gpt/CMakeLists.txt
+index 10b9e0b..86d733f 100644
+--- a/src/fastertransformer/models/multi_gpu_gpt/CMakeLists.txt
++++ b/src/fastertransformer/models/multi_gpu_gpt/CMakeLists.txt
+@@ -37,7 +37,7 @@ set_property(TARGET ParallelGptDecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+ target_link_libraries(ParallelGptDecoder PUBLIC -lcudart TensorParallelGeluFfnLayer
+                       TensorParallelDecoderSelfAttentionLayer layernorm_kernels
+                       add_residual_kernels nccl_utils)
+- 
++
+ add_library(ParallelGpt STATIC ParallelGpt.cc)
+ set_property(TARGET ParallelGpt PROPERTY POSITION_INDEPENDENT_CODE ON)
+ set_property(TARGET ParallelGpt PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+diff --git a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
+index 17f9099..d74dc6c 100644
+--- a/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
++++ b/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
+@@ -1057,7 +1057,7 @@ void ParallelGpt::forward(std::unordered_map* output_ten
+         if (output_tensors->count("output_log_probs") > 0
+             && output_tensors->at("output_log_probs").data != nullptr) {
+             ftNcclSend(output_tensors->at("output_log_probs").getPtr(),
+-                       batch_size * beam_width * input_tensors->at("max_output_seq_len").getVal(),
++                       output_tensors->at("output_log_probs").size(),
+                        0,
+                        pipeline_para_,
+                        stream_);
+@@ -1087,7 +1087,7 @@ void ParallelGpt::forward(std::unordered_map* output_ten
+         if (output_tensors->count("output_log_probs") > 0
+             && output_tensors->at("output_log_probs").data != nullptr) {
+             ftNcclRecv(output_tensors->at("output_log_probs").getPtr(),
+-                       batch_size * beam_width * input_tensors->at("max_output_seq_len").getVal(),
++                       output_tensors->at("output_log_probs").size(),
+                        pipeline_para_.world_size_ - 1,
+                        pipeline_para_,
+                        stream_);
+diff --git a/src/fastertransformer/models/t5/CMakeLists.txt b/src/fastertransformer/models/t5/CMakeLists.txt
+index 9f3455d..e75bbbd 100644
+--- a/src/fastertransformer/models/t5/CMakeLists.txt
++++ b/src/fastertransformer/models/t5/CMakeLists.txt
+@@ -14,6 +14,7 @@
+ 
+ cmake_minimum_required(VERSION 3.8)
+ 
++if(False)
+ add_library(T5Decoder STATIC T5Decoder.cc T5DecoderLayerWeight.cc)
+ set_property(TARGET T5Decoder PROPERTY POSITION_INDEPENDENT_CODE ON)
+ set_property(TARGET T5Decoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+@@ -21,6 +22,7 @@ target_link_libraries(T5Decoder PUBLIC -lcudart cublasMMWrapper TensorParallelDe
+                       TensorParallelDecoderCrossAttentionLayer TensorParallelReluFfnLayer
+                       layernorm_kernels add_residual_kernels nccl_utils memory_utils)
+ 
++
+ add_library(T5Decoding STATIC T5Decoding.cc T5DecodingWeight.cc)
+ set_property(TARGET T5Decoding PROPERTY POSITION_INDEPENDENT_CODE ON)
+ set_property(TARGET T5Decoding PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+@@ -28,6 +30,8 @@ target_link_libraries(T5Decoding PUBLIC -lcudart cublasMMWrapper T5Decoder bert_
+                       decoding_kernels DynamicDecodeLayer BaseBeamSearchLayer
+                       beam_search_topk_kernels gpt_kernels)
+ 
++
++
+ add_library(T5Encoder STATIC T5Encoder.cc T5EncoderWeight.cc T5EncoderLayerWeight.cc)
+ set_property(TARGET T5Encoder PROPERTY POSITION_INDEPENDENT_CODE ON)
+ set_property(TARGET T5Encoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+@@ -36,4 +40,5 @@ target_link_libraries(T5Encoder PUBLIC -lcudart bert_preprocess_kernels cublasMM
+                       TensorParallelGeluFfnLayer layernorm_kernels add_residual_kernels nccl_utils)
+ 
+ add_executable(t5_gemm t5_gemm.cc)
+-target_link_libraries(t5_gemm PUBLIC -lcudart t5_gemm_func memory_utils)
+\ No newline at end of file
++target_link_libraries(t5_gemm PUBLIC -lcudart t5_gemm_func memory_utils)
++endif()
+\ No newline at end of file
+diff --git a/src/fastertransformer/models/t5/T5Encoder.cc b/src/fastertransformer/models/t5/T5Encoder.cc
+index 698e3d6..db989ff 100644
+--- a/src/fastertransformer/models/t5/T5Encoder.cc
++++ b/src/fastertransformer/models/t5/T5Encoder.cc
+@@ -380,7 +380,7 @@ void T5Encoder::forward(std::unordered_map* output_tenso
+                                       request_seq_len,
+                                       request_seq_len,
+                                       local_batch_size,
+-                                      hidden_units_,
++                                      d_model_,
+                                       stream_);
+         }
+         else {
+diff --git a/src/fastertransformer/utils/CMakeLists.txt b/src/fastertransformer/utils/CMakeLists.txt
+index 3d0f28a..3d2efbd 100644
+--- a/src/fastertransformer/utils/CMakeLists.txt
++++ b/src/fastertransformer/utils/CMakeLists.txt
+@@ -44,10 +44,12 @@ set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
+ set_property(TARGET memory_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+ target_link_libraries(memory_utils PUBLIC -lnvToolsExt)
+ 
++if(EXAMPLES)
+ add_library(nccl_utils STATIC nccl_utils.cc)
+ set_property(TARGET nccl_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
+ set_property(TARGET nccl_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+ target_link_libraries(nccl_utils PUBLIC -lnccl)
++endif()
+ 
+ add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
+ set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
+diff --git a/src/fastertransformer/utils/cublasMMWrapper.cc b/src/fastertransformer/utils/cublasMMWrapper.cc
+index e291151..6ddd6bd 100644
+--- a/src/fastertransformer/utils/cublasMMWrapper.cc
++++ b/src/fastertransformer/utils/cublasMMWrapper.cc
+@@ -313,6 +313,22 @@ void cublasMMWrapper::setFP16GemmConfig()
+     computeType_ = CUDA_R_32F;
+ }
+ 
++void cublasMMWrapper::setFP32MixedGemmConfig()
++{
++    Atype_ = CUDA_R_32F;
++    Btype_ = CUDA_R_16F;
++    Ctype_ = CUDA_R_32F;
++    computeType_ = CUDA_R_32F;
++}
++
++void cublasMMWrapper::setFP16MixedGemmConfig()
++{
++    Atype_ = CUDA_R_16F;
++    Btype_ = CUDA_R_32F;
++    Ctype_ = CUDA_R_32F;
++    computeType_ = CUDA_R_32F;
++}
++
+ #ifdef ENABLE_BF16
+ void cublasMMWrapper::setBF16GemmConfig()
+ {
+diff --git a/src/fastertransformer/utils/cublasMMWrapper.h b/src/fastertransformer/utils/cublasMMWrapper.h
+index 6f410ab..a2159e0 100644
+--- a/src/fastertransformer/utils/cublasMMWrapper.h
++++ b/src/fastertransformer/utils/cublasMMWrapper.h
+@@ -121,6 +121,8 @@ public:
+ 
+     void setFP32GemmConfig();
+     void setFP16GemmConfig();
++    void setFP32MixedGemmConfig();
++    void setFP16MixedGemmConfig();
+ #ifdef ENABLE_BF16
+     void setBF16GemmConfig();
+ #endif
+diff --git a/src/fastertransformer/utils/logger.h b/src/fastertransformer/utils/logger.h
+index bcdf8fa..e3e7007 100644
+--- a/src/fastertransformer/utils/logger.h
++++ b/src/fastertransformer/utils/logger.h
+@@ -65,7 +65,7 @@ private:
+ #else
+     const Level DEFAULT_LOG_LEVEL = INFO;
+ #endif
+-    Level level_ = DEFAULT_LOG_LEVEL;
++    Level level_ = ERROR; // DEFAULT_LOG_LEVEL;
+ 
+     Logger()
+     {