[NVPTX] Add ex2.approx.f16/f16x2 support

NOTE: this is a follow-up commit with the missing clang-side changes.

This patch adds builtins and intrinsics for the f16 and f16x2 variants of the ex2
instruction.

These two variants were added in PTX7.0, and are supported by sm_75 and above.

Note that this isn't wired with the exp2 llvm intrinsic because the ex2
instruction is only available in its approx variant.

Running ptxas on the assembly generated by the test f16-ex2.ll works as
expected.

Differential Revision: https://reviews.llvm.org/D119157
This commit is contained in:
Nicolas Miller 2022-03-01 10:34:02 -08:00 committed by Artem Belevich
parent a895182302
commit 510fd283fd
2 changed files with 17 additions and 0 deletions

View File

@ -282,6 +282,8 @@ BUILTIN(__nvvm_saturate_d, "dd", "")
BUILTIN(__nvvm_ex2_approx_ftz_f, "ff", "")
BUILTIN(__nvvm_ex2_approx_f, "ff", "")
BUILTIN(__nvvm_ex2_approx_d, "dd", "")
TARGET_BUILTIN(__nvvm_ex2_approx_f16, "hh", "", AND(SM_75, PTX70))
TARGET_BUILTIN(__nvvm_ex2_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))
BUILTIN(__nvvm_lg2_approx_ftz_f, "ff", "")
BUILTIN(__nvvm_lg2_approx_f, "ff", "")

View File

@ -1,4 +1,9 @@
// REQUIRES: nvptx-registered-target
//
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
// RUN: sm_75 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \
// RUN: -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM75 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
// RUN: sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \
@ -32,6 +37,16 @@
#define __device__ __attribute__((device))
__device__ void nvvm_ex2_sm75() {
#if __CUDA_ARCH__ >= 750
// CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16
__nvvm_ex2_approx_f16(0.1f16);
// CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2
__nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
#endif
// CHECK: ret void
}
// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800