[Clang][NVPTX] Add NVPTX intrinsics and builtins for CUDA PTX cp.async instructions

Adds NVPTX builtins and intrinsics for the CUDA PTX `cp.async` instructions for
`sm_80` architecture or newer.

PTX ISA description of `cp.async`:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive

Authored-by: Stuart Adams <stuart.adams@codeplay.com>
Co-Authored-by: Alexander Johnston <alexander@codeplay.com>

Differential Revision: https://reviews.llvm.org/D100394
This commit is contained in:
Stuart Adams 2021-05-17 09:28:20 -07:00 committed by Artem Belevich
parent 1417ddafdb
commit 02c2468864
7 changed files with 693 additions and 2 deletions

View File

@ -462,6 +462,29 @@ BUILTIN(__nvvm_membar_cta, "v", "")
BUILTIN(__nvvm_membar_gl, "v", "")
BUILTIN(__nvvm_membar_sys, "v", "")
// mbarrier
TARGET_BUILTIN(__nvvm_mbarrier_init, "vWi*i", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_init_shared, "vWi*3i", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_inval, "vWi*", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_inval_shared, "vWi*3", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_arrive, "WiWi*", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_arrive_shared, "WiWi*3", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete, "WiWi*i", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete_shared, "WiWi*3i", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop, "WiWi*", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_shared, "WiWi*3", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete, "WiWi*i", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete_shared, "WiWi*3i", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_test_wait, "bWi*Wi", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_test_wait_shared, "bWi*3Wi", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_mbarrier_pending_count, "iWi", "", AND(SM_80,PTX70))
// Memcpy, Memset
BUILTIN(__nvvm_memcpy, "vUc*Uc*zi","")
@ -726,6 +749,21 @@ TARGET_BUILTIN(__imma_m8n8k32_mma_s4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
// Async Copy
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive, "vWi*", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))
#undef BUILTIN
#undef TARGET_BUILTIN
#pragma pop_macro("AND")

View File

@ -1,4 +1,10 @@
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
@ -672,3 +678,80 @@ __device__ void nvvm_vote(int pred) {
__nvvm_vote_ballot(pred);
// CHECK: ret void
}
// CHECK-LABEL: nvvm_mbarrier
__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
#if __CUDA_ARCH__ >= 800
__nvvm_mbarrier_init(addr, count);
// CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
__nvvm_mbarrier_init_shared(sharedAddr, count);
// CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared
__nvvm_mbarrier_inval(addr);
// CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
__nvvm_mbarrier_inval_shared(sharedAddr);
// CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared
__nvvm_mbarrier_arrive(addr);
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
__nvvm_mbarrier_arrive_shared(sharedAddr);
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
__nvvm_mbarrier_arrive_noComplete(addr, count);
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
__nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared
__nvvm_mbarrier_arrive_drop(addr);
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
__nvvm_mbarrier_arrive_drop_shared(sharedAddr);
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
__nvvm_mbarrier_arrive_drop_noComplete(addr, count);
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
__nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared
__nvvm_mbarrier_test_wait(addr, state);
// CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
__nvvm_mbarrier_test_wait_shared(sharedAddr, state);
// CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared
__nvvm_mbarrier_pending_count(state);
// CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
#endif
// CHECK: ret void
}
// CHECK-LABEL: nvvm_async_copy
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
#if __CUDA_ARCH__ >= 800
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
__nvvm_cp_async_mbarrier_arrive(addr);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
__nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
__nvvm_cp_async_mbarrier_arrive_noinc(addr);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
__nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
__nvvm_cp_async_ca_shared_global_4(dst, src);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
__nvvm_cp_async_ca_shared_global_8(dst, src);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
__nvvm_cp_async_ca_shared_global_16(dst, src);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
__nvvm_cp_async_cg_shared_global_16(dst, src);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
__nvvm_cp_async_commit_group();
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
__nvvm_cp_async_wait_group(0);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
__nvvm_cp_async_wait_group(8);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
__nvvm_cp_async_wait_group(16);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
__nvvm_cp_async_wait_all();
#endif
// CHECK: ret void
}

View File

@ -31,7 +31,11 @@
// * llvm.nvvm.max.ull --> ibid.
// * llvm.nvvm.h2f --> llvm.convert.to.fp16.f32
def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
def llvm_global_i8ptr_ty : LLVMQualPointerType<llvm_i8_ty, 1>; // (global)i8*
def llvm_shared_i8ptr_ty : LLVMQualPointerType<llvm_i8_ty, 3>; // (shared)i8*
def llvm_i64ptr_ty : LLVMPointerType<llvm_i64_ty>; // i64*
def llvm_any_i64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
def llvm_shared_i64ptr_ty : LLVMQualPointerType<llvm_i64_ty, 3>; // (shared)i64*
//
// MISC
@ -1052,6 +1056,110 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
Intrinsic<[], [], []>;
// Async Copy
def int_nvvm_cp_async_mbarrier_arrive :
GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive">,
Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
def int_nvvm_cp_async_mbarrier_arrive_shared :
GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_shared">,
Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
def int_nvvm_cp_async_mbarrier_arrive_noinc :
GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc">,
Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
def int_nvvm_cp_async_mbarrier_arrive_noinc_shared :
GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared">,
Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
def int_nvvm_cp_async_ca_shared_global_4 :
GCCBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
[IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.4">;
def int_nvvm_cp_async_ca_shared_global_8 :
GCCBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
[IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.8">;
def int_nvvm_cp_async_ca_shared_global_16 :
GCCBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
[IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.16">;
def int_nvvm_cp_async_cg_shared_global_16 :
GCCBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
[IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.cg.shared.global.16">;
def int_nvvm_cp_async_commit_group :
GCCBuiltin<"__nvvm_cp_async_commit_group">,
Intrinsic<[],[],[]>;
def int_nvvm_cp_async_wait_group :
GCCBuiltin<"__nvvm_cp_async_wait_group">,
Intrinsic<[],[llvm_i32_ty],[ImmArg<ArgIndex<0>>]>;
def int_nvvm_cp_async_wait_all :
GCCBuiltin<"__nvvm_cp_async_wait_all">,
Intrinsic<[],[],[]>;
// mbarrier
def int_nvvm_mbarrier_init : GCCBuiltin<"__nvvm_mbarrier_init">,
Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_init_shared :
GCCBuiltin<"__nvvm_mbarrier_init_shared">,
Intrinsic<[],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_inval : GCCBuiltin<"__nvvm_mbarrier_inval">,
Intrinsic<[],[llvm_i64ptr_ty],
[IntrConvergent, IntrWriteMem, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
def int_nvvm_mbarrier_inval_shared :
GCCBuiltin<"__nvvm_mbarrier_inval_shared">,
Intrinsic<[],[llvm_shared_i64ptr_ty],
[IntrConvergent, IntrWriteMem, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
def int_nvvm_mbarrier_arrive : GCCBuiltin<"__nvvm_mbarrier_arrive">,
Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_arrive_shared :
GCCBuiltin<"__nvvm_mbarrier_arrive_shared">,
Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_arrive_noComplete :
GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete">,
Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_arrive_noComplete_shared :
GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete_shared">,
Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_arrive_drop :
GCCBuiltin<"__nvvm_mbarrier_arrive_drop">,
Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_arrive_drop_shared :
GCCBuiltin<"__nvvm_mbarrier_arrive_drop_shared">,
Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_arrive_drop_noComplete :
GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete">,
Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_arrive_drop_noComplete_shared :
GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete_shared">,
Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_test_wait :
GCCBuiltin<"__nvvm_mbarrier_test_wait">,
Intrinsic<[llvm_i1_ty],[llvm_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_test_wait_shared :
GCCBuiltin<"__nvvm_mbarrier_test_wait_shared">,
Intrinsic<[llvm_i1_ty],[llvm_shared_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
def int_nvvm_mbarrier_pending_count :
GCCBuiltin<"__nvvm_mbarrier_pending_count">,
Intrinsic<[llvm_i32_ty],[llvm_i64_ty],[IntrNoMem, IntrConvergent]>;
// Generated within nvvm. Use for ldu on sm_20 or later. Second arg is the
// pointer's alignment.
def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
@ -1141,7 +1249,7 @@ def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
// For getting the handle from a texture or surface variable
def int_nvvm_texsurf_handle
: Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyi64ptr_ty],
: Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_any_i64ptr_ty],
[IntrNoMem], "llvm.nvvm.texsurf.handle">;
def int_nvvm_texsurf_handle_internal
: Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty],

View File

@ -144,11 +144,13 @@ def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"

View File

@ -288,6 +288,211 @@ def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
//-----------------------------------
// Async Copy Functions
//-----------------------------------
multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
!strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
[(Intrin Int32Regs:$addr)]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
!strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
[(Intrin Int64Regs:$addr)]>,
Requires<[hasPTX70, hasSM80]>;
}
defm CP_ASYNC_MBARRIER_ARRIVE :
CP_ASYNC_MBARRIER_ARRIVE<"", "", int_nvvm_cp_async_mbarrier_arrive>;
defm CP_ASYNC_MBARRIER_ARRIVE_SHARED :
CP_ASYNC_MBARRIER_ARRIVE<"", ".shared", int_nvvm_cp_async_mbarrier_arrive_shared>;
defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
CP_ASYNC_MBARRIER_ARRIVE<".noinc", "", int_nvvm_cp_async_mbarrier_arrive_noinc>;
defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
!strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
[(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
!strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
[(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
Requires<[hasPTX70, hasSM80]>;
}
defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
!strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
[(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
!strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
[(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
Requires<[hasPTX70, hasSM80]>;
}
defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
def CP_ASYNC_COMMIT_GROUP :
NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
Requires<[hasPTX70, hasSM80]>;
def CP_ASYNC_WAIT_GROUP :
NVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group $n;",
[(int_nvvm_cp_async_wait_group (i32 timm:$n))]>,
Requires<[hasPTX70, hasSM80]>;
def CP_ASYNC_WAIT_ALL :
NVPTXInst<(outs), (ins), "cp.async.wait_all;",
[(int_nvvm_cp_async_wait_all)]>,
Requires<[hasPTX70, hasSM80]>;
//-----------------------------------
// MBarrier Functions
//-----------------------------------
multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr, Int32Regs:$count),
!strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
[(Intrin Int32Regs:$addr, Int32Regs:$count)]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr, Int32Regs:$count),
!strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
[(Intrin Int64Regs:$addr, Int32Regs:$count)]>,
Requires<[hasPTX70, hasSM80]>;
}
defm MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init>;
defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
int_nvvm_mbarrier_init_shared>;
multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
!strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
[(Intrin Int32Regs:$addr)]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
!strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
[(Intrin Int64Regs:$addr)]>,
Requires<[hasPTX70, hasSM80]>;
}
defm MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval>;
defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
int_nvvm_mbarrier_inval_shared>;
multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
!strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
[(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
!strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
[(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
Requires<[hasPTX70, hasSM80]>;
}
defm MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive>;
defm MBARRIER_ARRIVE_SHARED :
MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs Int64Regs:$state),
(ins Int32Regs:$addr, Int32Regs:$count),
!strconcat("mbarrier.arrive.noComplete", AddrSpace,
".b64 $state, [$addr], $count;"),
[(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs Int64Regs:$state),
(ins Int64Regs:$addr, Int32Regs:$count),
!strconcat("mbarrier.arrive.noComplete", AddrSpace,
".b64 $state, [$addr], $count;"),
[(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
Requires<[hasPTX70, hasSM80]>;
}
defm MBARRIER_ARRIVE_NOCOMPLETE :
MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete>;
defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
!strconcat("mbarrier.arrive_drop", AddrSpace,
".b64 $state, [$addr];"),
[(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
!strconcat("mbarrier.arrive_drop", AddrSpace,
".b64 $state, [$addr];"),
[(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
Requires<[hasPTX70, hasSM80]>;
}
defm MBARRIER_ARRIVE_DROP :
MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop>;
defm MBARRIER_ARRIVE_DROP_SHARED :
MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs Int64Regs:$state),
(ins Int32Regs:$addr, Int32Regs:$count),
!strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
".b64 $state, [$addr], $count;"),
[(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs Int64Regs:$state),
(ins Int64Regs:$addr, Int32Regs:$count),
!strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
".b64 $state, [$addr], $count;"),
[(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
Requires<[hasPTX70, hasSM80]>;
}
defm MBARRIER_ARRIVE_DROP_NOCOMPLETE :
MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete>;
defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared",
int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs Int1Regs:$res), (ins Int32Regs:$addr, Int64Regs:$state),
!strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
[(set Int1Regs:$res, (Intrin Int32Regs:$addr, Int64Regs:$state))]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs Int1Regs:$res), (ins Int64Regs:$addr, Int64Regs:$state),
!strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
[(set Int1Regs:$res, (Intrin Int64Regs:$addr, Int64Regs:$state))]>,
Requires<[hasPTX70, hasSM80]>;
}
defm MBARRIER_TEST_WAIT :
MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait>;
defm MBARRIER_TEST_WAIT_SHARED :
MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>;
class MBARRIER_PENDING_COUNT<Intrinsic Intrin> :
NVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
"mbarrier.pending_count.b64 $res, $state;",
[(set Int32Regs:$res, (Intrin Int64Regs:$state))]>,
Requires<[hasPTX70, hasSM80]>;
def MBARRIER_PENDING_COUNT :
MBARRIER_PENDING_COUNT<int_nvvm_mbarrier_pending_count>;
//-----------------------------------
// Math Functions
//-----------------------------------

View File

@ -0,0 +1,110 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
declare void @llvm.nvvm.cp.async.wait.group(i32)
; ALL-LABEL: asyncwaitgroup
define void @asyncwaitgroup() {
; ALL: cp.async.wait_group 8;
tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
; ALL: cp.async.wait_group 0;
tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
; ALL: cp.async.wait_group 16;
tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
ret void
}
declare void @llvm.nvvm.cp.async.wait.all()
; ALL-LABEL: asyncwaitall
define void @asyncwaitall() {
; ALL: cp.async.wait_all
tail call void @llvm.nvvm.cp.async.wait.all()
ret void
}
declare void @llvm.nvvm.cp.async.commit.group()
; ALL-LABEL: asynccommitgroup
define void @asynccommitgroup() {
; ALL: cp.async.commit_group
tail call void @llvm.nvvm.cp.async.commit.group()
ret void
}
declare void @llvm.nvvm.cp.async.mbarrier.arrive(i64* %a)
declare void @llvm.nvvm.cp.async.mbarrier.arrive.shared(i64 addrspace(3)* %a)
declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(i64* %a)
declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(i64 addrspace(3)* %a)
; CHECK-LABEL: asyncmbarrier
define void @asyncmbarrier(i64* %a) {
; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
tail call void @llvm.nvvm.cp.async.mbarrier.arrive(i64* %a)
ret void
}
; CHECK-LABEL: asyncmbarriershared
define void @asyncmbarriershared(i64 addrspace(3)* %a) {
; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(i64 addrspace(3)* %a)
ret void
}
; CHECK-LABEL: asyncmbarriernoinc
define void @asyncmbarriernoinc(i64* %a) {
; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(i64* %a)
ret void
}
; CHECK-LABEL: asyncmbarriernoincshared
define void @asyncmbarriernoincshared(i64 addrspace(3)* %a) {
; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(i64 addrspace(3)* %a)
ret void
}
declare void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
; CHECK-LABEL: asynccasharedglobal4i8
define void @asynccasharedglobal4i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
tail call void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
ret void
}
declare void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
; CHECK-LABEL: asynccasharedglobal8i8
define void @asynccasharedglobal8i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8;
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8;
tail call void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
ret void
}
declare void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
; CHECK-LABEL: asynccasharedglobal16i8
define void @asynccasharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
tail call void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
ret void
}
declare void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
; CHECK-LABEL: asynccgsharedglobal16i8
define void @asynccgsharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
tail call void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
ret void
}

View File

@ -0,0 +1,145 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64
declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
; CHECK-LABEL: barrierinit
define void @barrierinit(i64* %a, i32 %b) {
; CHECK_PTX32: mbarrier.init.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.init.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
tail call void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
ret void
}
; CHECK-LABEL: barrierinitshared
define void @barrierinitshared(i64 addrspace(3)* %a, i32 %b) {
; CHECK_PTX32: mbarrier.init.shared.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.init.shared.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
tail call void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
ret void
}
declare void @llvm.nvvm.mbarrier.inval(i64* %a)
declare void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
; CHECK-LABEL: barrierinval
define void @barrierinval(i64* %a) {
; CHECK_PTX32: mbarrier.inval.b64 [%r{{[0-1]+}}];
; CHECK_PTX64: mbarrier.inval.b64 [%rd{{[0-1]+}}];
tail call void @llvm.nvvm.mbarrier.inval(i64* %a)
ret void
}
; CHECK-LABEL: barrierinvalshared
define void @barrierinvalshared(i64 addrspace(3)* %a) {
; CHECK_PTX32: mbarrier.inval.shared.b64 [%r{{[0-1]+}}];
; CHECK_PTX64: mbarrier.inval.shared.b64 [%rd{{[0-1]+}}];
tail call void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
ret void
}
declare i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
declare i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
; CHECK-LABEL: barrierarrive
define void @barrierarrive(i64* %a) {
; CHECK_PTX32: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
; CHECK_PTX64: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
ret void
}
; CHECK-LABEL: barrierarriveshared
define void @barrierarriveshared(i64 addrspace(3)* %a) {
; CHECK_PTX32: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
; CHECK_PTX64: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
ret void
}
declare i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
declare i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
; CHECK-LABEL: barrierarrivenoComplete
define void @barrierarrivenoComplete(i64* %a, i32 %b) {
; CHECK_PTX32: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
ret void
}
; CHECK-LABEL: barrierarrivenoCompleteshared
define void @barrierarrivenoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
; CHECK_PTX32: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
ret void
}
declare i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
declare i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
; CHECK-LABEL: barrierarrivedrop
define void @barrierarrivedrop(i64* %a) {
; CHECK_PTX32: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
; CHECK_PTX64: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
ret void
}
; CHECK-LABEL: barrierarrivedropshared
define void @barrierarrivedropshared(i64 addrspace(3)* %a) {
; CHECK_PTX32: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
; CHECK_PTX64: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
ret void
}
declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
; CHECK-LABEL: barrierarrivedropnoComplete
define void @barrierarrivedropnoComplete(i64* %a, i32 %b) {
; CHECK_PTX32: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
ret void
}
; CHECK-LABEL: barrierarrivedropnoCompleteshared
define void @barrierarrivedropnoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
; CHECK_PTX32: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
; CHECK_PTX64: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
ret void
}
declare i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
declare i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
; CHECK-LABEL: barriertestwait
define void @barriertestwait(i64* %a, i64 %b) {
; CHECK_PTX32: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
; CHECK_PTX64: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
%ret = tail call i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
ret void
}
; CHECK-LABEL: barriertestwaitshared
define void @barriertestwaitshared(i64 addrspace(3)* %a, i64 %b) {
; CHECK_PTX32: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
; CHECK_PTX64: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
%ret = tail call i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
ret void
}
declare i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
; CHECK-LABEL: barrierpendingcount
define i32 @barrierpendingcount(i64* %a, i64 %b) {
; CHECK_PTX32: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
; CHECK_PTX64: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
%ret = tail call i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
ret i32 %ret
}