forked from OSchip/llvm-project
[Clang][NVPTX] Add NVPTX intrinsics and builtins for CUDA PTX cp.async instructions
Adds NVPTX builtins and intrinsics for the CUDA PTX `cp.async` instructions for `sm_80` architecture or newer. PTX ISA description of `cp.async`: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive Authored-by: Stuart Adams <stuart.adams@codeplay.com> Co-Authored-by: Alexander Johnston <alexander@codeplay.com> Differential Revision: https://reviews.llvm.org/D100394
This commit is contained in:
parent
1417ddafdb
commit
02c2468864
|
@ -462,6 +462,29 @@ BUILTIN(__nvvm_membar_cta, "v", "")
|
|||
BUILTIN(__nvvm_membar_gl, "v", "")
|
||||
BUILTIN(__nvvm_membar_sys, "v", "")
|
||||
|
||||
// mbarrier
|
||||
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_init, "vWi*i", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_init_shared, "vWi*3i", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_inval, "vWi*", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_inval_shared, "vWi*3", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_arrive, "WiWi*", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_arrive_shared, "WiWi*3", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete, "WiWi*i", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete_shared, "WiWi*3i", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop, "WiWi*", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_shared, "WiWi*3", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete, "WiWi*i", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete_shared, "WiWi*3i", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_test_wait, "bWi*Wi", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_test_wait_shared, "bWi*3Wi", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_mbarrier_pending_count, "iWi", "", AND(SM_80,PTX70))
|
||||
|
||||
// Memcpy, Memset
|
||||
|
||||
BUILTIN(__nvvm_memcpy, "vUc*Uc*zi","")
|
||||
|
@ -726,6 +749,21 @@ TARGET_BUILTIN(__imma_m8n8k32_mma_s4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
|
|||
TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
|
||||
TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
|
||||
|
||||
// Async Copy
|
||||
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive, "vWi*", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
|
||||
|
||||
TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
|
||||
TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))
|
||||
|
||||
#undef BUILTIN
|
||||
#undef TARGET_BUILTIN
|
||||
#pragma pop_macro("AND")
|
||||
|
|
|
@ -1,4 +1,10 @@
|
|||
// REQUIRES: nvptx-registered-target
|
||||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
|
||||
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
|
||||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
|
||||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
|
||||
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
|
||||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
|
||||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
|
||||
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
|
||||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
|
||||
|
@ -672,3 +678,80 @@ __device__ void nvvm_vote(int pred) {
|
|||
__nvvm_vote_ballot(pred);
|
||||
// CHECK: ret void
|
||||
}
|
||||
|
||||
// CHECK-LABEL: nvvm_mbarrier
|
||||
__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
__nvvm_mbarrier_init(addr, count);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
|
||||
__nvvm_mbarrier_init_shared(sharedAddr, count);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared
|
||||
|
||||
__nvvm_mbarrier_inval(addr);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
|
||||
__nvvm_mbarrier_inval_shared(sharedAddr);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared
|
||||
|
||||
__nvvm_mbarrier_arrive(addr);
|
||||
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
|
||||
__nvvm_mbarrier_arrive_shared(sharedAddr);
|
||||
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
|
||||
__nvvm_mbarrier_arrive_noComplete(addr, count);
|
||||
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
|
||||
__nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
|
||||
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared
|
||||
|
||||
__nvvm_mbarrier_arrive_drop(addr);
|
||||
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
|
||||
__nvvm_mbarrier_arrive_drop_shared(sharedAddr);
|
||||
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
|
||||
__nvvm_mbarrier_arrive_drop_noComplete(addr, count);
|
||||
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
|
||||
__nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
|
||||
// CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared
|
||||
|
||||
__nvvm_mbarrier_test_wait(addr, state);
|
||||
// CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
|
||||
__nvvm_mbarrier_test_wait_shared(sharedAddr, state);
|
||||
// CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared
|
||||
|
||||
__nvvm_mbarrier_pending_count(state);
|
||||
// CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
|
||||
#endif
|
||||
// CHECK: ret void
|
||||
}
|
||||
|
||||
// CHECK-LABEL: nvvm_async_copy
|
||||
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
|
||||
__nvvm_cp_async_mbarrier_arrive(addr);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
|
||||
__nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
|
||||
__nvvm_cp_async_mbarrier_arrive_noinc(addr);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
|
||||
__nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);
|
||||
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
|
||||
__nvvm_cp_async_ca_shared_global_4(dst, src);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
|
||||
__nvvm_cp_async_ca_shared_global_8(dst, src);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
|
||||
__nvvm_cp_async_ca_shared_global_16(dst, src);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
|
||||
__nvvm_cp_async_cg_shared_global_16(dst, src);
|
||||
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
|
||||
__nvvm_cp_async_commit_group();
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
|
||||
__nvvm_cp_async_wait_group(0);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
|
||||
__nvvm_cp_async_wait_group(8);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
|
||||
__nvvm_cp_async_wait_group(16);
|
||||
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
|
||||
__nvvm_cp_async_wait_all();
|
||||
#endif
|
||||
// CHECK: ret void
|
||||
}
|
|
@ -31,7 +31,11 @@
|
|||
// * llvm.nvvm.max.ull --> ibid.
|
||||
// * llvm.nvvm.h2f --> llvm.convert.to.fp16.f32
|
||||
|
||||
def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
|
||||
def llvm_global_i8ptr_ty : LLVMQualPointerType<llvm_i8_ty, 1>; // (global)i8*
|
||||
def llvm_shared_i8ptr_ty : LLVMQualPointerType<llvm_i8_ty, 3>; // (shared)i8*
|
||||
def llvm_i64ptr_ty : LLVMPointerType<llvm_i64_ty>; // i64*
|
||||
def llvm_any_i64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
|
||||
def llvm_shared_i64ptr_ty : LLVMQualPointerType<llvm_i64_ty, 3>; // (shared)i64*
|
||||
|
||||
//
|
||||
// MISC
|
||||
|
@ -1052,6 +1056,110 @@ let TargetPrefix = "nvvm" in {
|
|||
def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
|
||||
Intrinsic<[], [], []>;
|
||||
|
||||
// Async Copy
|
||||
def int_nvvm_cp_async_mbarrier_arrive :
|
||||
GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive">,
|
||||
Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
|
||||
def int_nvvm_cp_async_mbarrier_arrive_shared :
|
||||
GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_shared">,
|
||||
Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
|
||||
def int_nvvm_cp_async_mbarrier_arrive_noinc :
|
||||
GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc">,
|
||||
Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
|
||||
def int_nvvm_cp_async_mbarrier_arrive_noinc_shared :
|
||||
GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared">,
|
||||
Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
|
||||
|
||||
def int_nvvm_cp_async_ca_shared_global_4 :
|
||||
GCCBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
|
||||
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async.ca.shared.global.4">;
|
||||
def int_nvvm_cp_async_ca_shared_global_8 :
|
||||
GCCBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
|
||||
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async.ca.shared.global.8">;
|
||||
def int_nvvm_cp_async_ca_shared_global_16 :
|
||||
GCCBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
|
||||
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async.ca.shared.global.16">;
|
||||
def int_nvvm_cp_async_cg_shared_global_16 :
|
||||
GCCBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
|
||||
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
|
||||
[IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
|
||||
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
|
||||
"llvm.nvvm.cp.async.cg.shared.global.16">;
|
||||
|
||||
def int_nvvm_cp_async_commit_group :
|
||||
GCCBuiltin<"__nvvm_cp_async_commit_group">,
|
||||
Intrinsic<[],[],[]>;
|
||||
|
||||
def int_nvvm_cp_async_wait_group :
|
||||
GCCBuiltin<"__nvvm_cp_async_wait_group">,
|
||||
Intrinsic<[],[llvm_i32_ty],[ImmArg<ArgIndex<0>>]>;
|
||||
|
||||
def int_nvvm_cp_async_wait_all :
|
||||
GCCBuiltin<"__nvvm_cp_async_wait_all">,
|
||||
Intrinsic<[],[],[]>;
|
||||
|
||||
// mbarrier
|
||||
def int_nvvm_mbarrier_init : GCCBuiltin<"__nvvm_mbarrier_init">,
|
||||
Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
|
||||
def int_nvvm_mbarrier_init_shared :
|
||||
GCCBuiltin<"__nvvm_mbarrier_init_shared">,
|
||||
Intrinsic<[],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
|
||||
|
||||
def int_nvvm_mbarrier_inval : GCCBuiltin<"__nvvm_mbarrier_inval">,
|
||||
Intrinsic<[],[llvm_i64ptr_ty],
|
||||
[IntrConvergent, IntrWriteMem, IntrArgMemOnly,
|
||||
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
|
||||
def int_nvvm_mbarrier_inval_shared :
|
||||
GCCBuiltin<"__nvvm_mbarrier_inval_shared">,
|
||||
Intrinsic<[],[llvm_shared_i64ptr_ty],
|
||||
[IntrConvergent, IntrWriteMem, IntrArgMemOnly,
|
||||
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
|
||||
|
||||
def int_nvvm_mbarrier_arrive : GCCBuiltin<"__nvvm_mbarrier_arrive">,
|
||||
Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
|
||||
def int_nvvm_mbarrier_arrive_shared :
|
||||
GCCBuiltin<"__nvvm_mbarrier_arrive_shared">,
|
||||
Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
|
||||
def int_nvvm_mbarrier_arrive_noComplete :
|
||||
GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete">,
|
||||
Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
|
||||
def int_nvvm_mbarrier_arrive_noComplete_shared :
|
||||
GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete_shared">,
|
||||
Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
|
||||
|
||||
def int_nvvm_mbarrier_arrive_drop :
|
||||
GCCBuiltin<"__nvvm_mbarrier_arrive_drop">,
|
||||
Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
|
||||
def int_nvvm_mbarrier_arrive_drop_shared :
|
||||
GCCBuiltin<"__nvvm_mbarrier_arrive_drop_shared">,
|
||||
Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
|
||||
def int_nvvm_mbarrier_arrive_drop_noComplete :
|
||||
GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete">,
|
||||
Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
|
||||
def int_nvvm_mbarrier_arrive_drop_noComplete_shared :
|
||||
GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete_shared">,
|
||||
Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
|
||||
|
||||
def int_nvvm_mbarrier_test_wait :
|
||||
GCCBuiltin<"__nvvm_mbarrier_test_wait">,
|
||||
Intrinsic<[llvm_i1_ty],[llvm_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
|
||||
def int_nvvm_mbarrier_test_wait_shared :
|
||||
GCCBuiltin<"__nvvm_mbarrier_test_wait_shared">,
|
||||
Intrinsic<[llvm_i1_ty],[llvm_shared_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
|
||||
|
||||
def int_nvvm_mbarrier_pending_count :
|
||||
GCCBuiltin<"__nvvm_mbarrier_pending_count">,
|
||||
Intrinsic<[llvm_i32_ty],[llvm_i64_ty],[IntrNoMem, IntrConvergent]>;
|
||||
|
||||
// Generated within nvvm. Use for ldu on sm_20 or later. Second arg is the
|
||||
// pointer's alignment.
|
||||
def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
|
||||
|
@ -1141,7 +1249,7 @@ def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
|
|||
|
||||
// For getting the handle from a texture or surface variable
|
||||
def int_nvvm_texsurf_handle
|
||||
: Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyi64ptr_ty],
|
||||
: Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_any_i64ptr_ty],
|
||||
[IntrNoMem], "llvm.nvvm.texsurf.handle">;
|
||||
def int_nvvm_texsurf_handle_internal
|
||||
: Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty],
|
||||
|
|
|
@ -144,11 +144,13 @@ def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
|
|||
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
|
||||
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
|
||||
def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
|
||||
def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
|
||||
|
||||
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
|
||||
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
|
||||
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
|
||||
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
|
||||
def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
|
||||
|
||||
// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
|
||||
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
|
||||
|
|
|
@ -288,6 +288,211 @@ def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
|
|||
def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
|
||||
|
||||
|
||||
//-----------------------------------
|
||||
// Async Copy Functions
|
||||
//-----------------------------------
|
||||
|
||||
multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
|
||||
!strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
|
||||
[(Intrin Int32Regs:$addr)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
|
||||
!strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
|
||||
[(Intrin Int64Regs:$addr)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm CP_ASYNC_MBARRIER_ARRIVE :
|
||||
CP_ASYNC_MBARRIER_ARRIVE<"", "", int_nvvm_cp_async_mbarrier_arrive>;
|
||||
defm CP_ASYNC_MBARRIER_ARRIVE_SHARED :
|
||||
CP_ASYNC_MBARRIER_ARRIVE<"", ".shared", int_nvvm_cp_async_mbarrier_arrive_shared>;
|
||||
defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
|
||||
CP_ASYNC_MBARRIER_ARRIVE<".noinc", "", int_nvvm_cp_async_mbarrier_arrive_noinc>;
|
||||
defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
|
||||
CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
|
||||
|
||||
multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
|
||||
!strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
[(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
|
||||
!strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
[(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
|
||||
CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
|
||||
|
||||
defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
|
||||
CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
|
||||
|
||||
defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
|
||||
CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
|
||||
|
||||
multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
|
||||
!strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
[(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
|
||||
!strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
|
||||
[(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
|
||||
CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
|
||||
|
||||
def CP_ASYNC_COMMIT_GROUP :
|
||||
NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
|
||||
def CP_ASYNC_WAIT_GROUP :
|
||||
NVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group $n;",
|
||||
[(int_nvvm_cp_async_wait_group (i32 timm:$n))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
|
||||
def CP_ASYNC_WAIT_ALL :
|
||||
NVPTXInst<(outs), (ins), "cp.async.wait_all;",
|
||||
[(int_nvvm_cp_async_wait_all)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
|
||||
//-----------------------------------
|
||||
// MBarrier Functions
|
||||
//-----------------------------------
|
||||
|
||||
multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr, Int32Regs:$count),
|
||||
!strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
|
||||
[(Intrin Int32Regs:$addr, Int32Regs:$count)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr, Int32Regs:$count),
|
||||
!strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
|
||||
[(Intrin Int64Regs:$addr, Int32Regs:$count)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init>;
|
||||
defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
|
||||
int_nvvm_mbarrier_init_shared>;
|
||||
|
||||
multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
|
||||
!strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
|
||||
[(Intrin Int32Regs:$addr)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
|
||||
!strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
|
||||
[(Intrin Int64Regs:$addr)]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval>;
|
||||
defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
|
||||
int_nvvm_mbarrier_inval_shared>;
|
||||
|
||||
multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
|
||||
!strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
|
||||
[(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
|
||||
!strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
|
||||
[(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive>;
|
||||
defm MBARRIER_ARRIVE_SHARED :
|
||||
MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
|
||||
|
||||
multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs Int64Regs:$state),
|
||||
(ins Int32Regs:$addr, Int32Regs:$count),
|
||||
!strconcat("mbarrier.arrive.noComplete", AddrSpace,
|
||||
".b64 $state, [$addr], $count;"),
|
||||
[(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs Int64Regs:$state),
|
||||
(ins Int64Regs:$addr, Int32Regs:$count),
|
||||
!strconcat("mbarrier.arrive.noComplete", AddrSpace,
|
||||
".b64 $state, [$addr], $count;"),
|
||||
[(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm MBARRIER_ARRIVE_NOCOMPLETE :
|
||||
MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete>;
|
||||
defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
|
||||
MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
|
||||
|
||||
multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
|
||||
!strconcat("mbarrier.arrive_drop", AddrSpace,
|
||||
".b64 $state, [$addr];"),
|
||||
[(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
|
||||
!strconcat("mbarrier.arrive_drop", AddrSpace,
|
||||
".b64 $state, [$addr];"),
|
||||
[(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm MBARRIER_ARRIVE_DROP :
|
||||
MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop>;
|
||||
defm MBARRIER_ARRIVE_DROP_SHARED :
|
||||
MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
|
||||
|
||||
multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs Int64Regs:$state),
|
||||
(ins Int32Regs:$addr, Int32Regs:$count),
|
||||
!strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
|
||||
".b64 $state, [$addr], $count;"),
|
||||
[(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs Int64Regs:$state),
|
||||
(ins Int64Regs:$addr, Int32Regs:$count),
|
||||
!strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
|
||||
".b64 $state, [$addr], $count;"),
|
||||
[(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm MBARRIER_ARRIVE_DROP_NOCOMPLETE :
|
||||
MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete>;
|
||||
defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
|
||||
MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared",
|
||||
int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
|
||||
|
||||
multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
|
||||
def _32 : NVPTXInst<(outs Int1Regs:$res), (ins Int32Regs:$addr, Int64Regs:$state),
|
||||
!strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
|
||||
[(set Int1Regs:$res, (Intrin Int32Regs:$addr, Int64Regs:$state))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
def _64 : NVPTXInst<(outs Int1Regs:$res), (ins Int64Regs:$addr, Int64Regs:$state),
|
||||
!strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
|
||||
[(set Int1Regs:$res, (Intrin Int64Regs:$addr, Int64Regs:$state))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
}
|
||||
|
||||
defm MBARRIER_TEST_WAIT :
|
||||
MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait>;
|
||||
defm MBARRIER_TEST_WAIT_SHARED :
|
||||
MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>;
|
||||
|
||||
class MBARRIER_PENDING_COUNT<Intrinsic Intrin> :
|
||||
NVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
|
||||
"mbarrier.pending_count.b64 $res, $state;",
|
||||
[(set Int32Regs:$res, (Intrin Int64Regs:$state))]>,
|
||||
Requires<[hasPTX70, hasSM80]>;
|
||||
|
||||
def MBARRIER_PENDING_COUNT :
|
||||
MBARRIER_PENDING_COUNT<int_nvvm_mbarrier_pending_count>;
|
||||
|
||||
//-----------------------------------
|
||||
// Math Functions
|
||||
//-----------------------------------
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
|
||||
|
||||
declare void @llvm.nvvm.cp.async.wait.group(i32)
|
||||
|
||||
; ALL-LABEL: asyncwaitgroup
|
||||
define void @asyncwaitgroup() {
|
||||
; ALL: cp.async.wait_group 8;
|
||||
tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
|
||||
; ALL: cp.async.wait_group 0;
|
||||
tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
|
||||
; ALL: cp.async.wait_group 16;
|
||||
tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.wait.all()
|
||||
|
||||
; ALL-LABEL: asyncwaitall
|
||||
define void @asyncwaitall() {
|
||||
; ALL: cp.async.wait_all
|
||||
tail call void @llvm.nvvm.cp.async.wait.all()
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.commit.group()
|
||||
|
||||
; ALL-LABEL: asynccommitgroup
|
||||
define void @asynccommitgroup() {
|
||||
; ALL: cp.async.commit_group
|
||||
tail call void @llvm.nvvm.cp.async.commit.group()
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.mbarrier.arrive(i64* %a)
|
||||
declare void @llvm.nvvm.cp.async.mbarrier.arrive.shared(i64 addrspace(3)* %a)
|
||||
declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(i64* %a)
|
||||
declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(i64 addrspace(3)* %a)
|
||||
|
||||
; CHECK-LABEL: asyncmbarrier
|
||||
define void @asyncmbarrier(i64* %a) {
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
|
||||
tail call void @llvm.nvvm.cp.async.mbarrier.arrive(i64* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: asyncmbarriershared
|
||||
define void @asyncmbarriershared(i64 addrspace(3)* %a) {
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
|
||||
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(i64 addrspace(3)* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: asyncmbarriernoinc
|
||||
define void @asyncmbarriernoinc(i64* %a) {
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
|
||||
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(i64* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: asyncmbarriernoincshared
|
||||
define void @asyncmbarriernoincshared(i64 addrspace(3)* %a) {
|
||||
; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
|
||||
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(i64 addrspace(3)* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
|
||||
|
||||
; CHECK-LABEL: asynccasharedglobal4i8
|
||||
define void @asynccasharedglobal4i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
|
||||
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
|
||||
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
|
||||
|
||||
; CHECK-LABEL: asynccasharedglobal8i8
|
||||
define void @asynccasharedglobal8i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
|
||||
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8;
|
||||
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8;
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
|
||||
|
||||
; CHECK-LABEL: asynccasharedglobal16i8
|
||||
define void @asynccasharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
|
||||
; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
|
||||
; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
|
||||
tail call void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
|
||||
|
||||
; CHECK-LABEL: asynccgsharedglobal16i8
|
||||
define void @asynccgsharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
|
||||
; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
|
||||
; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
|
||||
tail call void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,145 @@
|
|||
; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64
|
||||
|
||||
declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
|
||||
declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
|
||||
|
||||
; CHECK-LABEL: barrierinit
|
||||
define void @barrierinit(i64* %a, i32 %b) {
|
||||
; CHECK_PTX32: mbarrier.init.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.init.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
|
||||
tail call void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: barrierinitshared
|
||||
define void @barrierinitshared(i64 addrspace(3)* %a, i32 %b) {
|
||||
; CHECK_PTX32: mbarrier.init.shared.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.init.shared.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
|
||||
tail call void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.nvvm.mbarrier.inval(i64* %a)
|
||||
declare void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
|
||||
|
||||
; CHECK-LABEL: barrierinval
|
||||
define void @barrierinval(i64* %a) {
|
||||
; CHECK_PTX32: mbarrier.inval.b64 [%r{{[0-1]+}}];
|
||||
; CHECK_PTX64: mbarrier.inval.b64 [%rd{{[0-1]+}}];
|
||||
tail call void @llvm.nvvm.mbarrier.inval(i64* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: barrierinvalshared
|
||||
define void @barrierinvalshared(i64 addrspace(3)* %a) {
|
||||
; CHECK_PTX32: mbarrier.inval.shared.b64 [%r{{[0-1]+}}];
|
||||
; CHECK_PTX64: mbarrier.inval.shared.b64 [%rd{{[0-1]+}}];
|
||||
tail call void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
|
||||
declare i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
|
||||
|
||||
; CHECK-LABEL: barrierarrive
|
||||
define void @barrierarrive(i64* %a) {
|
||||
; CHECK_PTX32: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
|
||||
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: barrierarriveshared
|
||||
define void @barrierarriveshared(i64 addrspace(3)* %a) {
|
||||
; CHECK_PTX32: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
|
||||
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
|
||||
declare i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
|
||||
|
||||
; CHECK-LABEL: barrierarrivenoComplete
|
||||
define void @barrierarrivenoComplete(i64* %a, i32 %b) {
|
||||
; CHECK_PTX32: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
|
||||
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: barrierarrivenoCompleteshared
|
||||
define void @barrierarrivenoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
|
||||
; CHECK_PTX32: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
|
||||
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
|
||||
declare i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
|
||||
|
||||
; CHECK-LABEL: barrierarrivedrop
|
||||
define void @barrierarrivedrop(i64* %a) {
|
||||
; CHECK_PTX32: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
|
||||
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: barrierarrivedropshared
|
||||
define void @barrierarrivedropshared(i64 addrspace(3)* %a) {
|
||||
; CHECK_PTX32: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
|
||||
; CHECK_PTX64: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
|
||||
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
|
||||
declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
|
||||
|
||||
; CHECK-LABEL: barrierarrivedropnoComplete
|
||||
define void @barrierarrivedropnoComplete(i64* %a, i32 %b) {
|
||||
; CHECK_PTX32: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
|
||||
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: barrierarrivedropnoCompleteshared
|
||||
define void @barrierarrivedropnoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
|
||||
; CHECK_PTX32: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
|
||||
%ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
|
||||
declare i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
|
||||
|
||||
; CHECK-LABEL: barriertestwait
|
||||
define void @barriertestwait(i64* %a, i64 %b) {
|
||||
; CHECK_PTX32: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
|
||||
%ret = tail call i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: barriertestwaitshared
|
||||
define void @barriertestwaitshared(i64 addrspace(3)* %a, i64 %b) {
|
||||
; CHECK_PTX32: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
|
||||
%ret = tail call i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
|
||||
|
||||
; CHECK-LABEL: barrierpendingcount
|
||||
define i32 @barrierpendingcount(i64* %a, i64 %b) {
|
||||
; CHECK_PTX32: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
|
||||
; CHECK_PTX64: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
|
||||
%ret = tail call i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
|
||||
ret i32 %ret
|
||||
}
|
Loading…
Reference in New Issue