[AMDGPU] Set implicit arg attributes for indirect calls

This patch adds attributes corresponding to
implicit arguments to functions/kernels if
1. it has an indirect call, OR
2. its address is taken.

Once such attributes are set, the rest of the codegen works
out-of-the-box for indirect calls. This patch eliminates
the potential overhead that -fixed-abi imposes even when indirect
function calls are not used.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D99347
This commit is contained in:
madhur13490 2021-03-25 07:25:51 +00:00
parent 0f42675c86
commit 5682ae2fc6
37 changed files with 1303 additions and 1516 deletions

View File

@ -25,6 +25,13 @@
using namespace llvm;
namespace {
static constexpr StringLiteral ImplicitAttrNames[] = {
// X ids unnecessarily propagated to kernels.
"amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
"amdgpu-implicitarg-ptr"};
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
@ -194,18 +201,10 @@ static bool handleAttr(Function &Parent, const Function &Callee,
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
// X ids unnecessarily propagated to kernels.
static constexpr StringLiteral AttrNames[] = {
"amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
"amdgpu-implicitarg-ptr"};
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
for (StringRef AttrName : AttrNames)
for (StringRef AttrName : ImplicitAttrNames)
handleAttr(Parent, Callee, AttrName);
}
@ -268,7 +267,20 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
bool HasIndirectCall = false;
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
CallingConv::ID CC = F.getCallingConv();
bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
// If hasAddressTaken() returns true for this function,
// then add all attributes corresponding to the implicit args.
if (CallingConvSupportsAllImplicits &&
F.hasAddressTaken(nullptr, true, true, true)) {
for (StringRef AttrName : ImplicitAttrNames) {
F.addFnAttr(AttrName);
}
Changed = true;
}
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
@ -281,10 +293,12 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const Function *Callee =
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
// TODO: Do something with indirect calls.
// Note the occurrence of an indirect call.
if (!Callee) {
if (!CB->isInlineAsm())
if (!CB->isInlineAsm()) {
HasIndirectCall = true;
HaveCall = true;
}
continue;
}
@ -351,6 +365,28 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
}
// This pass cannot copy attributes from callees to callers
// if there is an indirect call. Thus, in such cases,
// hasAddressTaken() would be false for kernels and functions
// making an indirect call (if they are themselves not indirectly called).
// We must tag all such kernels/functions with all implicit attributes
// for correctness.
// e.g.
// 1. Kernel K1 makes an indirect call to function F1.
// Without detecting an indirect call in K1, this pass will not
// add all implicit args to K1 (which is incorrect).
// 2. Kernel K1 makes direct call to F1 which makes indirect call to function
// F2.
// Without detecting an indirect call in F1 (whose hasAddressTaken() is
// false), the pass will not add all implicit args to F1 (which is
// essential for correctness).
if (CallingConvSupportsAllImplicits && HasIndirectCall) {
for (StringRef AttrName : ImplicitAttrNames) {
F.addFnAttr(AttrName);
}
Changed = true;
}
return Changed;
}

View File

@ -388,10 +388,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
}
// Set -fixed-function-abi to true if not provided..
if (TT.getOS() == Triple::AMDHSA &&
EnableAMDGPUFixedFunctionABIOpt.getNumOccurrences() == 0)
EnableFixedFunctionABI = true;
}
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;

View File

@ -53,9 +53,9 @@ define i32 @asm_vgpr_early_clobber() {
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %2, !0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY2]]
; CHECK: $vgpr0 = COPY [[ADD]](s32)
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
@ -87,8 +87,8 @@ define i32 @test_single_vgpr_output() nounwind {
; CHECK: bb.1.entry:
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: $vgpr0 = COPY [[COPY1]](s32)
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
@ -102,8 +102,8 @@ define i32 @test_single_sgpr_output_s32() nounwind {
; CHECK: bb.1.entry:
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: $vgpr0 = COPY [[COPY1]](s32)
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
@ -118,9 +118,9 @@ define float @test_multiple_register_outputs_same() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
; CHECK: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY1]], [[COPY2]]
; CHECK: $vgpr0 = COPY [[FADD]](s32)
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
@ -138,9 +138,9 @@ define double @test_multiple_register_outputs_mixed() #0 {
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 2883594 /* regdef:VReg_64 */, def %9
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %9
; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %2
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
; CHECK: $vgpr0 = COPY [[UV]](s32)
; CHECK: $vgpr1 = COPY [[UV1]](s32)
@ -209,8 +209,8 @@ define float @test_input_vgpr(i32 %src) nounwind {
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY2]]
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %9
; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 1835017 /* reguse:VGPR_32 */, [[COPY2]]
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %2
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
@ -225,8 +225,8 @@ define i32 @test_memory_constraint(i32 addrspace(3)* %a) nounwind {
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 196622 /* mem:m */, [[COPY]](p3)
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 196622 /* mem:m */, [[COPY]](p3)
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
; CHECK: $vgpr0 = COPY [[COPY2]](s32)
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
@ -243,8 +243,8 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32)
; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %11
; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %4
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
@ -258,14 +258,14 @@ define i32 @test_sgpr_matching_constraint() nounwind {
; CHECK: bb.1.entry:
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %10
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %10
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %3
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %3
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32)
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]](s32)
; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %12, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3)
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %12
; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %5, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3)
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %5
; CHECK: $vgpr0 = COPY [[COPY5]](s32)
; CHECK: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY6]], implicit $vgpr0
@ -288,10 +288,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32)
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5)
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %11
; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %12
; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %13
; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 1835018 /* regdef:VGPR_32 */, def %5, 1835018 /* regdef:VGPR_32 */, def %6, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5)
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %4
; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %5
; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %6
; CHECK: G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
; CHECK: G_STORE [[COPY9]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
@ -312,11 +312,11 @@ define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind {
; CHECK: bb.1.entry:
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %10
; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %3
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0

View File

@ -14,7 +14,7 @@ define void @func_use_lds_global() {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: ds_write_b32 v0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@ -37,10 +37,10 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-LABEL: func_use_lds_global_constexpr_cast:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: flat_store_dword v[0:1], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: func_use_lds_global_constexpr_cast:

View File

@ -101,7 +101,7 @@ bb2:
; ALL-LABEL: {{^}}test_workitem_id_x_func:
; ALL: s_waitcnt
; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v31
; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v2
; MESA-NEXT: v_and_b32_e32 v2, 0x3ff, v2
define void @test_workitem_id_x_func(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
@ -110,7 +110,7 @@ define void @test_workitem_id_x_func(i32 addrspace(1)* %out) #1 {
}
; ALL-LABEL: {{^}}test_workitem_id_y_func:
; HSA: v_lshrrev_b32_e32 v2, 10, v31
; HSA: v_lshrrev_b32_e32 v2, 10, v2
; MESA: v_lshrrev_b32_e32 v2, 10, v2
define void @test_workitem_id_y_func(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.y()
@ -119,7 +119,7 @@ define void @test_workitem_id_y_func(i32 addrspace(1)* %out) #1 {
}
; ALL-LABEL: {{^}}test_workitem_id_z_func:
; HSA: v_lshrrev_b32_e32 v2, 20, v31
; HSA: v_lshrrev_b32_e32 v2, 20, v2
; MESA: v_lshrrev_b32_e32 v2, 20, v2
define void @test_workitem_id_z_func(i32 addrspace(1)* %out) #1 {
%id = call i32 @llvm.amdgcn.workitem.id.z()

View File

@ -172,7 +172,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: global_store_dword v[0:1], v2, off
@ -227,14 +227,14 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; GCN-NEXT: s_add_u32 s6, s32, 0x1000
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v4, s6
; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v5, s6
; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v2, 1
; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v3
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: global_store_dword v[0:1], v2, off

View File

@ -42,7 +42,7 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
; Test handling inside a non-kernel
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc

View File

@ -122,18 +122,18 @@ bb:
; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
; GFX908: .amdhsa_next_free_vgpr 32
; GFX90A: .amdhsa_accum_offset 32
; GCN: NumVgprs: 32
; GFX90A: .amdhsa_accum_offset 12
; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
; GFX90A: TotalNumVgprs: 64
; GFX90A: TotalNumVgprs: 44
; GFX908: VGPRBlocks: 7
; GFX90A: VGPRBlocks: 7
; GFX90A: VGPRBlocks: 5
; GFX908: NumVGPRsForWavesPerEU: 32
; GFX90A: NumVGPRsForWavesPerEU: 64
; GFX90A: AccumOffset: 32
; GFX90A: NumVGPRsForWavesPerEU: 44
; GFX90A: AccumOffset: 12
; GCN: Occupancy: 8
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
bb:
call void @func_32_agprs() #0
@ -141,10 +141,10 @@ bb:
}
; GCN-LABEL: {{^}}func_call_func_32_agprs:
; GCN: NumVgprs: 32
; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
; GFX90A: TotalNumVgprs: 64
; GFX90A: TotalNumVgprs: 44
define void @func_call_func_32_agprs() #0 {
bb:
call void @func_32_agprs() #0
@ -154,21 +154,21 @@ bb:
declare void @undef_func()
; GCN-LABEL: {{^}}kernel_call_undef_func:
; GFX908: .amdhsa_next_free_vgpr 32
; GFX90A: .amdhsa_next_free_vgpr 56
; GFX90A: .amdhsa_accum_offset 32
; GCN: NumVgprs: 32
; GFX908: .amdhsa_next_free_vgpr 24
; GFX90A: .amdhsa_next_free_vgpr 48
; GFX90A: .amdhsa_accum_offset 24
; GCN: NumVgprs: 24
; GCN: NumAgprs: 24
; GFX908: TotalNumVgprs: 32
; GFX90A: TotalNumVgprs: 56
; GFX908: VGPRBlocks: 7
; GFX90A: VGPRBlocks: 6
; GFX908: NumVGPRsForWavesPerEU: 32
; GFX90A: NumVGPRsForWavesPerEU: 56
; GFX90A: AccumOffset: 32
; GFX908: Occupancy: 8
; GFX908: TotalNumVgprs: 24
; GFX90A: TotalNumVgprs: 48
; GFX908: VGPRBlocks: 5
; GFX90A: VGPRBlocks: 5
; GFX908: NumVGPRsForWavesPerEU: 24
; GFX90A: NumVGPRsForWavesPerEU: 48
; GFX90A: AccumOffset: 24
; GFX908: Occupancy: 10
; GFX90A: Occupancy: 8
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()

View File

@ -334,4 +334,4 @@ attributes #3 = { nounwind }
; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" }
; HSA: attributes #18 = { nounwind }
; HSA: attributes #19 = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" }
; HSA: attributes #20 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" }
; HSA: attributes #20 = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" }

View File

@ -80,15 +80,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
; HSA: buffer_load_ubyte [[VAR:v[0-9]+]]
; HSA-DAG: s_mov_b32 s32, 0
; HSA: s_mov_b32 s32, 0
; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]]
; MESA-DAG: s_mov_b32 s32, 0{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12
; MESA-DAG: v_bfe_i32 v0, v0, 0, 1
; HSA: v_bfe_i32 v0, v3, 0, 1
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
@ -100,24 +99,18 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; FIXME: load should be scheduled before getpc
; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
; HSA: buffer_load_ubyte v3
; HSA: buffer_load_ubyte v0
; HSA-DAG: s_mov_b32 s32, 0{{$}}
; MESA: buffer_load_ubyte v0
; MESA-DAG: s_mov_b32 s32, 0{{$}}
; MESA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; MESA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
; MESA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
; MESA-NEXT: v_and_b32_e32 v0, 1, v0
; MESA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; MESA-NEXT: s_endpgm
; HSA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; HSA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
; HSA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
; HSA-NEXT: v_and_b32_e32 v0, 1, v3
; HSA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; HSA-NEXT: s_endpgm
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
%var = load volatile i1, i1 addrspace(1)* undef
call void @external_void_func_i1_zeroext(i1 %var)
@ -143,8 +136,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; FIXME: don't wait before call
; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
; MESA-DAG: buffer_load_sbyte v0
; HSA-DAG: buffer_load_sbyte v3
; GCN-DAG: buffer_load_sbyte v0
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12
@ -152,7 +144,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; GCN-DAG: s_mov_b32 s32, 0
; GCN-NOT: s_waitcnt
; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
%var = load volatile i8, i8 addrspace(1)* undef
@ -162,8 +154,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
; MESA-DAG: buffer_load_ubyte v0
; HSA-DAG: buffer_load_ubyte v3
; GCN-DAG: buffer_load_ubyte v0
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12
@ -171,7 +162,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; GCN-DAG: s_mov_b32 s32, 0
; GCN-NOT: s_waitcnt
; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
%var = load volatile i8, i8 addrspace(1)* undef
@ -192,8 +183,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
; MESA-DAG: buffer_load_sshort v0
; HSA-DAG: buffer_load_sshort v3
; GCN-DAG: buffer_load_sshort v0
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12
@ -201,7 +191,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; GCN-DAG: s_mov_b32 s32, 0
; GCN-NOT: s_waitcnt
; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
%var = load volatile i16, i16 addrspace(1)* undef
@ -218,7 +208,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; GCN-DAG: s_mov_b32 s32, 0
; GCN-NOT: s_waitcnt
; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
%var = load volatile i16, i16 addrspace(1)* undef
@ -491,7 +481,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}}
; GCN-NOT: v3,
; GCN-NOT: v3
; GCN-DAG: v_mov_b32_e32 v0, 3
; GCN-DAG: v_mov_b32_e32 v1, 4
; GCN-DAG: v_mov_b32_e32 v2, 5
@ -596,7 +586,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; MESA-NOT: s_waitcnt
; GCN-NOT: s_waitcnt
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
%ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
@ -621,8 +611,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt
; MESA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
; HSA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4
; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
@ -645,11 +634,9 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)*
}
; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
; MESA: buffer_load_ubyte v0, off
; MESA-DAG: buffer_load_dword v1, off
; HSA: buffer_load_ubyte v3, off
; HSA-DAG: buffer_load_dword v4, off
; MESA-NOT: s_waitcnt
; GCN: buffer_load_ubyte v0, off
; GCN: buffer_load_dword v1, off
; GCN-NOT: s_waitcnt
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
@ -751,19 +738,15 @@ entry:
}
; GCN-LABEL: {{^}}tail_call_byval_align16:
; GCN-NOT: s32,
; MESA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8
; MESA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12
; HSA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32
; HSA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:24
; GCN-NOT: s32
; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8
; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12
; GCN: s_getpc_b64
; MESA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4
; MESA: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}}
; HSA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:16
; HSA: buffer_store_dword [[VREG1]], off, s[0:3], s32
; GCN-NOT: s32,
; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4
; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}}
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
entry:
@ -774,16 +757,11 @@ entry:
; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
; GCN-NOT: s32
; MESA: buffer_load_dword v32, off, s[0:3], s32 offset:4
; MESA: buffer_load_dword v33, off, s[0:3], s32{{$}}
; MESA: s_getpc_b64
; MESA: buffer_store_dword v33, off, s[0:3], s32{{$}}
; MESA: buffer_store_dword v32, off, s[0:3], s32 offset:4
; HSA: buffer_load_dword v32, off, s[0:3], s32 offset:8
; HSA: buffer_load_dword v33, off, s[0:3], s32 offset:4
; HSA: s_getpc_b64
; HSA: buffer_store_dword v33, off, s[0:3], s32 offset:4
; HSA: buffer_store_dword v32, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}}
; GCN: s_getpc_b64
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
@ -793,27 +771,16 @@ entry:
}
; GCN-LABEL: {{^}}stack_12xv3i32:
; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; MESA: buffer_store_dword [[REG12]], {{.*$}}
; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4
; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8
; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12
; MESA: v_mov_b32_e32 v31, 11
; MESA: s_getpc
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 11
; HSA: buffer_store_dword [[REG12]], {{.*$}}
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4
; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8
; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12
; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16
; HSA: s_getpc
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN: buffer_store_dword [[REG12]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
; GCN: v_mov_b32_e32 v31, 11
; GCN: s_getpc
define void @stack_12xv3i32() #0 {
entry:
call void @external_void_func_12xv3i32(
@ -833,25 +800,16 @@ entry:
}
; GCN-LABEL: {{^}}stack_12xv3f32:
; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; MESA: buffer_store_dword [[REG12]], {{.*$}}
; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4
; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8
; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12
; MESA: v_mov_b32_e32 v31, 0x41300000
; MESA: s_getpc
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4
; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8
; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12
; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16
; HSA: s_getpc
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; GCN: buffer_store_dword [[REG12]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
; GCN: v_mov_b32_e32 v31, 0x41300000
; GCN: s_getpc
define void @stack_12xv3f32() #0 {
entry:
call void @external_void_func_12xv3f32(
@ -872,41 +830,24 @@ entry:
; GCN-LABEL: {{^}}stack_8xv5i32:
; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; MESA: buffer_store_dword [[REG8]], {{.*$}}
; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4
; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8
; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12
; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16
; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20
; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24
; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28
; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4
; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8
; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12
; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20
; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24
; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28
; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; GCN: buffer_store_dword [[REG8]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
; MESA: v_mov_b32_e32 v31, 7
; GCN: v_mov_b32_e32 v31, 7
; GCN: s_getpc
define void @stack_8xv5i32() #0 {
entry:
@ -923,42 +864,24 @@ entry:
}
; GCN-LABEL: {{^}}stack_8xv5f32:
; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
; MESA: buffer_store_dword [[REG8]], {{.*$}}
; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4
; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8
; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12
; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16
; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20
; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24
; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28
; MESA: v_mov_b32_e32 v31, 0x40e00000
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
; GCN: buffer_store_dword [[REG8]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x40e00000
; HSA: buffer_store_dword [[REG8]], {{.*$}}
; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4
; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8
; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12
; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20
; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24
; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28
; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32
; GCN: v_mov_b32_e32 v31, 0x40e00000
; GCN: s_getpc
define void @stack_8xv5f32() #0 {
entry:

View File

@ -4,8 +4,8 @@
; FIXME: Emitting unnecessary flat_scratch setup
; GCN-LABEL: {{^}}test_call_undef:
; SDAG: s_mov_b32 flat_scratch_lo, s5
; SDAG: s_add_u32 s4, s4, s7
; SDAG: s_mov_b32 flat_scratch_lo, s11
; SDAG: s_add_u32 s10, s10, s15
; SDAG: s_lshr_b32
; GCN: s_endpgm
define amdgpu_kernel void @test_call_undef() #0 {
@ -26,8 +26,8 @@ define i32 @test_tail_call_undef() #0 {
}
; GCN-LABEL: {{^}}test_call_null:
; SDAG: s_mov_b32 flat_scratch_lo, s5
; SDAG: s_add_u32 s4, s4, s7
; SDAG: s_mov_b32 flat_scratch_lo, s11
; SDAG: s_add_u32 s10, s10, s15
; SDAG: s_lshr_b32
; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

View File

@ -65,7 +65,7 @@ define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v31
; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: s_setpc_b64
define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
@ -78,6 +78,7 @@ define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
; GCN: s_getpc_b64
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12
; GCN: v_or_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v0, 9
; GCN: s_swappc_b64
; GCN: v_add_f32_e32

View File

@ -230,7 +230,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 32
; GCN: NumVgprs: 24
define amdgpu_kernel void @count_use_sgpr96_external_call() {
entry:
tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@ -244,7 +244,7 @@ entry:
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 32
; GCN: NumVgprs: 24
define amdgpu_kernel void @count_use_sgpr160_external_call() {
entry:
tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@ -258,7 +258,7 @@ entry:
; CI: NumSgprs: 48
; VI-NOBUG: NumSgprs: 48
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 32
; GCN: NumVgprs: 24
define amdgpu_kernel void @count_use_vgpr160_external_call() {
entry:
tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1

View File

@ -6,15 +6,15 @@
declare hidden void @external_void_func_void() #0
; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; GCN: s_getpc_b64 s[44:45]
; GCN-NEXT: s_add_u32 s44, s44,
; GCN-NEXT: s_addc_u32 s45, s45,
; GCN: s_getpc_b64 s[34:35]
; GCN-NEXT: s_add_u32 s34, s34,
; GCN-NEXT: s_addc_u32 s35, s35,
; GCN-NEXT: s_mov_b32 s32, 0
; GCN: s_swappc_b64 s[30:31], s[44:45]
; GCN: s_swappc_b64 s[30:31], s[34:35]
; GCN-DAG: #ASMSTART
; GCN-DAG: #ASMEND
; GCN-DAG: s_swappc_b64 s[30:31], s[44:45]
; GCN-NEXT: #ASMSTART
; GCN-NEXT: #ASMEND
; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35]
define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
call void @external_void_func_void()
call void asm sideeffect "", ""() #0
@ -25,60 +25,24 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; MUBUF: buffer_store_dword
; FLATSCR: scratch_store_dword
; GCN: v_writelane_b32 v41, s33, 15
; GCN-NEXT: v_writelane_b32 v41, s34, 0
; GCN-NEXT: v_writelane_b32 v41, s35, 1
; GCN-NEXT: v_writelane_b32 v41, s36, 2
; GCN-NEXT: v_writelane_b32 v41, s37, 3
; GCN-NEXT: v_writelane_b32 v41, s38, 4
; GCN-NEXT: v_writelane_b32 v41, s39, 5
; GCN-NEXT: v_writelane_b32 v41, s40, 6
; GCN-NEXT: v_writelane_b32 v41, s41, 7
; GCN-NEXT: v_writelane_b32 v41, s42, 8
; GCN-NEXT: v_writelane_b32 v41, s43, 9
; GCN-NEXT: v_writelane_b32 v41, s44, 10
; GCN-NEXT: v_writelane_b32 v41, s46, 11
; GCN-NEXT: v_writelane_b32 v41, s47, 12
; GCN-NEXT: v_writelane_b32 v41, s30, 13
; GCN: v_writelane_b32 v40, s33, 4
; GCN: v_writelane_b32 v40, s34, 0
; GCN: v_writelane_b32 v40, s35, 1
; GCN: v_writelane_b32 v40, s30, 2
; GCN: v_writelane_b32 v40, s31, 3
; GCN: s_swappc_b64
; GCN-DAG: ;;#ASMSTART
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; MUBUF-DAG: v_readlane_b32 s4, v40, 2
; MUBUF-DAG: v_readlane_b32 s5, v40, 3
; FLATSCR-DAG: v_readlane_b32 s0, v40, 2
; FLATSCR-DAG: v_readlane_b32 s1, v40, 3
; GCN: v_readlane_b32 s35, v40, 1
; GCN: v_readlane_b32 s34, v40, 0
; MUBUF-DAG: v_readlane_b32 s4, v41, 13
; MUBUF-DAG: v_readlane_b32 s5, v41, 14
; MUBUF-DAG: v_readlane_b32 s47, v41, 12
; MUBUF-DAG: v_readlane_b32 s46, v41, 11
; MUBUF-DAG: v_readlane_b32 s44, v41, 10
; MUBUF-DAG: v_readlane_b32 s43, v41, 9
; MUBUF-DAG: v_readlane_b32 s42, v41, 8
; MUBUF-DAG: v_readlane_b32 s41, v41, 7
; MUBUF-DAG: v_readlane_b32 s40, v41, 6
; MUBUF-DAG: v_readlane_b32 s39, v41, 5
; MUBUF-DAG: v_readlane_b32 s38, v41, 4
; MUBUF-DAG: v_readlane_b32 s37, v41, 3
; MUBUF-DAG: v_readlane_b32 s36, v41, 2
; MUBUF-DAG: v_readlane_b32 s35, v41, 1
; MUBUF-DAG: v_readlane_b32 s34, v41, 0
; FLATSCR: v_readlane_b32 s0, v41, 13
; FLATSCR-DAG: v_readlane_b32 s1, v41, 14
; FLATSCR-DAG: v_readlane_b32 s47, v41, 12
; FLATSCR-DAG: v_readlane_b32 s46, v41, 11
; FLATSCR-DAG: v_readlane_b32 s44, v41, 10
; FLATSCR-DAG: v_readlane_b32 s43, v41, 9
; FLATSCR-DAG: v_readlane_b32 s42, v41, 8
; FLATSCR-DAG: v_readlane_b32 s41, v41, 7
; FLATSCR-DAG: v_readlane_b32 s40, v41, 6
; FLATSCR-DAG: v_readlane_b32 s39, v41, 5
; FLATSCR-DAG: v_readlane_b32 s38, v41, 4
; FLATSCR-DAG: v_readlane_b32 s37, v41, 3
; FLATSCR-DAG: v_readlane_b32 s36, v41, 2
; FLATSCR-DAG: v_readlane_b32 s35, v41, 1
; FLATSCR-DAG: v_readlane_b32 s34, v41, 0
; FLATSCR-DAG: v_readlane_b32 s33, v41, 15
; GCN: v_readlane_b32 s33, v40, 4
; MUBUF: buffer_load_dword
; FLATSCR: scratch_load_dword
; GCN: s_setpc_b64
@ -90,19 +54,19 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
}
; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
; MUBUF: buffer_store_dword v41
; GCN: v_writelane_b32 v41, s33, 15
; MUBUF: buffer_store_dword v40
; FLATSCR: scratch_store_dword off, v40
; GCN: v_writelane_b32 v40, s33, 4
; GCN: s_mov_b32 s33, s32
; FLATSCR: s_add_u32 s32, s32, 16
; FLATSCR: scratch_store_dword off, v40
; MUBUF: s_add_u32 s32, s32, 0x400
; FLATSCR: s_add_u32 s32, s32, 16
; GCN: s_swappc_b64
; GCN-DAG: s_swappc_b64
; GCN-NEXT: s_swappc_b64
; GCN: v_readlane_b32 s33, v41, 15
; MUBUF: buffer_load_dword v41
; FLATSCR: scratch_load_dword v41
; GCN: v_readlane_b32 s33, v40, 4
; MUBUF: buffer_load_dword v40
; FLATSCR: scratch_load_dword v40
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
@ -160,7 +124,7 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
; GCN: v_mov_b32_e32 v40, v31
; GCN-DAG: s_swappc_b64
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: v_mov_b32_e32 v31, v40
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
@ -172,18 +136,18 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
; FIXME: What is the expected behavior for reserved registers here?
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
; MUBUF: s_getpc_b64 s[18:19]
; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[16:17]
; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN: #ASMSTART
; GCN-NEXT: ; def s33
; GCN-NEXT: #ASMEND
; MUBUF: s_swappc_b64 s[30:31], s[18:19]
; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN: ;;#ASMSTART
; GCN-NEXT: ; use s33
; GCN-NEXT: ;;#ASMEND
@ -199,12 +163,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
; GCN-NOT: s34
; MUBUF: s_getpc_b64 s[18:19]
; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[16:17]
; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: s34
@ -213,8 +177,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
; GCN-NEXT: ;;#ASMEND
; GCN-NOT: s34
; MUBUF: s_swappc_b64 s[30:31], s[18:19]
; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: s34
@ -232,12 +196,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
; GCN-NOT: v32
; MUBUF: s_getpc_b64 s[18:19]
; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[16:17]
; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
; MUBUF: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
; FLATSCR: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
; GCN: s_mov_b32 s32, 0
; GCN-NOT: v40
@ -245,8 +209,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
; GCN-NEXT: ; def v40
; GCN-NEXT: ;;#ASMEND
; MUBUF: s_swappc_b64 s[30:31], s[18:19]
; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
; GCN-NOT: v40

View File

@ -5,30 +5,20 @@
define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_arg_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_load_dword s14, s[8:9], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, s14
; GCN-NEXT: ds_read_b32 v3, v3
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v3
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_endpgm
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
call void @func(i32 %vgpr)
ret void
@ -38,29 +28,21 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0
define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_memory_no_dep:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_add_u32 s8, s8, 16
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v3, v3, s[14:15]
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_endpgm
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v0, v0, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_endpgm
store i32 0, i32 addrspace(1)* %ptr
call void @func(i32 0)
ret void
@ -69,29 +51,21 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
; Should not wait after the call before memory
define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call:
; GCN: %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_add_u32 s8, s8, 16
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: global_store_dword v40, v40, s[34:35]
; GCN-NEXT: s_endpgm
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: global_store_dword v40, v40, s[34:35]
; GCN-NEXT: s_endpgm
call void @func(i32 0)
store i32 0, i32 addrspace(1)* %ptr
ret void
@ -100,28 +74,20 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_add_u32 s8, s8, 16
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func.return@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func.return@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: global_store_dword v40, v0, s[34:35]
; GCN-NEXT: s_endpgm
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: global_store_dword v40, v0, s[34:35]
; GCN-NEXT: s_endpgm
%rv = call i32 @func.return(i32 0)
store i32 %rv, i32 addrspace(1)* %ptr
ret void
@ -131,27 +97,19 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)*
define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: call_got_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_add_u32 s8, s8, 16
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_getpc_b64 s[14:15]
; GCN-NEXT: s_add_u32 s14, s14, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s15, s15, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_endpgm
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
call void @got.func(i32 0)
ret void
}
@ -160,14 +118,14 @@ define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 {
; GCN-LABEL: tailcall_got_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[16:17]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
tail call void @got.func(i32 0)
ret void
}
@ -176,12 +134,12 @@ define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 {
define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: tail_call_memory_arg_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[16:17]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[4:5]
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
tail call void @func(i32 %vgpr)
ret void

View File

@ -1,5 +1,5 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}use_dispatch_ptr:
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]

View File

@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -15,7 +15,7 @@ define void @use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}use_workitem_id_y:
; GCN: s_waitcnt
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -27,7 +27,7 @@ define void @use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}use_workitem_id_z:
; GCN: s_waitcnt
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -39,10 +39,9 @@ define void @use_workitem_id_z() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xy:
; GCN: s_waitcnt
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-NEXT: s_waitcnt
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -56,13 +55,11 @@ define void @use_workitem_id_xy() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xyz:
; GCN: s_waitcnt
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-NEXT: s_waitcnt
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-NEXT: s_waitcnt
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -78,10 +75,9 @@ define void @use_workitem_id_xyz() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xz:
; GCN: s_waitcnt
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-NEXT: s_waitcnt
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -95,10 +91,9 @@ define void @use_workitem_id_xz() #1 {
; GCN-LABEL: {{^}}use_workitem_id_yz:
; GCN: s_waitcnt
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-NEXT: s_waitcnt
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -112,9 +107,11 @@ define void @use_workitem_id_yz() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
; GCN: .amdhsa_system_vgpr_workitem_id 2
; GCN: .amdhsa_system_vgpr_workitem_id 0
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
call void @use_workitem_id_x()
ret void
@ -122,10 +119,14 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NOT: v0
; GCN-NOT: v1
; UNPACKED-TID: v_lshlrev_b32_e32 v0, 10, v1
; UNPACKED-TID-NOT: v0
; UNPACKED-TID-NOT: v1
; GCN: s_swappc_b64
; GCN: .amdhsa_system_vgpr_workitem_id 2
; GCN: .amdhsa_system_vgpr_workitem_id 1
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
call void @use_workitem_id_y()
ret void
@ -133,7 +134,11 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
; UNPACKED-TID: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NOT: v0
; GCN-NOT: v2
; UNPACKED-TID: v_lshlrev_b32_e32 v0, 20, v2
; UNPACKED-TID-NOT: v0
; UNPACKED-TID-NOT: v1
; GCN: s_swappc_b64
; GCN: .amdhsa_system_vgpr_workitem_id 2
@ -147,6 +152,8 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
; UNPACKED-TID-NOT: v1
; UNPACKED-TID: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]]
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
call void @use_workitem_id_xy()
@ -157,7 +164,9 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
; UNPACKED-TID-NOT: v0
; UNPACKED-TID-NOT: v2
; UNPACKED-TID: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]]
; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDZ]]
; GCN-NOT: v0
; GCN-NOT: v2
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
call void @use_workitem_id_xz()
@ -169,9 +178,9 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
; UNPACKED-TID-NOT: v2
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]]
; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]]
; UNPACKED-TID: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
; GCN-NOT: v1
; GCN-NOT: v2
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
call void @use_workitem_id_yz()
@ -185,7 +194,8 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDY]]
; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, [[IDZ]]
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
; GCN-NOT: v0
; GCN-NOT: v1
; GCN-NOT: v2
; GCN: s_swappc_b64
@ -223,8 +233,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
; GCN: s_waitcnt
; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
@ -235,9 +245,8 @@ define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
; GCN: s_waitcnt
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: s_waitcnt
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.y()
@ -248,9 +257,8 @@ define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
; GCN: s_waitcnt
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: s_waitcnt
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.z()
@ -262,10 +270,11 @@ define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN: s_swappc_b64
; GCN: .amdhsa_system_vgpr_workitem_id 2
; GCN: .amdhsa_system_vgpr_workitem_id 0
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
call void @other_arg_use_workitem_id_x(i32 555)
ret void
@ -275,13 +284,14 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1
; PACKED-TID: v_mov_b32_e32 v31, v0
; PACKED-TID: v_mov_b32_e32 v1, v0
; GCN-NOT: v1
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN-NOT: v1
; GCN: s_swappc_b64
; GCN-NOT: v0
; GCN: .amdhsa_system_vgpr_workitem_id 2
; GCN: .amdhsa_system_vgpr_workitem_id 1
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
call void @other_arg_use_workitem_id_y(i32 555)
ret void
@ -290,8 +300,8 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2
; PACKED-TID-DAG: v_mov_b32_e32 v31, v0
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 20, v2
; PACKED-TID-DAG: v_mov_b32_e32 v1, v0
; GCN: s_swappc_b64
; GCN-NOT: v0
@ -302,10 +312,9 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}}
; GCN: v_and_b32_e32 v31, 0x3ff, v31
; GFX7: buffer_load_dword v0, off, s[0:3], s32{{$}}
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v0
; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
; GCN: v_and_b32_e32 v32, 0x3ff, v32
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: s_setpc_b64
define void @too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
@ -357,11 +366,10 @@ define void @too_many_args_use_workitem_id_x(
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s32, 0
; GFX7: buffer_store_dword v3, off, s[0:3], s32{{$}}
; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; GCN: .amdhsa_system_vgpr_workitem_id 2
; GCN: .amdhsa_system_vgpr_workitem_id 0
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
call void @too_many_args_use_workitem_id_x(
i32 10, i32 20, i32 30, i32 40,
@ -377,7 +385,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s33, s32
; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
store volatile i32 %arg0, i32 addrspace(1)* undef
@ -425,13 +433,13 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; frame[2] = VGPR spill slot
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
; GFX7: buffer_load_dword v0, off, s[0:3], s32
; GFX90A: buffer_load_dword v32, off, s[0:3], s32
; GFX7: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX90A: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-DAG: s_waitcnt
; GFX7: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX90A: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32,
; GFX7: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
; GFX90A: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
; GFX7: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
; GFX90A: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
; GCN: s_setpc_b64
define void @too_many_args_use_workitem_id_x_byval(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
@ -486,18 +494,17 @@ define void @too_many_args_use_workitem_id_x_byval(
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN-DAG: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
; GCN-DAG: s_movk_i32 s32, 0x400
; GFX7: buffer_store_dword v3, off, s[0:3], s32
; GFX90A: buffer_store_dword v0, off, s[0:3], s32
; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
; GCN: .amdhsa_system_vgpr_workitem_id 2
; GCN: .amdhsa_system_vgpr_workitem_id 0
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 999, i32 addrspace(5)* %alloca
@ -515,12 +522,11 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; GCN: buffer_store_dword v40, off, s[0:3], s32 offset:4
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
@ -541,20 +547,21 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}}
; GFX90A: v_and_b32_e32 v33, 0x3ff, v31
; GFX90A: v_bfe_u32 v33, v31, 10, 10
; GCN90A: v_bfe_u32 v31, v31, 20, 10
; GFX7: v_and_b32_e32 v32, 0x3ff, v31
; GFX7: v_bfe_u32 v32, v31, 10, 10
; GCN7: v_bfe_u32 v31, v31, 20, 10
; GFX7: buffer_load_dword v0, off, s[0:3], s32{{$}}
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v12
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v30{{$}}
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v0{{$}}
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v29, off{{$}}
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v30, off{{$}}
; GFX90A: v_and_b32_e32 v33, 0x3ff, v32
; GFX90A: v_bfe_u32 v34, v32, 10, 10
; GCN90A: v_bfe_u32 v32, v32, 20, 10
; GFX7: buffer_load_dword v32, off, s[0:3], s32{{$}}
; GFX7: v_and_b32_e32 v33, 0x3ff, v32
; GFX7: v_bfe_u32 v33, v32, 10, 10
; GCN7: v_bfe_u32 v32, v32, 20, 10
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33{{$}}
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}}
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v33, off{{$}}
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v34, off{{$}}
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off{{$}}
; GFX7-COUNT-32: flat_store_dword v{{\[[0-9]+:[0-9]+]}}
; GFX90A-COUNT-32: global_store_dword v{{\[[0-9]+:[0-9]+]}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @too_many_args_use_workitem_id_xyz(
@ -617,11 +624,11 @@ define void @too_many_args_use_workitem_id_xyz(
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2
; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v2
; PACKED-TID-NOT: v0
; PACKED-TID-NOT: v1
; PACKED-TID-NOT: v2
; GFX7: buffer_store_dword v3, off, s[0:3], s32{{$}}
; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; GCN: .amdhsa_system_vgpr_workitem_id 2

View File

@ -1,8 +1,10 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s
; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s
; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -14,7 +16,8 @@ define void @use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}use_workitem_id_y:
; GCN: s_waitcnt
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -26,7 +29,8 @@ define void @use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}use_workitem_id_z:
; GCN: s_waitcnt
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@ -38,9 +42,11 @@ define void @use_workitem_id_z() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xy:
; GCN: s_waitcnt
; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
@ -57,10 +63,13 @@ define void @use_workitem_id_xy() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xyz:
; GCN: s_waitcnt
; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
@ -80,9 +89,11 @@ define void @use_workitem_id_xyz() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xz:
; GCN: s_waitcnt
; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
@ -98,9 +109,11 @@ define void @use_workitem_id_xz() #1 {
; GCN-LABEL: {{^}}use_workitem_id_yz:
; GCN: s_waitcnt
; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
@ -115,31 +128,38 @@ define void @use_workitem_id_yz() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
; GCN: enable_vgpr_workitem_id = 2
; VARABI: enable_vgpr_workitem_id = 0
; FIXEDABI: enable_vgpr_workitem_id = 2
; FIXEDA-NOT: v0
; VARABI-NOT: v31
; GCN: s_swappc_b64
; GCN-NOT: v0
; FIXEDABI-NOT: v0
; VARABI-NOT: v31
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
call void @use_workitem_id_x()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
; GCN: enable_vgpr_workitem_id = 2
; VARABI: enable_vgpr_workitem_id = 1
; FIXEDABI: enable_vgpr_workitem_id = 2
; GCN-NOT: v0
; GCN-NOT: v1
; FIXEDABI-NOT: v0
; FIXEDABI-NOT: v1
; VARABI-NOT: v31
; VARABI: v_lshlrev_b32_e32 v0, 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN-NOT: v0
; GCN-NOT: v1
; FIXEDABI-NOT: v0
; FIXEDABI-NOT: v1
; VARABI-NOT: v31
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
@ -150,11 +170,16 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2
; VARABI-NOT: v0
; VARABI-NOT: v2
; VARABI: v_lshlrev_b32_e32 v0, 20, v2
; VARABI-NOT: v0
; VARABI-NOT: v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
@ -163,11 +188,17 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy:
; VARABI-NOT: v0
; VARABI-NOT: v1
; VARABI: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; VARABI: v_or_b32_e32 v0, v0, [[IDY]]
; VARABI-NOT: v0
; VARABI-NOT: v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
@ -176,12 +207,18 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz:
; VARABI-NOT: v0
; VARABI-NOT: v2
; VARABI: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
; VARABI: v_or_b32_e32 v0, v0, [[IDZ]]
; VARABI-NOT: v0
; VARABI-NOT: v2
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
@ -190,12 +227,19 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz:
; VARABI-NOT: v1
; VARABI-NOT: v2
; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
; VARABI: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
; VARABI-NOT: v1
; VARABI-NOT: v2
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
@ -204,11 +248,21 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz:
; VARABI-NOT: v0
; VARABI-NOT: v1
; VARABI-NOT: v2
; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDY]]
; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
; VARABI-NOT: v0
; VARABI-NOT: v1
; VARABI-NOT: v2
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 {
@ -245,7 +299,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
; GCN: s_waitcnt
; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; VARABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
; FIXEDABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
@ -258,7 +313,8 @@ define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
; GCN: s_waitcnt
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
@ -270,7 +326,8 @@ define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
; GCN: s_waitcnt
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
@ -282,13 +339,16 @@ define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
; GCN: enable_vgpr_workitem_id = 2
; VARABI: enable_vgpr_workitem_id = 0
; FIXEDABI: enable_vgpr_workitem_id = 2
; VARABI: v_mov_b32_e32 v1, v0
; VARABI: v_mov_b32_e32 v0, 0x22b
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
@ -298,13 +358,20 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
; VARABI: enable_vgpr_workitem_id = 1
; VARABI: v_lshlrev_b32_e32 v1, 10, v1
; VARABI-NOT: v1
; VARABI: v_mov_b32_e32 v0, 0x22b
; VARABI-NOT: v1
; VARABI: s_swappc_b64
; VARABI-NOT: v0
; GCN: enable_vgpr_workitem_id = 2
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI: enable_vgpr_workitem_id = 2
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
call void @other_arg_use_workitem_id_y(i32 555)
ret void
@ -313,21 +380,29 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2
; VARABI-DAG: v_mov_b32_e32 v0, 0x22b
; VARABI-DAG: v_lshlrev_b32_e32 v1, 20, v2
; VARABI: s_swappc_b64
; VARABI-NOT: v0
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
call void @other_arg_use_workitem_id_z(i32 555)
ret void
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
; VARABI: v_and_b32_e32 v32, 0x3ff, v32
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; VARABI: s_setpc_b64
; GCN: v_and_b32_e32 v31, 0x3ff, v31
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
define void @too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
@ -376,19 +451,23 @@ define void @too_many_args_use_workitem_id_x(
}
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
; VARABI: enable_vgpr_workitem_id = 0
; VARABI: s_mov_b32 s32, 0
; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}}
; VARABI: s_swappc_b64
; FIXEDABI: enable_vgpr_workitem_id = 2
; FIXEDABI-DAG: s_mov_b32 s32, 0
; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
; GCN: enable_vgpr_workitem_id = 2
; GCN-DAG: s_mov_b32 s32, 0
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; FIXEDABI: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
call void @too_many_args_use_workitem_id_x(
i32 10, i32 20, i32 30, i32 40,
@ -403,13 +482,15 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; VARABI: s_mov_b32 s33, s32
; VARABI: buffer_store_dword v1, off, s[0:3], s32{{$}}
; Touching the workitem id register is not necessary.
; GCN-NOT: v31
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
; GCN-NOT: v31
; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
; GCN-NOT: v31
; FIXEDABI-NOT: v31
; FIXEDABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
; FIXEDABI-NOT: v31
; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
; FIXEDABI-NOT: v31
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
@ -458,15 +539,21 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; frame[2] = VGPR spill slot
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VARABI-NEXT: s_waitcnt
; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
; VARABI: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
; VARABI: s_setpc_b64
; GCN: v_and_b32_e32 v31, 0x3ff, v31
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
; GCN: s_setpc_b64
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
; FIXEDABI: s_setpc_b64
define void @too_many_args_use_workitem_id_x_byval(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
@ -520,27 +607,36 @@ define void @too_many_args_use_workitem_id_x_byval(
; sp[2] = stack passed workitem ID x
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
; VARABI: enable_vgpr_workitem_id = 0
; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; VARABI: s_movk_i32 s32, 0x400{{$}}
; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
; VARABI: s_swappc_b64
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
; GCN: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
; GCN: s_movk_i32 s32, 0x400{{$}}
; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; FIXME: Why this reload?
; GCN: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN-NOT: s32
; GCN: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
; GCN: s_swappc_b64
; FIXEDABI-NOT: s32
; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
; FIXEDABI: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 999, i32 addrspace(5)* %alloca
@ -558,19 +654,26 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; VARABI: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
; VARABI: s_swappc_b64
; FIXEDABI-NOT: v31
; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
; FIXEDABI-NOT: v31
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; FIXEDABI-NOT: v31
; GCN: s_swappc_b64
; FIXEDABI: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 999, i32 addrspace(5)* %alloca
@ -588,17 +691,29 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
; VARABI-NOT: buffer_load_dword
; VARABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32
; VARABI-NOT: buffer_load_dword
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
; VARABI-NOT: buffer_load_dword
; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10
; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
; VARABI: s_setpc_b64
; GCN: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
; GCN-NOT: buffer_load_dword
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
; GCN-NOT: buffer_load_dword
; GCN: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
; GCN-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
; FIXEDABI-NOT: buffer_load_dword
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
; FIXEDABI-NOT: buffer_load_dword
; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
define void @too_many_args_use_workitem_id_xyz(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
@ -659,10 +774,12 @@ define void @too_many_args_use_workitem_id_xyz(
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]]
; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}}
; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {

View File

@ -64,64 +64,45 @@ entry:
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_add_u32 s4, s4, s7
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: s_getpc_b64 s[4:5]
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_getpc_b64 s[4:5]
; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
; GFX1010-NEXT: s_add_u32 s4, s4, s7
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_getpc_b64 s[4:5]
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT: s_endpgm
entry:
tail call void @ex() #0
ret void
@ -130,73 +111,54 @@ entry:
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_add_u32 s4, s4, s7
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: v_mov_b32_e32 v0, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: s_getpc_b64 s[4:5]
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: s_getpc_b64 s[4:5]
; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s4, s4, s7
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_getpc_b64 s[4:5]
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %x, align 4
@ -209,7 +171,7 @@ define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_mov_b32 s33, 0
@ -271,67 +233,48 @@ entry:
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_add_u32 s4, s4, s7
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: s_getpc_b64 s[4:5]
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_getpc_b64 s[4:5]
; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT s_add_u32 s12, s12, s17
; GFX1010-NEXT s_mov_b32 s32, 0
; GFX1010-NEXT s_mov_b32 s33, 0
; GFX1010-NEXT s_addc_u32 s13, s13, 0
; GFX1010-NEXT s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT s_add_u32 s0, s0, s17
; GFX1010-NEXT s_addc_u32 s1, s1, 0
; GFX1010-NEXT s_mov_b32 s12, s14
; GFX1010-NEXT s_mov_b32 s13, s15
; GFX1010-NEXT v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT s_mov_b32 s14, s16
; GFX1010-NEXT s_getpc_b64 s[18:19]
; GFX1010-NEXT s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT s_endpgm
; GFX1010-NEXT: s_add_u32 s4, s4, s7
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_getpc_b64 s[4:5]
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT: s_endpgm
entry:
tail call void @ex() #2
ret void
@ -340,76 +283,57 @@ entry:
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_add_u32 s4, s4, s7
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: s_getpc_b64 s[4:5]
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: s_getpc_b64 s[4:5]
; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
; GFX1010-NEXT: s_add_u32 s4, s4, s7
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_getpc_b64 s[4:5]
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT: s_endpgm
entry:
%x = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %x, align 4

View File

@ -27,18 +27,18 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-LABEL: call_split_type_used_outside_block_v2f32:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
@ -59,29 +59,30 @@ bb1:
define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-LABEL: call_split_type_used_outside_block_v3f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call <3 x float> @func_v3f32()
br label %bb1
@ -93,29 +94,28 @@ bb1:
define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-LABEL: call_split_type_used_outside_block_v4f16:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call <4 x half> @func_v4f16()
br label %bb1
@ -128,29 +128,29 @@ bb1:
define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-LABEL: call_split_type_used_outside_block_struct:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
br label %bb1
@ -168,40 +168,32 @@ bb1:
define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-LABEL: v3i16_registers:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s12, 1, s12
; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1
; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
; GCN-NEXT: s_cbranch_vccnz BB4_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_branch BB4_3
; GCN-NEXT: BB4_2:
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: BB4_3: ; %if.end
; GCN-NEXT: global_store_short v[0:1], v1, off
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, 1, s4
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_cbranch_vccnz BB4_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_branch BB4_3
; GCN-NEXT: BB4_2:
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: BB4_3: ; %if.end
; GCN-NEXT: global_store_short v[0:1], v1, off
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
entry:
br i1 %cond, label %if.then, label %if.else
@ -221,36 +213,32 @@ if.end: ; preds = %if.else, %if.then
define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-LABEL: v3f16_registers:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s12, 1, s12
; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1
; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
; GCN-NEXT: s_cbranch_vccnz BB5_2
; GCN-NEXT: %bb.1: ; %if.else
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_branch BB5_3
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, 1, s4
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_cbranch_vccnz BB5_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_branch BB5_3
; GCN-NEXT: BB5_2:
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: BB5_3: ; %if.end
; GCN-NEXT: global_store_short v[0:1], v1, off
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
entry:
br i1 %cond, label %if.then, label %if.else

View File

@ -0,0 +1,25 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: define internal void @indirect() #0 {
define internal void @indirect() {
ret void
}
; GCN-LABEL: define internal void @direct() #1 {
define internal void @direct() {
%fptr = alloca void()*
store void()* @indirect, void()** %fptr
%fp = load void()*, void()** %fptr
call void %fp()
ret void
}
; GCN-LABEL: define amdgpu_kernel void @test_direct_indirect_call() #2 {
define amdgpu_kernel void @test_direct_indirect_call() {
call void @direct()
ret void
}
; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
; attributes #2 = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }

View File

@ -0,0 +1,22 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: define internal void @indirect() #0 {
define internal void @indirect() {
ret void
}
; GCN-LABEL: define amdgpu_kernel void @test_simple_indirect_call() #1 {
define amdgpu_kernel void @test_simple_indirect_call() #0 {
%fptr = alloca void()*
store void()* @indirect, void()** %fptr
%fp = load void()*, void()** %fptr
call void %fp()
ret void
}
attributes #0 = { "amdgpu-dispatch-id" }
; compiler modification to attributes
attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
attributes #1 = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }

View File

@ -11,25 +11,25 @@ define float @fdiv_f32(float %a, float %b) #0 {
; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: %13:vgpr_32, %14:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: %15:vgpr_32, %16:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: %17:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec
; GCN: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
; GCN: %21:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
; GCN: %22:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec
; GCN: %23:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec
; GCN: %24:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec
; GCN: %25:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec
; GCN: %26:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec
; GCN: %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
; GCN: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec
; GCN: %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec
; GCN: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec
; GCN: %18:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec
; GCN: %19:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
; GCN: $vcc = COPY %14
; GCN: %27:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec
; GCN: %28:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: $vcc = COPY %7
; GCN: %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec
; GCN: %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; GCN: $vgpr0 = COPY %28
; GCN: $vgpr0 = COPY %21
; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
entry:
@ -44,25 +44,25 @@ define float @fdiv_nnan_f32(float %a, float %b) #0 {
; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN: %13:vgpr_32, %14:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: %15:vgpr_32, %16:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: %17:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec
; GCN: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
; GCN: %21:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
; GCN: %22:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec
; GCN: %23:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec
; GCN: %24:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec
; GCN: %25:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec
; GCN: %26:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec
; GCN: %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
; GCN: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec
; GCN: %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec
; GCN: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec
; GCN: %18:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec
; GCN: %19:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
; GCN: $vcc = COPY %14
; GCN: %27:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec
; GCN: %28:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: $vcc = COPY %7
; GCN: %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec
; GCN: %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; GCN: $vgpr0 = COPY %28
; GCN: $vgpr0 = COPY %21
; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
entry:

View File

@ -31,7 +31,7 @@ define hidden void @func() #1 {
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
; GCN: ; NumSgprs: 37
; GCN: ; NumVgprs: 32
; GCN: ; NumVgprs: 9
define amdgpu_kernel void @kernel_call() #0 {
%vgpr = load volatile i32, i32 addrspace(1)* undef
tail call void @func()
@ -53,7 +53,7 @@ define amdgpu_kernel void @kernel_call() #0 {
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
; GCN: ; NumSgprs: 32
; GCN: ; NumVgprs: 32
; GCN: ; NumVgprs: 9
define void @func_regular_call() #1 {
%vgpr = load volatile i32, i32 addrspace(1)* undef
tail call void @func()
@ -63,13 +63,13 @@ define void @func_regular_call() #1 {
; GCN-LABEL: {{^}}func_tail_call:
; GCN: s_waitcnt
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16,
; GCN-NEXT: s_addc_u32 s17,
; GCN-NEXT: s_setpc_b64 s[16:17]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4,
; GCN-NEXT: s_addc_u32 s5,
; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN: ; NumSgprs: 32
; GCN: ; NumVgprs: 32
; GCN: ; NumVgprs: 8
define void @func_tail_call() #1 {
tail call void @func()
ret void
@ -82,7 +82,7 @@ define void @func_tail_call() #1 {
; GCN: s_setpc_b64
; GCN: ; NumSgprs: 32
; GCN: ; NumVgprs: 32
; GCN: ; NumVgprs: 9
define void @func_call_tail_call() #1 {
%vgpr = load volatile i32, i32 addrspace(1)* undef
tail call void @func()

View File

@ -13,9 +13,9 @@ define void @func_use_lds_global() {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_write_b32 v0, v0
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@ -37,7 +37,7 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-LABEL: func_use_lds_global_constexpr_cast:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8-NEXT: s_trap 2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -59,8 +59,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
@ -72,8 +71,7 @@ define void @func_implicitarg_ptr() #0 {
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
@ -114,11 +112,10 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
; HSA: kernarg_segment_byte_size = 112
; MESA: kernarg_segment_byte_size = 128
; HSA: s_add_u32 s8, s8, 0x70
; HSA: s_add_u32 s4, s4, 0x70
; MESA: s_add_u32 s4, s4, 0x70
; HSA: s_addc_u32 s9, s9, 0{{$}}
; MESA: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
call void @func_implicitarg_ptr()
@ -130,10 +127,8 @@ define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
; HSA: kernarg_segment_byte_size = 160
; MESA: kernarg_segment_byte_size = 128
; HSA: s_add_u32 s8, s8, 0x70
; HSA: s_addc_u32 s9, s9, 0{{$}}
; MESA: s_add_u32 s4, s4, 0x70
; MESA: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_add_u32 s4, s4, 0x70
; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
call void @func_implicitarg_ptr()
@ -141,24 +136,18 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #
}
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
; HSA-NOT: s8
; HSA-NOT: s9
; HSA-NOT: s[8:9]
; MESA-NOT: s4
; MESA-NOT: s5
; MESA-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN-NOT: s[4:5]
define void @func_call_implicitarg_ptr_func() #0 {
call void @func_implicitarg_ptr()
ret void
}
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
; HSA-NOT: s8
; HSA-NOT: s9
; HSA-NOT: s[8:9]
; MESA-NOT: s4
; MESA-NOT: s5
; MESA-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN-NOT: s[4:5]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
call void @func_implicitarg_ptr()
ret void
@ -168,8 +157,7 @@ define void @opencl_func_call_implicitarg_ptr_func() #0 {
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
@ -185,8 +173,7 @@ define void @func_kernarg_implicitarg_ptr() #0 {
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
@ -199,10 +186,8 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 {
}
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
; HSA: s_add_u32 s8, s8, 0x70
; HSA: s_addc_u32 s9, s9, 0
; MESA: s_add_u32 s4, s4, 0x70
; MESA: s_addc_u32 s5, s5, 0
; GCN: s_add_u32 s4, s4, 0x70
; GCN: s_addc_u32 s5, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
call void @func_kernarg_implicitarg_ptr()

View File

@ -187,98 +187,49 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-LABEL: slsr1_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-NEXT: v_writelane_b32 v44, s33, 15
; GFX9-NEXT: v_writelane_b32 v44, s34, 0
; GFX9-NEXT: v_writelane_b32 v44, s35, 1
; GFX9-NEXT: v_writelane_b32 v44, s36, 2
; GFX9-NEXT: v_writelane_b32 v44, s37, 3
; GFX9-NEXT: v_writelane_b32 v44, s38, 4
; GFX9-NEXT: v_writelane_b32 v44, s39, 5
; GFX9-NEXT: v_writelane_b32 v44, s40, 6
; GFX9-NEXT: v_writelane_b32 v44, s41, 7
; GFX9-NEXT: v_writelane_b32 v44, s42, 8
; GFX9-NEXT: v_writelane_b32 v44, s43, 9
; GFX9-NEXT: v_writelane_b32 v44, s44, 10
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5]
; GFX9-NEXT: v_writelane_b32 v44, s46, 11
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
; GFX9-NEXT: v_writelane_b32 v44, s47, 12
; GFX9-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v41, v1
; GFX9-NEXT: v_mov_b32_e32 v42, v0
; GFX9-NEXT: v_writelane_b32 v44, s30, 13
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX9-NEXT: v_writelane_b32 v44, s31, 14
; GFX9-NEXT: v_mov_b32_e32 v40, v31
; GFX9-NEXT: s_mov_b32 s42, s14
; GFX9-NEXT: s_mov_b32 s43, s13
; GFX9-NEXT: s_mov_b32 s44, s12
; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11]
; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9]
; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7]
; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41
; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s44
; GFX9-NEXT: s_mov_b32 s13, s43
; GFX9-NEXT: s_mov_b32 s14, s42
; GFX9-NEXT: v_mov_b32_e32 v31, v40
; GFX9-NEXT: v_mov_b32_e32 v0, v41
; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
; GFX9-NEXT: v_add_u32_e32 v0, v41, v43
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37]
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
; GFX9-NEXT: s_mov_b32 s12, s44
; GFX9-NEXT: s_mov_b32 s13, s43
; GFX9-NEXT: s_mov_b32 s14, s42
; GFX9-NEXT: v_mov_b32_e32 v31, v40
; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v44, 13
; GFX9-NEXT: v_readlane_b32 s5, v44, 14
; GFX9-NEXT: v_readlane_b32 s47, v44, 12
; GFX9-NEXT: v_readlane_b32 s46, v44, 11
; GFX9-NEXT: v_readlane_b32 s44, v44, 10
; GFX9-NEXT: v_readlane_b32 s43, v44, 9
; GFX9-NEXT: v_readlane_b32 s42, v44, 8
; GFX9-NEXT: v_readlane_b32 s41, v44, 7
; GFX9-NEXT: v_readlane_b32 s40, v44, 6
; GFX9-NEXT: v_readlane_b32 s39, v44, 5
; GFX9-NEXT: v_readlane_b32 s38, v44, 4
; GFX9-NEXT: v_readlane_b32 s37, v44, 3
; GFX9-NEXT: v_readlane_b32 s36, v44, 2
; GFX9-NEXT: v_readlane_b32 s35, v44, 1
; GFX9-NEXT: v_readlane_b32 s34, v44, 0
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
; GFX9-NEXT: v_readlane_b32 s33, v44, 15
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v43, s33, 4
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
; GFX9-NEXT: v_writelane_b32 v43, s34, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
; GFX9-NEXT: v_writelane_b32 v43, s35, 1
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_mov_b32_e32 v40, v1
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_writelane_b32 v43, s30, 2
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
; GFX9-NEXT: v_writelane_b32 v43, s31, 3
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42
; GFX9-NEXT: v_mov_b32_e32 v0, v40
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_add_u32_e32 v0, v40, v42
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v43, 2
; GFX9-NEXT: v_readlane_b32 s5, v43, 3
; GFX9-NEXT: v_readlane_b32 s35, v43, 1
; GFX9-NEXT: v_readlane_b32 s34, v43, 0
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
; GFX9-NEXT: v_readlane_b32 s33, v43, 4
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]
%b = and i32 %b.arg, 16777215
%s = and i32 %s.arg, 16777215

View File

@ -27,23 +27,23 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s15, s33
; CHECK-NEXT: s_mov_b32 s8, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_add_u32 s32, s32, 0x400
; CHECK-NEXT: s_getpc_b64 s[18:19]
; CHECK-NEXT: s_add_u32 s18, s18, callee_has_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, callee_has_fp@rel32@hi+12
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 s[16:17], s[30:31]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_mov_b64 s[6:7], s[30:31]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_sub_u32 s32, s32, 0x400
; CHECK-NEXT: s_mov_b32 s33, s15
; CHECK-NEXT: s_mov_b32 s33, s8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[16:17]
; CHECK-NEXT: s_setpc_b64 s[6:7]
bb:
call fastcc void @callee_has_fp()
call void asm sideeffect "; clobber csr v40", "~{v40}"()
@ -53,15 +53,15 @@ bb:
define amdgpu_kernel void @kernel_call() {
; CHECK-LABEL: kernel_call:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-DAG: s_addc_u32 s1, s1, 0
; CHECK-DAG: s_getpc_b64 s[18:19]
; CHECK-DAG: s_add_u32 s18, s18, csr_vgpr_spill_fp_callee@rel32@lo+4
; CHECK-DAG: s_addc_u32 s19, s19, csr_vgpr_spill_fp_callee@rel32@hi+12
; CHECK-DAG: s_mov_b32 s32, 0
; CHECK-DAG: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; CHECK-NEXT: s_add_u32 s0, s0, s7
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_callee@rel32@hi+12
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_callee()
@ -73,23 +73,23 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: v_writelane_b32 v1, s33, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
; CHECK-NEXT: v_readlane_b32 s33, v1, 0
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: s_setpc_b64 s[16:17]
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_setpc_b64 s[4:5]
bb:
call void asm sideeffect "; clobber csr v40", "~{v40}"()
tail call fastcc void @callee_has_fp()
@ -99,15 +99,15 @@ bb:
define amdgpu_kernel void @kernel_tailcall() {
; CHECK-LABEL: kernel_tailcall:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-DAG: s_addc_u32 s1, s1, 0
; CHECK-DAG: s_getpc_b64 s[18:19]
; CHECK-NEXT: s_add_u32 s18, s18, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s19, s19, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; CHECK-NEXT: s_add_u32 s0, s0, s7
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()

View File

@ -238,7 +238,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5
; MUBUF-NEXT: s_mov_b32 s32, s6
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
@ -275,7 +275,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5
; FLATSCR-NEXT: s_mov_b32 s32, s4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
@ -331,13 +331,13 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
; MUBUF-NEXT: v_mov_b32_e32 v4, s6
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
; MUBUF-NEXT: v_mov_b32_e32 v5, s6
; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4
; MUBUF-NEXT: s_mov_b32 s32, s6
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
@ -364,12 +364,12 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
; FLATSCR-NEXT: v_mov_b32_e32 v5, 0
; FLATSCR-NEXT: v_mov_b32_e32 v6, 1
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4
; FLATSCR-NEXT: s_mov_b32 s32, s2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3

View File

@ -10,7 +10,7 @@ define void @child_function() #0 {
; GCN: v_writelane_b32 v255, s33, 2
; GCN: v_writelane_b32 v255, s30, 0
; GCN: v_writelane_b32 v255, s31, 1
; GCN: s_swappc_b64 s[30:31], s[16:17]
; GCN: s_swappc_b64 s[30:31], s[4:5]
; GCN: v_readlane_b32 s30, v255, 0
; GCN: v_readlane_b32 s31, v255, 1
; GCN: v_readlane_b32 s33, v255, 2
@ -56,7 +56,7 @@ define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
; GCN: v_writelane_b32 v254, s33, 2
; GCN: v_writelane_b32 v254, s30, 0
; GCN: v_writelane_b32 v254, s31, 1
; GCN: s_swappc_b64 s[30:31], s[16:17]
; GCN: s_swappc_b64 s[30:31], s[4:5]
; GCN: v_readlane_b32 s30, v254, 0
; GCN: v_readlane_b32 s31, v254, 1
; GCN: v_readlane_b32 s33, v254, 2
@ -150,7 +150,7 @@ ret:
; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call
; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32
; GCN-NOT: v_writelane
; GCN: s_setpc_b64 s[16:17]
; GCN: s_setpc_b64 s[4:5]
define void @reserve_vgpr_with_tail_call() #0 {
%alloca = alloca i32, align 4, addrspace(5)

View File

@ -128,12 +128,12 @@ entry:
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
@ -155,9 +155,12 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
; GCN-NOT: s32
; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
@ -167,7 +170,7 @@ entry:
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
@ -194,14 +197,15 @@ entry:
; Have another non-tail in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
; GCN: s_mov_b32 s33, s32
; GCN-DAG: s_add_u32 s32, s32, 0x800
; GCN-DAG: s_add_u32 s32, s32, 0x400
; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v43, s46, 12
; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v42, s34, 0
; GCN-DAG: v_writelane_b32 v42, s35, 1
; GCN-DAG: s_getpc_b64 s[4:5]
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
@ -210,22 +214,22 @@ entry:
; GCN: s_swappc_b64
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
; GCN-DAG: v_readlane_b32 s35, v43, 1
; GCN-DAG: v_readlane_b32 s34, v43, 0
; GCN-DAG: v_readlane_b32 s34, v42, 0
; GCN-DAG: v_readlane_b32 s35, v42, 1
; GCN: s_sub_u32 s32, s32, 0x800
; GCN: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33,
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: s_setpc_b64 s[16:17]
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
entry:
%other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
@ -243,7 +247,7 @@ entry:
; GCN-NOT: s33
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
; GCN: s_setpc_b64 s[16:17]
; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
%alloca = alloca [16 x i32], align 4, addrspace(5)
@ -255,10 +259,10 @@ entry:
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
; GCN-NOT: s33
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
; GCN-NOT: s33
; GCN: s_setpc_b64 s[16:17]
; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
entry:
%alloca = alloca [16 x i32], align 4, addrspace(5)

View File

@ -0,0 +1,18 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: define internal void @indirect() #0 {
define internal void @indirect() {
ret void
}
; GCN-LABEL: define amdgpu_kernel void @test_simple_indirect_call() #1 {
define amdgpu_kernel void @test_simple_indirect_call() {
%fptr = alloca void()*
store void()* @indirect, void()** %fptr
%fp = load void()*, void()** %fptr
call void %fp()
ret void
}
; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }

View File

@ -1,16 +1,16 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=7 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}spill_csr_s5_copy:
; GCN: s_or_saveexec_b64
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
; GCN: v_writelane_b32 v40, s33, 5
; GCN: v_writelane_b32 v40, s33, 2
; GCN: s_swappc_b64
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
; GCN: v_readlane_b32 s33, v40, 5
; GCN: v_readlane_b32 s33, v40, 2
; GCN: s_or_saveexec_b64
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN: s_mov_b64 exec

View File

@ -157,21 +157,19 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
; GCN-DAG: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
; GCN: v_mov_b32_e32 v32, 0
; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
; GCN: s_mov_b32 s34, s32
; GCN: v_mov_b32_e32 v32, 0
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 offset:4
; GCN-DAG: s_add_u32 s32, s32, 0x30000
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN: s_sub_u32 s32, s32, 0x30000
; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2

View File

@ -5,61 +5,61 @@
define hidden void @widget() {
; GCN-LABEL: widget:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccz BB0_3
; GCN-NEXT: ; %bb.1: ; %bb4
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz BB0_4
; GCN-NEXT: ; %bb.2: ; %bb7
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_branch BB0_7
; GCN-NEXT: BB0_3: ; %bb2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz BB0_6
; GCN-NEXT: BB0_4: ; %bb9
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execnz BB0_7
; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: BB0_6: ; %bb12
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_cbranch_vccz BB0_3
; GCN-NEXT: ; %bb.1: ; %bb4
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz BB0_4
; GCN-NEXT: ; %bb.2: ; %bb7
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_branch BB0_7
; GCN-NEXT: BB0_3: ; %bb2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz BB0_6
; GCN-NEXT: BB0_4: ; %bb9
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execnz BB0_7
; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: BB0_6: ; %bb12
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
; SI-OPT-LABEL: @widget(
; SI-OPT-NEXT: bb:
; SI-OPT-NEXT: [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16
@ -186,124 +186,95 @@ define hidden void @blam() {
; GCN-LABEL: blam:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: v_writelane_b32 v44, s33, 15
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v43, s33, 4
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x800
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_writelane_b32 v44, s34, 0
; GCN-NEXT: v_writelane_b32 v44, s35, 1
; GCN-NEXT: v_writelane_b32 v44, s36, 2
; GCN-NEXT: v_writelane_b32 v44, s38, 3
; GCN-NEXT: v_writelane_b32 v44, s39, 4
; GCN-NEXT: v_writelane_b32 v44, s40, 5
; GCN-NEXT: v_writelane_b32 v44, s41, 6
; GCN-NEXT: v_writelane_b32 v44, s42, 7
; GCN-NEXT: v_writelane_b32 v44, s43, 8
; GCN-NEXT: v_writelane_b32 v44, s44, 9
; GCN-NEXT: v_writelane_b32 v44, s45, 10
; GCN-NEXT: v_writelane_b32 v44, s46, 11
; GCN-NEXT: v_writelane_b32 v44, s47, 12
; GCN-NEXT: v_writelane_b32 v44, s48, 13
; GCN-NEXT: v_writelane_b32 v44, s49, 14
; GCN-NEXT: v_mov_b32_e32 v40, v31
; GCN-NEXT: s_mov_b32 s34, s14
; GCN-NEXT: s_mov_b32 s35, s13
; GCN-NEXT: s_mov_b32 s36, s12
; GCN-NEXT: s_mov_b64 s[38:39], s[10:11]
; GCN-NEXT: s_mov_b64 s[40:41], s[8:9]
; GCN-NEXT: s_mov_b64 s[42:43], s[6:7]
; GCN-NEXT: s_mov_b64 s[44:45], s[4:5]
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: v_writelane_b32 v43, s34, 0
; GCN-NEXT: v_writelane_b32 v43, s35, 1
; GCN-NEXT: v_writelane_b32 v43, s36, 2
; GCN-NEXT: v_writelane_b32 v43, s37, 3
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40
; GCN-NEXT: flat_load_dword v41, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v43, 0
; GCN-NEXT: s_getpc_b64 s[48:49]
; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4
; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12
; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: flat_load_dword v40, v[1:2]
; GCN-NEXT: v_mov_b32_e32 v42, 0
; GCN-NEXT: s_getpc_b64 s[36:37]
; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4
; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12
; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_f32_e64 s[46:47], 0, v41
; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v40
; GCN-NEXT: s_branch BB1_3
; GCN-NEXT: BB1_1: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: BB1_1: ; %bb10
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: BB1_2: ; %bb18
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: BB1_2: ; %bb18
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: BB1_3: ; %bb2
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB1_4 Depth 2
; GCN-NEXT: BB1_3: ; %bb2
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB1_4 Depth 2
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: BB1_4: ; %bb2
; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
; GCN-NEXT: flat_load_dword v0, v[42:43]
; GCN-NEXT: BB1_4: ; %bb2
; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
; GCN-NEXT: flat_load_dword v0, v[41:42]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_cbranch_execz BB1_6
; GCN-NEXT: %bb.5: ; %bb8
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: ; %bb.5: ; %bb8
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GCN-NEXT: s_cbranch_execnz BB1_4
; GCN-NEXT: s_branch BB1_1
; GCN-NEXT: BB1_6: ; %bb6
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: BB1_6: ; %bb6
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz BB1_4
; GCN-NEXT: %bb.7: ; %bb11
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: _or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b64 s[4:5], s[44:45]
; GCN-NEXT: s_mov_b64 s[6:7], s[42:43]
; GCN-NEXT: s_mov_b64 s[8:9], s[40:41]
; GCN-NEXT: s_mov_b64 s[10:11], s[38:39]
; GCN-NEXT: s_mov_b32 s12, s36
; GCN-NEXT: s_mov_b32 s13, s35
; GCN-NEXT: s_mov_b32 s14, s34
; GCN-NEXT: v_mov_b32_e32 v31, v40
; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49]
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_cbranch_execnz BB1_4
; GCN-NEXT: ; %bb.8: ; %bb14
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[46:47]
; GCN-NEXT: s_cbranch_execnz BB1_10
; GCN-NEXT: ; %bb.9: ; %bb16
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: BB1_10: ; %bb17
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0
; GCN-NEXT: s_branch BB1_2
; GCN-NEXT: ; %bb.7: ; %bb11
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_swappc_b64 s[30:31], s[36:37]
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: s_mov_b64 s[6:7], 0
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_cbranch_execnz BB1_4
; GCN-NEXT: ; %bb.8: ; %bb14
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[34:35]
; GCN-NEXT: s_cbranch_execnz BB1_10
; GCN-NEXT: ; %bb.9: ; %bb16
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: BB1_10: ; %bb17
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 0
; GCN-NEXT: s_branch BB1_2
bb:
%tmp = load float, float* null, align 16
br label %bb2

View File

@ -21,14 +21,14 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: v_mov_b32_e32 v32, v12
; GFX9: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: v_writelane_b32 v44, s30, 0
; GFX9: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@ -53,14 +53,14 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
; GFX10: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10: buffer_load_dword v43, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
@ -100,14 +100,14 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: v_mov_b32_e32 v40, v12
; GFX9: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
@ -127,29 +127,22 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: s_getpc_b64 s[16:17]
; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: s_mov_b32 s37, s36
; GFX10-NEXT: s_mov_b32 s38, s36
; GFX10-NEXT: s_mov_b32 s39, s36
; GFX10-NEXT: s_mov_b32 s40, s36
; GFX10-NEXT: s_mov_b32 s41, s36
; GFX10-NEXT: s_mov_b32 s42, s36
; GFX10-NEXT: s_mov_b32 s43, s36
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_writelane_b32 v45, s30, 8
; GFX10: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v40, v16
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v41, v15
; GFX10-NEXT: v_mov_b32_e32 v42, v14
; GFX10-NEXT: v_mov_b32_e32 v43, v13
; GFX10-NEXT: v_writelane_b32 v45, s31, 9
; GFX10-NEXT: v_mov_b32_e32 v44, v12
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10: buffer_load_dword v44, off, s[0:3], s33

View File

@ -20,18 +20,10 @@
# FULL-NEXT: stackPtrOffsetReg: '$sgpr13'
# FULL-NEXT: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# FULL-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' }
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' }
# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# FULL-NEXT: workItemIDX: { reg: '$vgpr0' }
# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: workItemIDX: { reg: '$vgpr0' }
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@ -55,18 +47,10 @@
# SIMPLE-NEXT: stackPtrOffsetReg: '$sgpr13'
# SIMPLE-NEXT: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# SIMPLE-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' }
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' }
# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
name: kernel0
@ -112,16 +96,6 @@ body: |
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
# FULL-NEXT: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@ -137,16 +111,6 @@ body: |
# SIMPLE-NEXT: maxKernArgAlign: 1
# SIMPLE-NEXT: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
@ -175,16 +139,6 @@ body: |
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
# FULL-NEXT: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@ -200,16 +154,6 @@ body: |
# SIMPLE-NEXT: maxKernArgAlign: 1
# SIMPLE-NEXT: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
@ -239,16 +183,6 @@ body: |
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
# FULL-NEXT: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@ -265,16 +199,6 @@ body: |
# SIMPLE-NEXT: isEntryFunction: true
# SIMPLE-NEXT: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: occupancy: 10
# SIMPLE-NEXT: body:
@ -311,31 +235,13 @@ body: |
# FULL: argumentInfo:
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# FULL-NEXT: flatScratchInit: { offset: 4 }
# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
# SIMPLE: argumentInfo:
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
# SIMPLE-NEXT: flatScratchInit: { offset: 4 }
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
name: fake_stack_arginfo
machineFunctionInfo:
argumentInfo: