forked from OSchip/llvm-project
[AMDGPU] Set implicit arg attributes for indirect calls
This patch adds attributes corresponding to the implicit arguments to functions/kernels if 1. it has an indirect call OR 2. its address is taken. Once such attributes are set, the rest of the codegen works out of the box for indirect calls. This patch eliminates the potential overhead that -fixed-abi imposes even when indirect function calls are not used. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D99347
This commit is contained in:
parent
0f42675c86
commit
5682ae2fc6
|
@ -25,6 +25,13 @@
|
|||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
static constexpr StringLiteral ImplicitAttrNames[] = {
|
||||
// X ids unnecessarily propagated to kernels.
|
||||
"amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
|
||||
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
|
||||
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
|
||||
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
|
||||
"amdgpu-implicitarg-ptr"};
|
||||
|
||||
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
|
||||
private:
|
||||
|
@ -194,18 +201,10 @@ static bool handleAttr(Function &Parent, const Function &Callee,
|
|||
|
||||
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
|
||||
bool &NeedQueuePtr) {
|
||||
// X ids unnecessarily propagated to kernels.
|
||||
static constexpr StringLiteral AttrNames[] = {
|
||||
"amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
|
||||
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
|
||||
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
|
||||
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
|
||||
"amdgpu-implicitarg-ptr"};
|
||||
|
||||
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
|
||||
NeedQueuePtr = true;
|
||||
|
||||
for (StringRef AttrName : AttrNames)
|
||||
for (StringRef AttrName : ImplicitAttrNames)
|
||||
handleAttr(Parent, Callee, AttrName);
|
||||
}
|
||||
|
||||
|
@ -268,7 +267,20 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
|
|||
bool Changed = false;
|
||||
bool NeedQueuePtr = false;
|
||||
bool HaveCall = false;
|
||||
bool HasIndirectCall = false;
|
||||
bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
|
||||
CallingConv::ID CC = F.getCallingConv();
|
||||
bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
|
||||
|
||||
// If this function's address is taken (hasAddressTaken() returns true),
|
||||
// then add all attributes corresponding to the implicit args.
|
||||
if (CallingConvSupportsAllImplicits &&
|
||||
F.hasAddressTaken(nullptr, true, true, true)) {
|
||||
for (StringRef AttrName : ImplicitAttrNames) {
|
||||
F.addFnAttr(AttrName);
|
||||
}
|
||||
Changed = true;
|
||||
}
|
||||
|
||||
for (BasicBlock &BB : F) {
|
||||
for (Instruction &I : BB) {
|
||||
|
@ -281,10 +293,12 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
|
|||
const Function *Callee =
|
||||
dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
|
||||
|
||||
// TODO: Do something with indirect calls.
|
||||
// Note the occurrence of an indirect call.
|
||||
if (!Callee) {
|
||||
if (!CB->isInlineAsm())
|
||||
if (!CB->isInlineAsm()) {
|
||||
HasIndirectCall = true;
|
||||
HaveCall = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -351,6 +365,28 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
|
|||
Changed = true;
|
||||
}
|
||||
|
||||
// This pass cannot copy attributes from callees to callers
|
||||
// if there is an indirect call, and thus, in such cases,
|
||||
// hasAddressTaken() would be false for kernels and functions
|
||||
// making an indirect call (if they are themselves not indirectly called).
|
||||
// We must tag all such kernels/functions with all implicits attributes
|
||||
// for correctness.
|
||||
// e.g.
|
||||
// 1. Kernel K1 makes an indirect call to function F1.
|
||||
// Without detecting an indirect call in K1, this pass will not
|
||||
// add all implicit args to K1 (which is incorrect).
|
||||
// 2. Kernel K1 makes direct call to F1 which makes indirect call to function
|
||||
// F2.
|
||||
// Without detecting an indirect call in F1 (whose hasAddressTaken() is
|
||||
// false), the pass will not add all implicit args to F1 (which is
|
||||
// essential for correctness).
|
||||
if (CallingConvSupportsAllImplicits && HasIndirectCall) {
|
||||
for (StringRef AttrName : ImplicitAttrNames) {
|
||||
F.addFnAttr(AttrName);
|
||||
}
|
||||
Changed = true;
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
|
|
@ -388,10 +388,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
|
|||
else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
|
||||
MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
|
||||
}
|
||||
// Set -fixed-function-abi to true if not provided..
|
||||
if (TT.getOS() == Triple::AMDHSA &&
|
||||
EnableAMDGPUFixedFunctionABIOpt.getNumOccurrences() == 0)
|
||||
EnableFixedFunctionABI = true;
|
||||
}
|
||||
|
||||
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
|
||||
|
|
|
@ -53,9 +53,9 @@ define i32 @asm_vgpr_early_clobber() {
|
|||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %2, !0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
|
||||
; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY2]]
|
||||
; CHECK: $vgpr0 = COPY [[ADD]](s32)
|
||||
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
|
@ -87,8 +87,8 @@ define i32 @test_single_vgpr_output() nounwind {
|
|||
; CHECK: bb.1.entry:
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
|
||||
; CHECK: $vgpr0 = COPY [[COPY1]](s32)
|
||||
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
|
||||
|
@ -102,8 +102,8 @@ define i32 @test_single_sgpr_output_s32() nounwind {
|
|||
; CHECK: bb.1.entry:
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
|
||||
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
|
||||
; CHECK: $vgpr0 = COPY [[COPY1]](s32)
|
||||
; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
|
||||
|
@ -118,9 +118,9 @@ define float @test_multiple_register_outputs_same() #0 {
|
|||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
|
||||
; CHECK: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY1]], [[COPY2]]
|
||||
; CHECK: $vgpr0 = COPY [[FADD]](s32)
|
||||
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
|
@ -138,9 +138,9 @@ define double @test_multiple_register_outputs_mixed() #0 {
|
|||
; CHECK: bb.1 (%ir-block.0):
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 2883594 /* regdef:VReg_64 */, def %9
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %9
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2883594 /* regdef:VReg_64 */, def %2
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY %2
|
||||
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64)
|
||||
; CHECK: $vgpr0 = COPY [[UV]](s32)
|
||||
; CHECK: $vgpr1 = COPY [[UV1]](s32)
|
||||
|
@ -209,8 +209,8 @@ define float @test_input_vgpr(i32 %src) nounwind {
|
|||
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
|
||||
; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY2]]
|
||||
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %9
|
||||
; CHECK: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 1835017 /* reguse:VGPR_32 */, [[COPY2]]
|
||||
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %2
|
||||
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
|
||||
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
|
||||
|
@ -225,8 +225,8 @@ define i32 @test_memory_constraint(i32 addrspace(3)* %a) nounwind {
|
|||
; CHECK: liveins: $vgpr0, $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 196622 /* mem:m */, [[COPY]](p3)
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
|
||||
; CHECK: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 196622 /* mem:m */, [[COPY]](p3)
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %2
|
||||
; CHECK: $vgpr0 = COPY [[COPY2]](s32)
|
||||
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
|
||||
|
@ -243,8 +243,8 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
|
|||
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
|
||||
; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
|
||||
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32)
|
||||
; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
|
||||
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %11
|
||||
; CHECK: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
|
||||
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %4
|
||||
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
|
||||
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
|
||||
|
@ -258,14 +258,14 @@ define i32 @test_sgpr_matching_constraint() nounwind {
|
|||
; CHECK: bb.1.entry:
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
|
||||
; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %10
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %10
|
||||
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
|
||||
; CHECK: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %3
|
||||
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY %3
|
||||
; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32)
|
||||
; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]](s32)
|
||||
; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %12, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3)
|
||||
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %12
|
||||
; CHECK: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %5, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3)
|
||||
; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY %5
|
||||
; CHECK: $vgpr0 = COPY [[COPY5]](s32)
|
||||
; CHECK: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY6]], implicit $vgpr0
|
||||
|
@ -288,10 +288,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
|
|||
; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32)
|
||||
; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
|
||||
; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
|
||||
; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5)
|
||||
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %11
|
||||
; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %12
|
||||
; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %13
|
||||
; CHECK: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 1835018 /* regdef:VGPR_32 */, def %5, 1835018 /* regdef:VGPR_32 */, def %6, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5)
|
||||
; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY %4
|
||||
; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY %5
|
||||
; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY %6
|
||||
; CHECK: G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
|
||||
; CHECK: G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
|
||||
; CHECK: G_STORE [[COPY9]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1)
|
||||
|
@ -312,11 +312,11 @@ define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind {
|
|||
; CHECK: bb.1.entry:
|
||||
; CHECK: liveins: $sgpr30_sgpr31
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
|
||||
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %8
|
||||
; CHECK: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1
|
||||
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %1
|
||||
; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
|
||||
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %10
|
||||
; CHECK: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3)
|
||||
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY %3
|
||||
; CHECK: $vgpr0 = COPY [[COPY3]](s32)
|
||||
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
|
||||
|
|
|
@ -14,7 +14,7 @@ define void @func_use_lds_global() {
|
|||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NEXT: s_trap 2
|
||||
; GFX8-NEXT: ds_write_b32 v0, v0
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -37,10 +37,10 @@ define void @func_use_lds_global_constexpr_cast() {
|
|||
; GFX8-LABEL: func_use_lds_global_constexpr_cast:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NEXT: s_trap 2
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: func_use_lds_global_constexpr_cast:
|
||||
|
|
|
@ -101,7 +101,7 @@ bb2:
|
|||
|
||||
; ALL-LABEL: {{^}}test_workitem_id_x_func:
|
||||
; ALL: s_waitcnt
|
||||
; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v31
|
||||
; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v2
|
||||
; MESA-NEXT: v_and_b32_e32 v2, 0x3ff, v2
|
||||
define void @test_workitem_id_x_func(i32 addrspace(1)* %out) #1 {
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -110,7 +110,7 @@ define void @test_workitem_id_x_func(i32 addrspace(1)* %out) #1 {
|
|||
}
|
||||
|
||||
; ALL-LABEL: {{^}}test_workitem_id_y_func:
|
||||
; HSA: v_lshrrev_b32_e32 v2, 10, v31
|
||||
; HSA: v_lshrrev_b32_e32 v2, 10, v2
|
||||
; MESA: v_lshrrev_b32_e32 v2, 10, v2
|
||||
define void @test_workitem_id_y_func(i32 addrspace(1)* %out) #1 {
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
|
@ -119,7 +119,7 @@ define void @test_workitem_id_y_func(i32 addrspace(1)* %out) #1 {
|
|||
}
|
||||
|
||||
; ALL-LABEL: {{^}}test_workitem_id_z_func:
|
||||
; HSA: v_lshrrev_b32_e32 v2, 20, v31
|
||||
; HSA: v_lshrrev_b32_e32 v2, 20, v2
|
||||
; MESA: v_lshrrev_b32_e32 v2, 20, v2
|
||||
define void @test_workitem_id_z_func(i32 addrspace(1)* %out) #1 {
|
||||
%id = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
|
|
|
@ -172,7 +172,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
|
|||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4
|
||||
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
|
||||
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31
|
||||
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
|
||||
; GCN-NEXT: global_store_dword v[0:1], v2, off
|
||||
|
@ -227,14 +227,14 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
|
|||
; GCN-NEXT: s_add_u32 s6, s32, 0x1000
|
||||
; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v5, s6
|
||||
; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 1
|
||||
; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v3
|
||||
; GCN-NEXT: v_add_u32_e32 v2, s6, v2
|
||||
; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
|
||||
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31
|
||||
; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
|
||||
; GCN-NEXT: global_store_dword v[0:1], v2, off
|
||||
|
|
|
@ -42,7 +42,7 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt
|
|||
|
||||
; Test handling inside a non-kernel
|
||||
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
|
||||
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
|
||||
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
|
||||
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
|
||||
; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
|
||||
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
|
||||
|
|
|
@ -122,18 +122,18 @@ bb:
|
|||
|
||||
; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
|
||||
; GFX908: .amdhsa_next_free_vgpr 32
|
||||
; GFX90A: .amdhsa_accum_offset 32
|
||||
; GCN: NumVgprs: 32
|
||||
; GFX90A: .amdhsa_accum_offset 12
|
||||
; GCN: NumVgprs: 9
|
||||
; GCN: NumAgprs: 32
|
||||
; GFX908: TotalNumVgprs: 32
|
||||
; GFX90A: TotalNumVgprs: 64
|
||||
; GFX90A: TotalNumVgprs: 44
|
||||
; GFX908: VGPRBlocks: 7
|
||||
; GFX90A: VGPRBlocks: 7
|
||||
; GFX90A: VGPRBlocks: 5
|
||||
; GFX908: NumVGPRsForWavesPerEU: 32
|
||||
; GFX90A: NumVGPRsForWavesPerEU: 64
|
||||
; GFX90A: AccumOffset: 32
|
||||
; GFX90A: NumVGPRsForWavesPerEU: 44
|
||||
; GFX90A: AccumOffset: 12
|
||||
; GCN: Occupancy: 8
|
||||
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
|
||||
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2
|
||||
define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
|
||||
bb:
|
||||
call void @func_32_agprs() #0
|
||||
|
@ -141,10 +141,10 @@ bb:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}func_call_func_32_agprs:
|
||||
; GCN: NumVgprs: 32
|
||||
; GCN: NumVgprs: 9
|
||||
; GCN: NumAgprs: 32
|
||||
; GFX908: TotalNumVgprs: 32
|
||||
; GFX90A: TotalNumVgprs: 64
|
||||
; GFX90A: TotalNumVgprs: 44
|
||||
define void @func_call_func_32_agprs() #0 {
|
||||
bb:
|
||||
call void @func_32_agprs() #0
|
||||
|
@ -154,21 +154,21 @@ bb:
|
|||
declare void @undef_func()
|
||||
|
||||
; GCN-LABEL: {{^}}kernel_call_undef_func:
|
||||
; GFX908: .amdhsa_next_free_vgpr 32
|
||||
; GFX90A: .amdhsa_next_free_vgpr 56
|
||||
; GFX90A: .amdhsa_accum_offset 32
|
||||
; GCN: NumVgprs: 32
|
||||
; GFX908: .amdhsa_next_free_vgpr 24
|
||||
; GFX90A: .amdhsa_next_free_vgpr 48
|
||||
; GFX90A: .amdhsa_accum_offset 24
|
||||
; GCN: NumVgprs: 24
|
||||
; GCN: NumAgprs: 24
|
||||
; GFX908: TotalNumVgprs: 32
|
||||
; GFX90A: TotalNumVgprs: 56
|
||||
; GFX908: VGPRBlocks: 7
|
||||
; GFX90A: VGPRBlocks: 6
|
||||
; GFX908: NumVGPRsForWavesPerEU: 32
|
||||
; GFX90A: NumVGPRsForWavesPerEU: 56
|
||||
; GFX90A: AccumOffset: 32
|
||||
; GFX908: Occupancy: 8
|
||||
; GFX908: TotalNumVgprs: 24
|
||||
; GFX90A: TotalNumVgprs: 48
|
||||
; GFX908: VGPRBlocks: 5
|
||||
; GFX90A: VGPRBlocks: 5
|
||||
; GFX908: NumVGPRsForWavesPerEU: 24
|
||||
; GFX90A: NumVGPRsForWavesPerEU: 48
|
||||
; GFX90A: AccumOffset: 24
|
||||
; GFX908: Occupancy: 10
|
||||
; GFX90A: Occupancy: 8
|
||||
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
|
||||
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5
|
||||
define amdgpu_kernel void @kernel_call_undef_func() #0 {
|
||||
bb:
|
||||
call void @undef_func()
|
||||
|
|
|
@ -334,4 +334,4 @@ attributes #3 = { nounwind }
|
|||
; HSA: attributes #17 = { nounwind "uniform-work-group-size"="false" }
|
||||
; HSA: attributes #18 = { nounwind }
|
||||
; HSA: attributes #19 = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" }
|
||||
; HSA: attributes #20 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" }
|
||||
; HSA: attributes #20 = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" }
|
||||
|
|
|
@ -80,15 +80,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
|
|||
; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
|
||||
|
||||
; HSA: buffer_load_ubyte [[VAR:v[0-9]+]]
|
||||
; HSA-DAG: s_mov_b32 s32, 0
|
||||
; HSA: s_mov_b32 s32, 0
|
||||
; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]]
|
||||
; MESA-DAG: s_mov_b32 s32, 0{{$}}
|
||||
|
||||
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12
|
||||
; MESA-DAG: v_bfe_i32 v0, v0, 0, 1
|
||||
; HSA: v_bfe_i32 v0, v3, 0, 1
|
||||
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
|
@ -100,24 +99,18 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
|
|||
; FIXME: load should be scheduled before getpc
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
|
||||
|
||||
; HSA: buffer_load_ubyte v3
|
||||
; HSA: buffer_load_ubyte v0
|
||||
; HSA-DAG: s_mov_b32 s32, 0{{$}}
|
||||
|
||||
; MESA: buffer_load_ubyte v0
|
||||
; MESA-DAG: s_mov_b32 s32, 0{{$}}
|
||||
|
||||
; MESA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; MESA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
|
||||
; MESA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
|
||||
; MESA-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; MESA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; MESA-NEXT: s_endpgm
|
||||
; HSA: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; HSA-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
|
||||
; HSA-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
|
||||
; HSA-NEXT: v_and_b32_e32 v0, 1, v3
|
||||
; HSA-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; HSA-NEXT: s_endpgm
|
||||
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
%var = load volatile i1, i1 addrspace(1)* undef
|
||||
call void @external_void_func_i1_zeroext(i1 %var)
|
||||
|
@ -143,8 +136,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
|
|||
; FIXME: don't wait before call
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
|
||||
|
||||
; MESA-DAG: buffer_load_sbyte v0
|
||||
; HSA-DAG: buffer_load_sbyte v3
|
||||
; GCN-DAG: buffer_load_sbyte v0
|
||||
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
|
||||
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12
|
||||
|
@ -152,7 +144,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
|
|||
; GCN-DAG: s_mov_b32 s32, 0
|
||||
|
||||
; GCN-NOT: s_waitcnt
|
||||
; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
|
||||
%var = load volatile i8, i8 addrspace(1)* undef
|
||||
|
@ -162,8 +154,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
|
||||
|
||||
; MESA-DAG: buffer_load_ubyte v0
|
||||
; HSA-DAG: buffer_load_ubyte v3
|
||||
; GCN-DAG: buffer_load_ubyte v0
|
||||
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
|
||||
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12
|
||||
|
@ -171,7 +162,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
|
|||
; GCN-DAG: s_mov_b32 s32, 0
|
||||
|
||||
; GCN-NOT: s_waitcnt
|
||||
; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
|
||||
%var = load volatile i8, i8 addrspace(1)* undef
|
||||
|
@ -192,8 +183,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
|
||||
|
||||
; MESA-DAG: buffer_load_sshort v0
|
||||
; HSA-DAG: buffer_load_sshort v3
|
||||
; GCN-DAG: buffer_load_sshort v0
|
||||
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
|
||||
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12
|
||||
|
@ -201,7 +191,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
|
|||
; GCN-DAG: s_mov_b32 s32, 0
|
||||
|
||||
; GCN-NOT: s_waitcnt
|
||||
; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
|
||||
%var = load volatile i16, i16 addrspace(1)* undef
|
||||
|
@ -218,7 +208,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
|
|||
; GCN-DAG: s_mov_b32 s32, 0
|
||||
|
||||
; GCN-NOT: s_waitcnt
|
||||
; GCN-DAG: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
|
||||
%var = load volatile i16, i16 addrspace(1)* undef
|
||||
|
@ -491,7 +481,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}}
|
||||
|
||||
; GCN-NOT: v3,
|
||||
; GCN-NOT: v3
|
||||
; GCN-DAG: v_mov_b32_e32 v0, 3
|
||||
; GCN-DAG: v_mov_b32_e32 v1, 4
|
||||
; GCN-DAG: v_mov_b32_e32 v2, 5
|
||||
|
@ -596,7 +586,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
|
|||
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
|
||||
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
|
||||
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
|
||||
; MESA-NOT: s_waitcnt
|
||||
; GCN-NOT: s_waitcnt
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
||||
%ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
|
||||
|
@ -621,8 +611,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
|
|||
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
|
||||
|
||||
; GCN: s_waitcnt
|
||||
; MESA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
|
||||
; HSA: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4
|
||||
; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
|
||||
|
@ -645,11 +634,9 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)*
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
|
||||
; MESA: buffer_load_ubyte v0, off
|
||||
; MESA-DAG: buffer_load_dword v1, off
|
||||
; HSA: buffer_load_ubyte v3, off
|
||||
; HSA-DAG: buffer_load_dword v4, off
|
||||
; MESA-NOT: s_waitcnt
|
||||
; GCN: buffer_load_ubyte v0, off
|
||||
; GCN: buffer_load_dword v1, off
|
||||
; GCN-NOT: s_waitcnt
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
|
||||
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
|
||||
|
@ -751,19 +738,15 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}tail_call_byval_align16:
|
||||
; GCN-NOT: s32,
|
||||
; MESA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8
|
||||
; MESA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12
|
||||
; HSA: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32
|
||||
; HSA: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:24
|
||||
; GCN-NOT: s32
|
||||
; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8
|
||||
; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12
|
||||
|
||||
; GCN: s_getpc_b64
|
||||
|
||||
; MESA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4
|
||||
; MESA: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}}
|
||||
; HSA: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:16
|
||||
; HSA: buffer_store_dword [[VREG1]], off, s[0:3], s32
|
||||
; GCN-NOT: s32,
|
||||
; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4
|
||||
; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}}
|
||||
; GCN-NOT: s32
|
||||
; GCN: s_setpc_b64
|
||||
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
|
||||
entry:
|
||||
|
@ -774,16 +757,11 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
|
||||
; GCN-NOT: s32
|
||||
; MESA: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
||||
; MESA: buffer_load_dword v33, off, s[0:3], s32{{$}}
|
||||
; MESA: s_getpc_b64
|
||||
; MESA: buffer_store_dword v33, off, s[0:3], s32{{$}}
|
||||
; MESA: buffer_store_dword v32, off, s[0:3], s32 offset:4
|
||||
; HSA: buffer_load_dword v32, off, s[0:3], s32 offset:8
|
||||
; HSA: buffer_load_dword v33, off, s[0:3], s32 offset:4
|
||||
; HSA: s_getpc_b64
|
||||
; HSA: buffer_store_dword v33, off, s[0:3], s32 offset:4
|
||||
; HSA: buffer_store_dword v32, off, s[0:3], s32 offset:8
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
||||
; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}}
|
||||
; GCN: s_getpc_b64
|
||||
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
|
||||
; GCN-NOT: s32
|
||||
; GCN: s_setpc_b64
|
||||
define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
|
||||
|
@ -793,27 +771,16 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stack_12xv3i32:
|
||||
; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; MESA: buffer_store_dword [[REG12]], {{.*$}}
|
||||
; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4
|
||||
; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8
|
||||
; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12
|
||||
; MESA: v_mov_b32_e32 v31, 11
|
||||
; MESA: s_getpc
|
||||
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 11
|
||||
; HSA: buffer_store_dword [[REG12]], {{.*$}}
|
||||
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4
|
||||
; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8
|
||||
; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12
|
||||
; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16
|
||||
; HSA: s_getpc
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*$}}
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 v31, 11
|
||||
; GCN: s_getpc
|
||||
define void @stack_12xv3i32() #0 {
|
||||
entry:
|
||||
call void @external_void_func_12xv3i32(
|
||||
|
@ -833,25 +800,16 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stack_12xv3f32:
|
||||
; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; MESA: buffer_store_dword [[REG12]], {{.*$}}
|
||||
; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; MESA: buffer_store_dword [[REG13]], {{.*}} offset:4
|
||||
; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; MESA: buffer_store_dword [[REG14]], {{.*}} offset:8
|
||||
; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; MESA: buffer_store_dword [[REG15]], {{.*}} offset:12
|
||||
; MESA: v_mov_b32_e32 v31, 0x41300000
|
||||
; MESA: s_getpc
|
||||
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; HSA: buffer_store_dword [[REG12]], {{.*}} offset:4
|
||||
; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; HSA: buffer_store_dword [[REG13]], {{.*}} offset:8
|
||||
; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; HSA: buffer_store_dword [[REG14]], {{.*}} offset:12
|
||||
; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; HSA: buffer_store_dword [[REG15]], {{.*}} offset:16
|
||||
; HSA: s_getpc
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*$}}
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 v31, 0x41300000
|
||||
; GCN: s_getpc
|
||||
define void @stack_12xv3f32() #0 {
|
||||
entry:
|
||||
call void @external_void_func_12xv3f32(
|
||||
|
@ -872,41 +830,24 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}stack_8xv5i32:
|
||||
|
||||
; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
|
||||
; MESA: buffer_store_dword [[REG8]], {{.*$}}
|
||||
; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4
|
||||
; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8
|
||||
; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12
|
||||
; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16
|
||||
; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20
|
||||
; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24
|
||||
; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28
|
||||
; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
|
||||
; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4
|
||||
; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8
|
||||
; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12
|
||||
; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16
|
||||
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20
|
||||
; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24
|
||||
; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28
|
||||
; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32
|
||||
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
|
||||
; GCN: buffer_store_dword [[REG8]], {{.*$}}
|
||||
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
|
||||
|
||||
|
||||
; MESA: v_mov_b32_e32 v31, 7
|
||||
; GCN: v_mov_b32_e32 v31, 7
|
||||
; GCN: s_getpc
|
||||
define void @stack_8xv5i32() #0 {
|
||||
entry:
|
||||
|
@ -923,42 +864,24 @@ entry:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stack_8xv5f32:
|
||||
; MESA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
|
||||
; MESA: buffer_store_dword [[REG8]], {{.*$}}
|
||||
; MESA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
|
||||
; MESA: buffer_store_dword [[REG9]], {{.*}} offset:4
|
||||
; MESA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
|
||||
; MESA: buffer_store_dword [[REG10]], {{.*}} offset:8
|
||||
; MESA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
|
||||
; MESA: buffer_store_dword [[REG11]], {{.*}} offset:12
|
||||
; MESA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; MESA: buffer_store_dword [[REG12]], {{.*}} offset:16
|
||||
; MESA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; MESA: buffer_store_dword [[REG13]], {{.*}} offset:20
|
||||
; MESA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; MESA: buffer_store_dword [[REG14]], {{.*}} offset:24
|
||||
; MESA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; MESA: buffer_store_dword [[REG15]], {{.*}} offset:28
|
||||
; MESA: v_mov_b32_e32 v31, 0x40e00000
|
||||
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
|
||||
; GCN: buffer_store_dword [[REG8]], {{.*$}}
|
||||
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
|
||||
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
|
||||
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
|
||||
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
|
||||
|
||||
; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x40e00000
|
||||
; HSA: buffer_store_dword [[REG8]], {{.*$}}
|
||||
; HSA: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
|
||||
; HSA: buffer_store_dword [[REG8]], {{.*}} offset:4
|
||||
; HSA: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
|
||||
; HSA: buffer_store_dword [[REG9]], {{.*}} offset:8
|
||||
; HSA: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
|
||||
; HSA: buffer_store_dword [[REG10]], {{.*}} offset:12
|
||||
; HSA: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
|
||||
; HSA: buffer_store_dword [[REG11]], {{.*}} offset:16
|
||||
; HSA: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; HSA: buffer_store_dword [[REG12]], {{.*}} offset:20
|
||||
; HSA: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; HSA: buffer_store_dword [[REG13]], {{.*}} offset:24
|
||||
; HSA: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; HSA: buffer_store_dword [[REG14]], {{.*}} offset:28
|
||||
; HSA: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; HSA: buffer_store_dword [[REG15]], {{.*}} offset:32
|
||||
; GCN: v_mov_b32_e32 v31, 0x40e00000
|
||||
; GCN: s_getpc
|
||||
define void @stack_8xv5f32() #0 {
|
||||
entry:
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
; FIXME: Emitting unnecessary flat_scratch setup
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_undef:
|
||||
; SDAG: s_mov_b32 flat_scratch_lo, s5
|
||||
; SDAG: s_add_u32 s4, s4, s7
|
||||
; SDAG: s_mov_b32 flat_scratch_lo, s11
|
||||
; SDAG: s_add_u32 s10, s10, s15
|
||||
; SDAG: s_lshr_b32
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @test_call_undef() #0 {
|
||||
|
@ -26,8 +26,8 @@ define i32 @test_tail_call_undef() #0 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_null:
|
||||
; SDAG: s_mov_b32 flat_scratch_lo, s5
|
||||
; SDAG: s_add_u32 s4, s4, s7
|
||||
; SDAG: s_mov_b32 flat_scratch_lo, s11
|
||||
; SDAG: s_add_u32 s10, s10, s15
|
||||
; SDAG: s_lshr_b32
|
||||
|
||||
; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
|
||||
|
|
|
@ -65,7 +65,7 @@ define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v31
|
||||
; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
|
||||
|
@ -78,6 +78,7 @@ define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
|
|||
; GCN: s_getpc_b64
|
||||
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
|
||||
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12
|
||||
; GCN: v_or_b32_e32 v1, v0
|
||||
; GCN: v_mov_b32_e32 v0, 9
|
||||
; GCN: s_swappc_b64
|
||||
; GCN: v_add_f32_e32
|
||||
|
|
|
@ -230,7 +230,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
|
|||
; CI: NumSgprs: 48
|
||||
; VI-NOBUG: NumSgprs: 48
|
||||
; VI-BUG: NumSgprs: 96
|
||||
; GCN: NumVgprs: 32
|
||||
; GCN: NumVgprs: 24
|
||||
define amdgpu_kernel void @count_use_sgpr96_external_call() {
|
||||
entry:
|
||||
tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
|
||||
|
@ -244,7 +244,7 @@ entry:
|
|||
; CI: NumSgprs: 48
|
||||
; VI-NOBUG: NumSgprs: 48
|
||||
; VI-BUG: NumSgprs: 96
|
||||
; GCN: NumVgprs: 32
|
||||
; GCN: NumVgprs: 24
|
||||
define amdgpu_kernel void @count_use_sgpr160_external_call() {
|
||||
entry:
|
||||
tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
|
||||
|
@ -258,7 +258,7 @@ entry:
|
|||
; CI: NumSgprs: 48
|
||||
; VI-NOBUG: NumSgprs: 48
|
||||
; VI-BUG: NumSgprs: 96
|
||||
; GCN: NumVgprs: 32
|
||||
; GCN: NumVgprs: 24
|
||||
define amdgpu_kernel void @count_use_vgpr160_external_call() {
|
||||
entry:
|
||||
tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
|
||||
|
|
|
@ -6,15 +6,15 @@
|
|||
declare hidden void @external_void_func_void() #0
|
||||
|
||||
; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
|
||||
; GCN: s_getpc_b64 s[44:45]
|
||||
; GCN-NEXT: s_add_u32 s44, s44,
|
||||
; GCN-NEXT: s_addc_u32 s45, s45,
|
||||
; GCN: s_getpc_b64 s[34:35]
|
||||
; GCN-NEXT: s_add_u32 s34, s34,
|
||||
; GCN-NEXT: s_addc_u32 s35, s35,
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN: s_swappc_b64 s[30:31], s[44:45]
|
||||
; GCN: s_swappc_b64 s[30:31], s[34:35]
|
||||
|
||||
; GCN-DAG: #ASMSTART
|
||||
; GCN-DAG: #ASMEND
|
||||
; GCN-DAG: s_swappc_b64 s[30:31], s[44:45]
|
||||
; GCN-NEXT: #ASMSTART
|
||||
; GCN-NEXT: #ASMEND
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
|
||||
call void @external_void_func_void()
|
||||
call void asm sideeffect "", ""() #0
|
||||
|
@ -25,60 +25,24 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
|
|||
; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
|
||||
; MUBUF: buffer_store_dword
|
||||
; FLATSCR: scratch_store_dword
|
||||
; GCN: v_writelane_b32 v41, s33, 15
|
||||
; GCN-NEXT: v_writelane_b32 v41, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v41, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v41, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v41, s37, 3
|
||||
; GCN-NEXT: v_writelane_b32 v41, s38, 4
|
||||
; GCN-NEXT: v_writelane_b32 v41, s39, 5
|
||||
; GCN-NEXT: v_writelane_b32 v41, s40, 6
|
||||
; GCN-NEXT: v_writelane_b32 v41, s41, 7
|
||||
; GCN-NEXT: v_writelane_b32 v41, s42, 8
|
||||
; GCN-NEXT: v_writelane_b32 v41, s43, 9
|
||||
; GCN-NEXT: v_writelane_b32 v41, s44, 10
|
||||
; GCN-NEXT: v_writelane_b32 v41, s46, 11
|
||||
; GCN-NEXT: v_writelane_b32 v41, s47, 12
|
||||
; GCN-NEXT: v_writelane_b32 v41, s30, 13
|
||||
; GCN: v_writelane_b32 v40, s33, 4
|
||||
; GCN: v_writelane_b32 v40, s34, 0
|
||||
; GCN: v_writelane_b32 v40, s35, 1
|
||||
; GCN: v_writelane_b32 v40, s30, 2
|
||||
; GCN: v_writelane_b32 v40, s31, 3
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-DAG: ;;#ASMSTART
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
; GCN-NEXT: s_swappc_b64
|
||||
; MUBUF-DAG: v_readlane_b32 s4, v40, 2
|
||||
; MUBUF-DAG: v_readlane_b32 s5, v40, 3
|
||||
; FLATSCR-DAG: v_readlane_b32 s0, v40, 2
|
||||
; FLATSCR-DAG: v_readlane_b32 s1, v40, 3
|
||||
; GCN: v_readlane_b32 s35, v40, 1
|
||||
; GCN: v_readlane_b32 s34, v40, 0
|
||||
|
||||
; MUBUF-DAG: v_readlane_b32 s4, v41, 13
|
||||
; MUBUF-DAG: v_readlane_b32 s5, v41, 14
|
||||
; MUBUF-DAG: v_readlane_b32 s47, v41, 12
|
||||
; MUBUF-DAG: v_readlane_b32 s46, v41, 11
|
||||
; MUBUF-DAG: v_readlane_b32 s44, v41, 10
|
||||
; MUBUF-DAG: v_readlane_b32 s43, v41, 9
|
||||
; MUBUF-DAG: v_readlane_b32 s42, v41, 8
|
||||
; MUBUF-DAG: v_readlane_b32 s41, v41, 7
|
||||
; MUBUF-DAG: v_readlane_b32 s40, v41, 6
|
||||
; MUBUF-DAG: v_readlane_b32 s39, v41, 5
|
||||
; MUBUF-DAG: v_readlane_b32 s38, v41, 4
|
||||
; MUBUF-DAG: v_readlane_b32 s37, v41, 3
|
||||
; MUBUF-DAG: v_readlane_b32 s36, v41, 2
|
||||
; MUBUF-DAG: v_readlane_b32 s35, v41, 1
|
||||
; MUBUF-DAG: v_readlane_b32 s34, v41, 0
|
||||
|
||||
; FLATSCR: v_readlane_b32 s0, v41, 13
|
||||
; FLATSCR-DAG: v_readlane_b32 s1, v41, 14
|
||||
; FLATSCR-DAG: v_readlane_b32 s47, v41, 12
|
||||
; FLATSCR-DAG: v_readlane_b32 s46, v41, 11
|
||||
; FLATSCR-DAG: v_readlane_b32 s44, v41, 10
|
||||
; FLATSCR-DAG: v_readlane_b32 s43, v41, 9
|
||||
; FLATSCR-DAG: v_readlane_b32 s42, v41, 8
|
||||
; FLATSCR-DAG: v_readlane_b32 s41, v41, 7
|
||||
; FLATSCR-DAG: v_readlane_b32 s40, v41, 6
|
||||
; FLATSCR-DAG: v_readlane_b32 s39, v41, 5
|
||||
; FLATSCR-DAG: v_readlane_b32 s38, v41, 4
|
||||
; FLATSCR-DAG: v_readlane_b32 s37, v41, 3
|
||||
; FLATSCR-DAG: v_readlane_b32 s36, v41, 2
|
||||
; FLATSCR-DAG: v_readlane_b32 s35, v41, 1
|
||||
; FLATSCR-DAG: v_readlane_b32 s34, v41, 0
|
||||
; FLATSCR-DAG: v_readlane_b32 s33, v41, 15
|
||||
|
||||
; GCN: v_readlane_b32 s33, v40, 4
|
||||
; MUBUF: buffer_load_dword
|
||||
; FLATSCR: scratch_load_dword
|
||||
; GCN: s_setpc_b64
|
||||
|
@ -90,19 +54,19 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
|
||||
; MUBUF: buffer_store_dword v41
|
||||
; GCN: v_writelane_b32 v41, s33, 15
|
||||
; MUBUF: buffer_store_dword v40
|
||||
; FLATSCR: scratch_store_dword off, v40
|
||||
; GCN: v_writelane_b32 v40, s33, 4
|
||||
|
||||
; GCN: s_mov_b32 s33, s32
|
||||
; FLATSCR: s_add_u32 s32, s32, 16
|
||||
; FLATSCR: scratch_store_dword off, v40
|
||||
; MUBUF: s_add_u32 s32, s32, 0x400
|
||||
; FLATSCR: s_add_u32 s32, s32, 16
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-DAG: s_swappc_b64
|
||||
; GCN-NEXT: s_swappc_b64
|
||||
|
||||
; GCN: v_readlane_b32 s33, v41, 15
|
||||
; MUBUF: buffer_load_dword v41
|
||||
; FLATSCR: scratch_load_dword v41
|
||||
; GCN: v_readlane_b32 s33, v40, 4
|
||||
; MUBUF: buffer_load_dword v40
|
||||
; FLATSCR: scratch_load_dword v40
|
||||
define void @test_func_call_external_void_funcx2() #0 {
|
||||
call void @external_void_func_void()
|
||||
call void @external_void_func_void()
|
||||
|
@ -160,7 +124,7 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace
|
|||
|
||||
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
|
||||
; GCN: v_mov_b32_e32 v40, v31
|
||||
; GCN-DAG: s_swappc_b64
|
||||
; GCN-NEXT: s_swappc_b64
|
||||
; GCN-NEXT: v_mov_b32_e32 v31, v40
|
||||
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
|
||||
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
|
||||
|
@ -172,18 +136,18 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
|
|||
; FIXME: What is the expected behavior for reserved registers here?
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
|
||||
; MUBUF: s_getpc_b64 s[18:19]
|
||||
; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
|
||||
; FLATSCR: s_getpc_b64 s[16:17]
|
||||
; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
|
||||
; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
|
||||
; MUBUF: s_getpc_b64 s[4:5]
|
||||
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; FLATSCR: s_getpc_b64 s[0:1]
|
||||
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
|
||||
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
|
||||
; GCN: s_mov_b32 s32, 0
|
||||
; GCN: #ASMSTART
|
||||
; GCN-NEXT: ; def s33
|
||||
; GCN-NEXT: #ASMEND
|
||||
; MUBUF: s_swappc_b64 s[30:31], s[18:19]
|
||||
; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
|
||||
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
|
||||
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GCN: ;;#ASMSTART
|
||||
; GCN-NEXT: ; use s33
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
|
@ -199,12 +163,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
|
|||
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
|
||||
; GCN-NOT: s34
|
||||
|
||||
; MUBUF: s_getpc_b64 s[18:19]
|
||||
; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
|
||||
; FLATSCR: s_getpc_b64 s[16:17]
|
||||
; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
|
||||
; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
|
||||
; MUBUF: s_getpc_b64 s[4:5]
|
||||
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; FLATSCR: s_getpc_b64 s[0:1]
|
||||
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
|
||||
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
|
||||
; GCN: s_mov_b32 s32, 0
|
||||
|
||||
; GCN-NOT: s34
|
||||
|
@ -213,8 +177,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
|
|||
; GCN-NEXT: ;;#ASMEND
|
||||
|
||||
; GCN-NOT: s34
|
||||
; MUBUF: s_swappc_b64 s[30:31], s[18:19]
|
||||
; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
|
||||
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
|
||||
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
|
||||
|
||||
; GCN-NOT: s34
|
||||
|
||||
|
@ -232,12 +196,12 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
|
|||
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
|
||||
|
||||
; GCN-NOT: v32
|
||||
; MUBUF: s_getpc_b64 s[18:19]
|
||||
; MUBUF-NEXT: s_add_u32 s18, s18, external_void_func_void@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s19, s19, external_void_func_void@rel32@hi+12
|
||||
; FLATSCR: s_getpc_b64 s[16:17]
|
||||
; FLATSCR-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
|
||||
; FLATSCR-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
|
||||
; MUBUF: s_getpc_b64 s[4:5]
|
||||
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; FLATSCR: s_getpc_b64 s[0:1]
|
||||
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
|
||||
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
|
||||
; GCN: s_mov_b32 s32, 0
|
||||
; GCN-NOT: v40
|
||||
|
||||
|
@ -245,8 +209,8 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
|
|||
; GCN-NEXT: ; def v40
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
|
||||
; MUBUF: s_swappc_b64 s[30:31], s[18:19]
|
||||
; FLATSCR: s_swappc_b64 s[30:31], s[16:17]
|
||||
; MUBUF: s_swappc_b64 s[30:31], s[4:5]
|
||||
; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
|
||||
|
||||
; GCN-NOT: v40
|
||||
|
||||
|
|
|
@ -5,30 +5,20 @@
|
|||
define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
|
||||
; GCN-LABEL: call_memory_arg_load:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GCN-NEXT: s_mov_b32 s12, s14
|
||||
; GCN-NEXT: s_load_dword s14, s[8:9], 0x0
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s17
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_add_u32 s8, s8, 8
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, s14
|
||||
; GCN-NEXT: ds_read_b32 v3, v3
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GCN-NEXT: s_mov_b32 s13, s15
|
||||
; GCN-NEXT: s_mov_b32 s14, s16
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GCN-NEXT: s_getpc_b64 s[18:19]
|
||||
; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GCN-NEXT: s_endpgm
|
||||
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s9
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: ds_read_b32 v0, v0
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
|
||||
call void @func(i32 %vgpr)
|
||||
ret void
|
||||
|
@ -38,29 +28,21 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0
|
|||
define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
|
||||
; GCN-LABEL: call_memory_no_dep:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GCN-NEXT: s_mov_b32 s13, s15
|
||||
; GCN-NEXT: s_mov_b32 s12, s14
|
||||
; GCN-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s17
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_add_u32 s8, s8, 16
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: global_store_dword v3, v3, s[14:15]
|
||||
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GCN-NEXT: s_mov_b32 s14, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[18:19]
|
||||
; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GCN-NEXT: s_endpgm
|
||||
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: global_store_dword v0, v0, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[6:7]
|
||||
; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GCN-NEXT: s_endpgm
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
call void @func(i32 0)
|
||||
ret void
|
||||
|
@ -69,29 +51,21 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 {
|
|||
; Should not wait after the call before memory
|
||||
define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 {
|
||||
; GCN-LABEL: call_no_wait_after_call:
|
||||
; GCN: %bb.0:
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s17
|
||||
; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_add_u32 s8, s8, 16
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GCN-NEXT: s_mov_b32 s12, s14
|
||||
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GCN-NEXT: s_mov_b32 s13, s15
|
||||
; GCN-NEXT: s_mov_b32 s14, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[18:19]
|
||||
; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, 0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GCN-NEXT: global_store_dword v40, v40, s[34:35]
|
||||
; GCN-NEXT: s_endpgm
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
|
||||
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s9
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, 0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: global_store_dword v40, v40, s[34:35]
|
||||
; GCN-NEXT: s_endpgm
|
||||
call void @func(i32 0)
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
ret void
|
||||
|
@ -100,28 +74,20 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
|
|||
define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 {
|
||||
; GCN-LABEL: call_no_wait_after_call_return_val:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s17
|
||||
; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_add_u32 s8, s8, 16
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GCN-NEXT: s_mov_b32 s12, s14
|
||||
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GCN-NEXT: s_mov_b32 s13, s15
|
||||
; GCN-NEXT: s_mov_b32 s14, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[18:19]
|
||||
; GCN-NEXT: s_add_u32 s18, s18, func.return@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s19, s19, func.return@rel32@hi+12
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, 0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GCN-NEXT: global_store_dword v40, v0, s[34:35]
|
||||
; GCN-NEXT: s_endpgm
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
|
||||
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s9
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, 0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: global_store_dword v40, v0, s[34:35]
|
||||
; GCN-NEXT: s_endpgm
|
||||
%rv = call i32 @func.return(i32 0)
|
||||
store i32 %rv, i32 addrspace(1)* %ptr
|
||||
ret void
|
||||
|
@ -131,27 +97,19 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)*
|
|||
define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
|
||||
; GCN-LABEL: call_got_load:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s17
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_add_u32 s8, s8, 16
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GCN-NEXT: s_mov_b32 s13, s15
|
||||
; GCN-NEXT: s_mov_b32 s12, s14
|
||||
; GCN-NEXT: s_getpc_b64 s[14:15]
|
||||
; GCN-NEXT: s_add_u32 s14, s14, got.func@gotpcrel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s15, s15, got.func@gotpcrel32@hi+12
|
||||
; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GCN-NEXT: s_mov_b32 s14, s16
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GCN-NEXT: s_endpgm
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s9
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
|
||||
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: s_endpgm
|
||||
call void @got.func(i32 0)
|
||||
ret void
|
||||
}
|
||||
|
@ -160,14 +118,14 @@ define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 {
|
|||
define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 {
|
||||
; GCN-LABEL: tailcall_got_load:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, got.func@gotpcrel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, got.func@gotpcrel32@hi+12
|
||||
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
|
||||
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
tail call void @got.func(i32 0)
|
||||
ret void
|
||||
}
|
||||
|
@ -176,12 +134,12 @@ define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 {
|
|||
define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
|
||||
; GCN-LABEL: tail_call_memory_arg_load:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_read_b32 v0, v0
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, func@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, func@rel32@hi+12
|
||||
; GCN-NEXT: s_setpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_read_b32 v0, v0
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
|
||||
tail call void @func(i32 %vgpr)
|
||||
ret void
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-fixed-function-abi=0 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
; GCN-LABEL: {{^}}use_dispatch_ptr:
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[4:5]
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
|
||||
; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
|
||||
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -15,7 +15,7 @@ define void @use_workitem_id_x() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_y:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
|
||||
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
|
||||
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -27,7 +27,7 @@ define void @use_workitem_id_y() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_z:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
|
||||
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
|
||||
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -39,10 +39,9 @@ define void @use_workitem_id_z() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_xy:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -56,13 +55,11 @@ define void @use_workitem_id_xy() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_xyz:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -78,10 +75,9 @@ define void @use_workitem_id_xyz() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_xz:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -95,10 +91,9 @@ define void @use_workitem_id_xz() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_yz:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -112,9 +107,11 @@ define void @use_workitem_id_yz() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
|
||||
|
||||
; GCN-NOT: v0
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-NOT: v0
|
||||
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 0
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
|
||||
call void @use_workitem_id_x()
|
||||
ret void
|
||||
|
@ -122,10 +119,14 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
|
||||
|
||||
; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; UNPACKED-TID: v_lshlrev_b32_e32 v0, 10, v1
|
||||
; UNPACKED-TID-NOT: v0
|
||||
; UNPACKED-TID-NOT: v1
|
||||
; GCN: s_swappc_b64
|
||||
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 1
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
|
||||
call void @use_workitem_id_y()
|
||||
ret void
|
||||
|
@ -133,7 +134,11 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
|
||||
|
||||
; UNPACKED-TID: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v2
|
||||
; UNPACKED-TID: v_lshlrev_b32_e32 v0, 20, v2
|
||||
; UNPACKED-TID-NOT: v0
|
||||
; UNPACKED-TID-NOT: v1
|
||||
; GCN: s_swappc_b64
|
||||
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 2
|
||||
|
@ -147,6 +152,8 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
|
|||
; UNPACKED-TID-NOT: v1
|
||||
; UNPACKED-TID: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
|
||||
call void @use_workitem_id_xy()
|
||||
|
@ -157,7 +164,9 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
|
|||
; UNPACKED-TID-NOT: v0
|
||||
; UNPACKED-TID-NOT: v2
|
||||
; UNPACKED-TID: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]]
|
||||
; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDZ]]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v2
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
|
||||
call void @use_workitem_id_xz()
|
||||
|
@ -169,9 +178,9 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
|
|||
; UNPACKED-TID-NOT: v2
|
||||
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]]
|
||||
; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]]
|
||||
; UNPACKED-TID: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
|
||||
; GCN-NOT: v1
|
||||
; GCN-NOT: v2
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
|
||||
call void @use_workitem_id_yz()
|
||||
|
@ -185,7 +194,8 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
|
|||
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDY]]
|
||||
; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, [[IDZ]]
|
||||
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN-NOT: v2
|
||||
; GCN: s_swappc_b64
|
||||
|
@ -223,8 +233,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
|
||||
%val = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -235,9 +245,8 @@ define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
|
||||
%val = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
|
@ -248,9 +257,8 @@ define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
|
||||
%val = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
|
@ -262,10 +270,11 @@ define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
|
||||
|
||||
; GCN: v_mov_b32_e32 v1, v0
|
||||
; GCN: v_mov_b32_e32 v0, 0x22b
|
||||
; GCN: s_swappc_b64
|
||||
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 0
|
||||
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
|
||||
call void @other_arg_use_workitem_id_x(i32 555)
|
||||
ret void
|
||||
|
@ -275,13 +284,14 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
|
|||
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
|
||||
|
||||
; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; PACKED-TID: v_mov_b32_e32 v31, v0
|
||||
; PACKED-TID: v_mov_b32_e32 v1, v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: v_mov_b32_e32 v0, 0x22b
|
||||
; GCN-NOT: v1
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-NOT: v0
|
||||
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 1
|
||||
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
|
||||
call void @other_arg_use_workitem_id_y(i32 555)
|
||||
ret void
|
||||
|
@ -290,8 +300,8 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
|
|||
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
|
||||
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; PACKED-TID-DAG: v_mov_b32_e32 v31, v0
|
||||
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 20, v2
|
||||
; PACKED-TID-DAG: v_mov_b32_e32 v1, v0
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-NOT: v0
|
||||
|
||||
|
@ -302,10 +312,9 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
|
||||
; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; GCN: v_and_b32_e32 v31, 0x3ff, v31
|
||||
; GFX7: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v0
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; GCN: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
|
||||
; GCN: s_setpc_b64
|
||||
define void @too_many_args_use_workitem_id_x(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
|
@ -357,11 +366,10 @@ define void @too_many_args_use_workitem_id_x(
|
|||
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
|
||||
|
||||
; GCN: s_mov_b32 s32, 0
|
||||
; GFX7: buffer_store_dword v3, off, s[0:3], s32{{$}}
|
||||
; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
|
||||
; GCN: s_swappc_b64
|
||||
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 0
|
||||
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
|
||||
call void @too_many_args_use_workitem_id_x(
|
||||
i32 10, i32 20, i32 30, i32 40,
|
||||
|
@ -377,7 +385,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
|
||||
; GCN: s_mov_b32 s33, s32
|
||||
; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
|
||||
; GCN: s_swappc_b64
|
||||
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
|
||||
store volatile i32 %arg0, i32 addrspace(1)* undef
|
||||
|
@ -425,13 +433,13 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
|
|||
; frame[2] = VGPR spill slot
|
||||
|
||||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
|
||||
; GFX7: buffer_load_dword v0, off, s[0:3], s32
|
||||
; GFX90A: buffer_load_dword v32, off, s[0:3], s32
|
||||
; GFX7: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
||||
; GFX90A: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
||||
; GCN-DAG: s_waitcnt
|
||||
; GFX7: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GFX90A: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32,
|
||||
; GFX7: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; GFX90A: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; GFX7: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
|
||||
; GFX90A: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
|
||||
; GCN: s_setpc_b64
|
||||
define void @too_many_args_use_workitem_id_x_byval(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
|
@ -486,18 +494,17 @@ define void @too_many_args_use_workitem_id_x_byval(
|
|||
|
||||
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
|
||||
; GCN-DAG: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
|
||||
; GCN-DAG: s_movk_i32 s32, 0x400
|
||||
|
||||
; GFX7: buffer_store_dword v3, off, s[0:3], s32
|
||||
; GFX90A: buffer_store_dword v0, off, s[0:3], s32
|
||||
; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
|
||||
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
||||
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
|
||||
|
||||
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4
|
||||
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
|
||||
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
|
||||
; GCN: s_swappc_b64
|
||||
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 2
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 0
|
||||
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
|
||||
%alloca = alloca i32, align 4, addrspace(5)
|
||||
store volatile i32 999, i32 addrspace(5)* %alloca
|
||||
|
@ -515,12 +522,11 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
|
||||
; GCN: buffer_store_dword v40, off, s[0:3], s32 offset:4
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
|
||||
; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
|
||||
; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
|
||||
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
|
||||
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4
|
||||
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
|
||||
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
|
||||
; GCN: s_swappc_b64
|
||||
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
||||
|
@ -541,20 +547,21 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
|
||||
; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; GFX90A: v_and_b32_e32 v33, 0x3ff, v31
|
||||
; GFX90A: v_bfe_u32 v33, v31, 10, 10
|
||||
; GCN90A: v_bfe_u32 v31, v31, 20, 10
|
||||
; GFX7: v_and_b32_e32 v32, 0x3ff, v31
|
||||
; GFX7: v_bfe_u32 v32, v31, 10, 10
|
||||
; GCN7: v_bfe_u32 v31, v31, 20, 10
|
||||
; GFX7: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v12
|
||||
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v30{{$}}
|
||||
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v0{{$}}
|
||||
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v29, off{{$}}
|
||||
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v30, off{{$}}
|
||||
; GFX90A: v_and_b32_e32 v33, 0x3ff, v32
|
||||
; GFX90A: v_bfe_u32 v34, v32, 10, 10
|
||||
; GCN90A: v_bfe_u32 v32, v32, 20, 10
|
||||
; GFX7: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; GFX7: v_and_b32_e32 v33, 0x3ff, v32
|
||||
; GFX7: v_bfe_u32 v33, v32, 10, 10
|
||||
; GCN7: v_bfe_u32 v32, v32, 20, 10
|
||||
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33{{$}}
|
||||
; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}}
|
||||
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v33, off{{$}}
|
||||
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v34, off{{$}}
|
||||
; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off{{$}}
|
||||
|
||||
; GFX7-COUNT-32: flat_store_dword v{{\[[0-9]+:[0-9]+]}}
|
||||
; GFX90A-COUNT-32: global_store_dword v{{\[[0-9]+:[0-9]+]}}
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @too_many_args_use_workitem_id_xyz(
|
||||
|
@ -617,11 +624,11 @@ define void @too_many_args_use_workitem_id_xyz(
|
|||
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1
|
||||
; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2
|
||||
; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v2
|
||||
; PACKED-TID-NOT: v0
|
||||
; PACKED-TID-NOT: v1
|
||||
; PACKED-TID-NOT: v2
|
||||
; GFX7: buffer_store_dword v3, off, s[0:3], s32{{$}}
|
||||
; GFX90A: buffer_store_dword v1, off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
|
||||
; GCN: s_swappc_b64
|
||||
|
||||
; GCN: .amdhsa_system_vgpr_workitem_id 2
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s
|
||||
; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s
|
||||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
|
||||
; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
|
||||
; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -14,7 +16,8 @@ define void @use_workitem_id_x() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_y:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
|
||||
; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
|
||||
; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
|
||||
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -26,7 +29,8 @@ define void @use_workitem_id_y() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_z:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
|
||||
; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
|
||||
; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
|
||||
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
|
@ -38,9 +42,11 @@ define void @use_workitem_id_z() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_xy:
|
||||
; GCN: s_waitcnt
|
||||
; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
|
||||
|
@ -57,10 +63,13 @@ define void @use_workitem_id_xy() #1 {
|
|||
; GCN-LABEL: {{^}}use_workitem_id_xyz:
|
||||
; GCN: s_waitcnt
|
||||
|
||||
; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
|
||||
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
|
@ -80,9 +89,11 @@ define void @use_workitem_id_xyz() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_xz:
|
||||
; GCN: s_waitcnt
|
||||
; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
|
||||
|
@ -98,9 +109,11 @@ define void @use_workitem_id_xz() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_yz:
|
||||
; GCN: s_waitcnt
|
||||
; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
|
||||
|
@ -115,31 +128,38 @@ define void @use_workitem_id_yz() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
; VARABI: enable_vgpr_workitem_id = 0
|
||||
; FIXEDABI: enable_vgpr_workitem_id = 2
|
||||
|
||||
; FIXEDA-NOT: v0
|
||||
; VARABI-NOT: v31
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-NOT: v0
|
||||
; FIXEDABI-NOT: v0
|
||||
; VARABI-NOT: v31
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
|
||||
call void @use_workitem_id_x()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
; VARABI: enable_vgpr_workitem_id = 1
|
||||
; FIXEDABI: enable_vgpr_workitem_id = 2
|
||||
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; FIXEDABI-NOT: v0
|
||||
; FIXEDABI-NOT: v1
|
||||
|
||||
; VARABI-NOT: v31
|
||||
; VARABI: v_lshlrev_b32_e32 v0, 10, v1
|
||||
|
||||
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; FIXEDABI-NOT: v0
|
||||
; FIXEDABI-NOT: v1
|
||||
; VARABI-NOT: v31
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
|
||||
|
@ -150,11 +170,16 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
|
|||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
|
||||
; VARABI-NOT: v0
|
||||
; VARABI-NOT: v2
|
||||
; VARABI: v_lshlrev_b32_e32 v0, 20, v2
|
||||
; VARABI-NOT: v0
|
||||
; VARABI-NOT: v1
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
|
||||
|
@ -163,11 +188,17 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy:
|
||||
; VARABI-NOT: v0
|
||||
; VARABI-NOT: v1
|
||||
; VARABI: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; VARABI: v_or_b32_e32 v0, v0, [[IDY]]
|
||||
; VARABI-NOT: v0
|
||||
; VARABI-NOT: v1
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
|
||||
|
@ -176,12 +207,18 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz:
|
||||
; VARABI-NOT: v0
|
||||
; VARABI-NOT: v2
|
||||
; VARABI: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; VARABI: v_or_b32_e32 v0, v0, [[IDZ]]
|
||||
; VARABI-NOT: v0
|
||||
; VARABI-NOT: v2
|
||||
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
|
||||
|
@ -190,12 +227,19 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz:
|
||||
; VARABI-NOT: v1
|
||||
; VARABI-NOT: v2
|
||||
; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; VARABI: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
|
||||
; VARABI-NOT: v1
|
||||
; VARABI-NOT: v2
|
||||
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
|
||||
|
@ -204,11 +248,21 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz:
|
||||
; VARABI-NOT: v0
|
||||
; VARABI-NOT: v1
|
||||
; VARABI-NOT: v2
|
||||
; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDY]]
|
||||
; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
|
||||
; VARABI-NOT: v0
|
||||
; VARABI-NOT: v1
|
||||
; VARABI-NOT: v2
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 {
|
||||
|
@ -245,7 +299,8 @@ define void @func_indirect_use_workitem_id_z() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
|
||||
; VARABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
|
||||
; FIXEDABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
|
||||
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
|
@ -258,7 +313,8 @@ define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
|
||||
; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
|
||||
; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
|
||||
|
@ -270,7 +326,8 @@ define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
|
||||
; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
|
||||
; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
|
||||
|
@ -282,13 +339,16 @@ define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
|
|||
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
; VARABI: enable_vgpr_workitem_id = 0
|
||||
; FIXEDABI: enable_vgpr_workitem_id = 2
|
||||
|
||||
; VARABI: v_mov_b32_e32 v1, v0
|
||||
; VARABI: v_mov_b32_e32 v0, 0x22b
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
|
||||
|
@ -298,13 +358,20 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
|
|||
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
|
||||
; VARABI: enable_vgpr_workitem_id = 1
|
||||
|
||||
; VARABI: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; VARABI-NOT: v1
|
||||
; VARABI: v_mov_b32_e32 v0, 0x22b
|
||||
; VARABI-NOT: v1
|
||||
; VARABI: s_swappc_b64
|
||||
; VARABI-NOT: v0
|
||||
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI: enable_vgpr_workitem_id = 2
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
|
||||
call void @other_arg_use_workitem_id_y(i32 555)
|
||||
ret void
|
||||
|
@ -313,21 +380,29 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
|
|||
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
|
||||
; VARABI-DAG: v_mov_b32_e32 v0, 0x22b
|
||||
; VARABI-DAG: v_lshlrev_b32_e32 v1, 20, v2
|
||||
; VARABI: s_swappc_b64
|
||||
; VARABI-NOT: v0
|
||||
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
|
||||
call void @other_arg_use_workitem_id_z(i32 555)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
|
||||
; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; VARABI: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
; GCN: v_and_b32_e32 v31, 0x3ff, v31
|
||||
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
|
||||
; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
|
||||
; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
|
||||
define void @too_many_args_use_workitem_id_x(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
|
||||
|
@ -376,19 +451,23 @@ define void @too_many_args_use_workitem_id_x(
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
|
||||
; VARABI: enable_vgpr_workitem_id = 0
|
||||
|
||||
; VARABI: s_mov_b32 s32, 0
|
||||
; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}}
|
||||
; VARABI: s_swappc_b64
|
||||
|
||||
|
||||
; FIXEDABI: enable_vgpr_workitem_id = 2
|
||||
; FIXEDABI-DAG: s_mov_b32 s32, 0
|
||||
; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
|
||||
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
; GCN-DAG: s_mov_b32 s32, 0
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
; FIXEDABI: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
|
||||
call void @too_many_args_use_workitem_id_x(
|
||||
i32 10, i32 20, i32 30, i32 40,
|
||||
|
@ -403,13 +482,15 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
|
||||
; VARABI: s_mov_b32 s33, s32
|
||||
; VARABI: buffer_store_dword v1, off, s[0:3], s32{{$}}
|
||||
|
||||
; Touching the workitem id register is not necessary.
|
||||
; GCN-NOT: v31
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
|
||||
; GCN-NOT: v31
|
||||
; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
|
||||
; GCN-NOT: v31
|
||||
; FIXEDABI-NOT: v31
|
||||
; FIXEDABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
|
||||
; FIXEDABI-NOT: v31
|
||||
; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
|
||||
; FIXEDABI-NOT: v31
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
|
||||
|
@ -458,15 +539,21 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
|
|||
; frame[2] = VGPR spill slot
|
||||
|
||||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
|
||||
; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
||||
; VARABI-NEXT: s_waitcnt
|
||||
; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
|
||||
; VARABI: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
|
||||
; GCN: v_and_b32_e32 v31, 0x3ff, v31
|
||||
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
|
||||
; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
|
||||
; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
|
||||
|
||||
; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; GCN: s_setpc_b64
|
||||
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; FIXEDABI: s_setpc_b64
|
||||
define void @too_many_args_use_workitem_id_x_byval(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
|
||||
|
@ -520,27 +607,36 @@ define void @too_many_args_use_workitem_id_x_byval(
|
|||
; sp[2] = stack passed workitem ID x
|
||||
|
||||
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
|
||||
; VARABI: enable_vgpr_workitem_id = 0
|
||||
; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
|
||||
; VARABI: s_movk_i32 s32, 0x400{{$}}
|
||||
; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
|
||||
; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
||||
; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
|
||||
|
||||
; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
|
||||
; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
|
||||
; VARABI: s_swappc_b64
|
||||
|
||||
|
||||
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
|
||||
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
|
||||
; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
|
||||
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
|
||||
|
||||
; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
|
||||
; GCN: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
|
||||
; GCN: s_movk_i32 s32, 0x400{{$}}
|
||||
; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
|
||||
|
||||
; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
|
||||
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
|
||||
|
||||
; FIXME: Why this reload?
|
||||
; GCN: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
|
||||
; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}}
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; GCN: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
|
||||
; GCN-NOT: s32
|
||||
; GCN: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
|
||||
; GCN: s_swappc_b64
|
||||
; FIXEDABI-NOT: s32
|
||||
; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
|
||||
; FIXEDABI: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
|
||||
%alloca = alloca i32, align 4, addrspace(5)
|
||||
store volatile i32 999, i32 addrspace(5)* %alloca
|
||||
|
@ -558,19 +654,26 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
|
||||
; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
|
||||
; VARABI: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
|
||||
; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
||||
; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
|
||||
; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
|
||||
; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
|
||||
; VARABI: s_swappc_b64
|
||||
|
||||
|
||||
; FIXEDABI-NOT: v31
|
||||
; GCN: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
|
||||
; GCN: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
|
||||
; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
|
||||
; GCN: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
|
||||
; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
|
||||
; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
|
||||
; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
|
||||
; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
|
||||
; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
|
||||
|
||||
; FIXEDABI-NOT: v31
|
||||
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
|
||||
; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
|
||||
; FIXEDABI-NOT: v31
|
||||
; GCN: s_swappc_b64
|
||||
; FIXEDABI: s_swappc_b64
|
||||
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
||||
%alloca = alloca i32, align 4, addrspace(5)
|
||||
store volatile i32 999, i32 addrspace(5)* %alloca
|
||||
|
@ -588,17 +691,29 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
|
||||
; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
|
||||
; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; VARABI-NOT: buffer_load_dword
|
||||
|
||||
; VARABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32
|
||||
; VARABI-NOT: buffer_load_dword
|
||||
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
|
||||
; VARABI-NOT: buffer_load_dword
|
||||
; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10
|
||||
; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10
|
||||
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
|
||||
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
|
||||
|
||||
; GCN: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-NOT: buffer_load_dword
|
||||
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
|
||||
; GCN-NOT: buffer_load_dword
|
||||
; GCN: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
|
||||
; GCN-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
|
||||
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
|
||||
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
|
||||
; FIXEDABI-NOT: buffer_load_dword
|
||||
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
|
||||
; FIXEDABI-NOT: buffer_load_dword
|
||||
; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
|
||||
; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
|
||||
; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
|
||||
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
|
||||
define void @too_many_args_use_workitem_id_xyz(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
|
@ -659,10 +774,12 @@ define void @too_many_args_use_workitem_id_xyz(
|
|||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
|
||||
; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
|
||||
; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]]
|
||||
; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}}
|
||||
|
||||
; GCN-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
|
||||
; GCN: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
|
||||
; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
|
||||
; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
|
||||
; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
|
||||
|
|
|
@ -64,64 +64,45 @@ entry:
|
|||
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
|
||||
; GFX803-LABEL: test_kern_call:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_add_u32 s12, s12, s17
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX803-NEXT: s_add_u32 s4, s4, s7
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX803-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX803-NEXT: s_mov_b32 s12, s14
|
||||
; GFX803-NEXT: s_mov_b32 s13, s15
|
||||
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
|
||||
; GFX803-NEXT: s_mov_b32 s14, s16
|
||||
; GFX803-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_mov_b32 s32, 0
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_kern_call:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: s_mov_b32 s12, s14
|
||||
; GFX900-NEXT: s_mov_b32 s13, s15
|
||||
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GFX900-NEXT: s_mov_b32 s14, s16
|
||||
; GFX900-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_mov_b32 s32, 0
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
|
||||
;
|
||||
; GFX1010-LABEL: test_kern_call:
|
||||
; GFX1010: ; %bb.0: ; %entry
|
||||
; GFX1010-NEXT: s_add_u32 s12, s12, s17
|
||||
; GFX1010-NEXT: s_mov_b32 s32, 0
|
||||
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
|
||||
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_mov_b32 s12, s14
|
||||
; GFX1010-NEXT: s_mov_b32 s13, s15
|
||||
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GFX1010-NEXT: s_mov_b32 s14, s16
|
||||
; GFX1010-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, s7
|
||||
; GFX1010-NEXT: s_mov_b32 s32, 0
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
tail call void @ex() #0
|
||||
ret void
|
||||
|
@ -130,73 +111,54 @@ entry:
|
|||
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
|
||||
; GFX803-LABEL: test_kern_stack_and_call:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_add_u32 s12, s12, s17
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX803-NEXT: s_add_u32 s4, s4, s7
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX803-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX803-NEXT: s_mov_b32 s12, s14
|
||||
; GFX803-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX803-NEXT: s_mov_b32 s13, s15
|
||||
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
|
||||
; GFX803-NEXT: s_mov_b32 s14, s16
|
||||
; GFX803-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX803-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_kern_stack_and_call:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: s_mov_b32 s12, s14
|
||||
; GFX900-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX900-NEXT: s_mov_b32 s13, s15
|
||||
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GFX900-NEXT: s_mov_b32 s14, s16
|
||||
; GFX900-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
|
||||
;
|
||||
; GFX1010-LABEL: test_kern_stack_and_call:
|
||||
; GFX1010: ; %bb.0: ; %entry
|
||||
; GFX1010-NEXT: s_add_u32 s12, s12, s17
|
||||
; GFX1010-NEXT: s_movk_i32 s32, 0x200
|
||||
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
|
||||
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_mov_b32 s12, s14
|
||||
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GFX1010-NEXT: s_mov_b32 s13, s15
|
||||
; GFX1010-NEXT: s_mov_b32 s14, s16
|
||||
; GFX1010-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
; GFX1010: ; %bb.0: ; %entry
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, s7
|
||||
; GFX1010-NEXT: s_movk_i32 s32, 0x200
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
%x = alloca i32, align 4, addrspace(5)
|
||||
store volatile i32 0, i32 addrspace(5)* %x, align 4
|
||||
|
@ -209,7 +171,7 @@ define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
|
|||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_mov_b32 s33, 0
|
||||
; GFX803-NEXT: s_endpgm
|
||||
|
||||
;
|
||||
; GFX900-LABEL: test_force_fp_kern_empty:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_mov_b32 s33, 0
|
||||
|
@ -271,67 +233,48 @@ entry:
|
|||
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
|
||||
; GFX803-LABEL: test_force_fp_kern_call:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_add_u32 s12, s12, s17
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX803-NEXT: s_add_u32 s4, s4, s7
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX803-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX803-NEXT: s_mov_b32 s12, s14
|
||||
; GFX803-NEXT: s_mov_b32 s13, s15
|
||||
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
|
||||
; GFX803-NEXT: s_mov_b32 s14, s16
|
||||
; GFX803-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_mov_b32 s32, 0
|
||||
; GFX803-NEXT: s_mov_b32 s33, 0
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_force_fp_kern_call:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: s_mov_b32 s12, s14
|
||||
; GFX900-NEXT: s_mov_b32 s13, s15
|
||||
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GFX900-NEXT: s_mov_b32 s14, s16
|
||||
; GFX900-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_mov_b32 s32, 0
|
||||
; GFX900-NEXT: s_mov_b32 s33, 0
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1010-LABEL: test_force_fp_kern_call:
|
||||
; GFX1010: ; %bb.0: ; %entry
|
||||
; GFX1010-NEXT: s_add_u32 s12, s12, s17
|
||||
; GFX1010-NEXT: s_mov_b32 s32, 0
|
||||
; GFX1010-NEXT: s_mov_b32 s33, 0
|
||||
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
|
||||
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_mov_b32 s12, s14
|
||||
; GFX1010-NEXT: s_mov_b32 s13, s15
|
||||
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GFX1010-NEXT: s_mov_b32 s14, s16
|
||||
; GFX1010-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, s7
|
||||
; GFX1010-NEXT: s_mov_b32 s32, 0
|
||||
; GFX1010-NEXT: s_mov_b32 s33, 0
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
tail call void @ex() #2
|
||||
ret void
|
||||
|
@ -340,76 +283,57 @@ entry:
|
|||
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
|
||||
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_add_u32 s12, s12, s17
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX803-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX803-NEXT: s_mov_b32 s12, s14
|
||||
; GFX803-NEXT: s_add_u32 s4, s4, s7
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX803-NEXT: s_mov_b32 s33, 0
|
||||
; GFX803-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX803-NEXT: s_mov_b32 s13, s15
|
||||
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
|
||||
; GFX803-NEXT: s_mov_b32 s14, s16
|
||||
; GFX803-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX803-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: s_mov_b32 s12, s14
|
||||
; GFX900-NEXT: s_mov_b32 s33, 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX900-NEXT: s_mov_b32 s13, s15
|
||||
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GFX900-NEXT: s_mov_b32 s14, s16
|
||||
; GFX900-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
|
||||
; GFX1010: ; %bb.0: ; %entry
|
||||
; GFX1010-NEXT: s_add_u32 s12, s12, s17
|
||||
; GFX1010-NEXT: s_movk_i32 s32, 0x200
|
||||
; GFX1010-NEXT: s_mov_b32 s33, 0
|
||||
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
|
||||
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_mov_b32 s12, s14
|
||||
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GFX1010-NEXT: s_mov_b32 s13, s15
|
||||
; GFX1010-NEXT: s_mov_b32 s14, s16
|
||||
; GFX1010-NEXT: s_getpc_b64 s[18:19]
|
||||
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, s7
|
||||
; GFX1010-NEXT: s_movk_i32 s32, 0x200
|
||||
; GFX1010-NEXT: s_mov_b32 s33, 0
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
|
||||
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
%x = alloca i32, align 4, addrspace(5)
|
||||
store volatile i32 0, i32 addrspace(5)* %x, align 4
|
||||
|
|
|
@ -27,18 +27,18 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
|
|||
; GCN-LABEL: call_split_type_used_outside_block_v2f32:
|
||||
; GCN: ; %bb.0: ; %bb0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
|
@ -59,29 +59,30 @@ bb1:
|
|||
|
||||
define float @call_split_type_used_outside_block_v3f32() #0 {
|
||||
; GCN-LABEL: call_split_type_used_outside_block_v3f32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
; GCN: ; %bb.0: ; %bb0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
bb0:
|
||||
%split.ret.type = call <3 x float> @func_v3f32()
|
||||
br label %bb1
|
||||
|
||||
|
@ -93,29 +94,28 @@ bb1:
|
|||
define half @call_split_type_used_outside_block_v4f16() #0 {
|
||||
; GCN-LABEL: call_split_type_used_outside_block_v4f16:
|
||||
; GCN: ; %bb.0: ; %bb0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
bb0:
|
||||
%split.ret.type = call <4 x half> @func_v4f16()
|
||||
br label %bb1
|
||||
|
@ -128,29 +128,29 @@ bb1:
|
|||
define { i32, half } @call_split_type_used_outside_block_struct() #0 {
|
||||
; GCN-LABEL: call_split_type_used_outside_block_struct:
|
||||
; GCN: ; %bb.0: ; %bb0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v4
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v4
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
bb0:
|
||||
%split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
|
||||
br label %bb1
|
||||
|
@ -168,40 +168,32 @@ bb1:
|
|||
define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
|
||||
; GCN-LABEL: v3i16_registers:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s17
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_and_b32 s12, 1, s12
|
||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
|
||||
; GCN-NEXT: s_cbranch_vccnz BB4_2
|
||||
; GCN-NEXT: ; %bb.1: ; %if.else
|
||||
; GCN-NEXT: s_add_u32 s8, s8, 8
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GCN-NEXT: s_mov_b32 s12, s14
|
||||
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GCN-NEXT: s_mov_b32 s13, s15
|
||||
; GCN-NEXT: s_mov_b32 s14, s16
|
||||
; GCN-NEXT: s_getpc_b64 s[18:19]
|
||||
; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GCN-NEXT: s_branch BB4_3
|
||||
; GCN-NEXT: BB4_2:
|
||||
; GCN-NEXT: s_mov_b32 s4, 0
|
||||
; GCN-NEXT: s_mov_b32 s5, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: BB4_3: ; %if.end
|
||||
; GCN-NEXT: global_store_short v[0:1], v1, off
|
||||
; GCN-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s9
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_and_b32 s4, 1, s4
|
||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_cbranch_vccnz BB4_2
|
||||
; GCN-NEXT: ; %bb.1: ; %if.else
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: s_branch BB4_3
|
||||
; GCN-NEXT: BB4_2:
|
||||
; GCN-NEXT: s_mov_b32 s4, 0
|
||||
; GCN-NEXT: s_mov_b32 s5, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: BB4_3: ; %if.end
|
||||
; GCN-NEXT: global_store_short v[0:1], v1, off
|
||||
; GCN-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
entry:
|
||||
br i1 %cond, label %if.then, label %if.else
|
||||
|
||||
|
@ -221,36 +213,32 @@ if.end: ; preds = %if.else, %if.then
|
|||
define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
|
||||
; GCN-LABEL: v3f16_registers:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s17
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_and_b32 s12, 1, s12
|
||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 1
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
|
||||
; GCN-NEXT: s_cbranch_vccnz BB5_2
|
||||
; GCN-NEXT: ; %bb.1: ; %if.else
|
||||
; GCN-NEXT: s_add_u32 s8, s8, 8
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GCN-NEXT: s_mov_b32 s12, s14
|
||||
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
|
||||
; GCN-NEXT: s_mov_b32 s13, s15
|
||||
; GCN-NEXT: s_mov_b32 s14, s16
|
||||
; GCN-NEXT: s_getpc_b64 s[18:19]
|
||||
; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; GCN-NEXT: s_branch BB5_3
|
||||
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
|
||||
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
|
||||
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s9
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_and_b32 s4, 1, s4
|
||||
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; GCN-NEXT: s_mov_b32 s32, 0
|
||||
; GCN-NEXT: s_cbranch_vccnz BB5_2
|
||||
; GCN-NEXT: ; %bb.1: ; %if.else
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: s_branch BB5_3
|
||||
; GCN-NEXT: BB5_2:
|
||||
; GCN-NEXT: s_mov_b32 s4, 0
|
||||
; GCN-NEXT: s_mov_b32 s5, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: s_mov_b32 s4, 0
|
||||
; GCN-NEXT: s_mov_b32 s5, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GCN-NEXT: BB5_3: ; %if.end
|
||||
; GCN-NEXT: global_store_short v[0:1], v1, off
|
||||
; GCN-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GCN-NEXT: s_endpgm
|
||||
entry:
|
||||
br i1 %cond, label %if.then, label %if.else
|
||||
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; Callee with an empty body. It is never called by name in this test: @direct
; stores its address to a stack slot and calls through the loaded pointer.
; The check expects the output definition to carry attribute group #0.
; GCN-LABEL: define internal void @indirect() #0 {
|
||||
define internal void @indirect() {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Non-kernel function that stores the address of @indirect to an alloca,
; reloads it, and calls through the loaded pointer, so the call's callee
; operand is a loaded value rather than a named function.
; The check expects the output definition to carry attribute group #1.
; GCN-LABEL: define internal void @direct() #1 {
|
||||
define internal void @direct() {
|
||||
%fptr = alloca void()*
|
||||
store void()* @indirect, void()** %fptr
|
||||
%fp = load void()*, void()** %fptr
|
||||
call void %fp()
|
||||
ret void
|
||||
}
|
||||
|
||||
; Kernel entry point whose only call is a direct call to @direct; the pointer
; call itself happens inside @direct.
; The check expects the output definition to carry attribute group #2.
; GCN-LABEL: define amdgpu_kernel void @test_direct_indirect_call() #2 {
|
||||
define amdgpu_kernel void @test_direct_indirect_call() {
|
||||
call void @direct()
|
||||
ret void
|
||||
}
|
||||
|
||||
; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
|
||||
; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
|
||||
; attributes #2 = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
|
|
@ -0,0 +1,22 @@
|
|||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; Callee with an empty body. Its address is stored and called through a loaded
; pointer by @test_simple_indirect_call; it is never called by name.
; The check expects the output definition to carry attribute group #0.
; GCN-LABEL: define internal void @indirect() #0 {
|
||||
define internal void @indirect() {
|
||||
ret void
|
||||
}
|
||||
|
||||
; Kernel that stores the address of @indirect to an alloca, reloads it, and
; calls through the loaded pointer. The input definition references attribute
; group #0; the check expects the output definition to reference group #1.
; GCN-LABEL: define amdgpu_kernel void @test_simple_indirect_call() #1 {
|
||||
define amdgpu_kernel void @test_simple_indirect_call() #0 {
|
||||
%fptr = alloca void()*
|
||||
store void()* @indirect, void()** %fptr
|
||||
%fp = load void()*, void()** %fptr
|
||||
call void %fp()
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-dispatch-id" }
|
||||
|
||||
; compiler modification to attributes
|
||||
attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
|
||||
attributes #1 = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
|
||||
|
|
@ -11,25 +11,25 @@ define float @fdiv_f32(float %a, float %b) #0 {
|
|||
; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
|
||||
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: %13:vgpr_32, %14:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %15:vgpr_32, %16:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %17:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
|
||||
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
|
||||
; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
|
||||
; GCN: %21:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %22:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %23:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %24:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %25:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %26:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %18:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %19:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
|
||||
; GCN: $vcc = COPY %14
|
||||
; GCN: %27:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec
|
||||
; GCN: %28:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: $vcc = COPY %7
|
||||
; GCN: %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec
|
||||
; GCN: %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN: $vgpr0 = COPY %28
|
||||
; GCN: $vgpr0 = COPY %21
|
||||
; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
|
||||
; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
|
||||
entry:
|
||||
|
@ -44,25 +44,25 @@ define float @fdiv_nnan_f32(float %a, float %b) #0 {
|
|||
; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
|
||||
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: %13:vgpr_32, %14:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %15:vgpr_32, %16:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %17:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
|
||||
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
|
||||
; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
|
||||
; GCN: %21:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %22:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %23:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %24:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %25:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %26:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %18:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: %19:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
|
||||
; GCN: $vcc = COPY %14
|
||||
; GCN: %27:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec
|
||||
; GCN: %28:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: $vcc = COPY %7
|
||||
; GCN: %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec
|
||||
; GCN: %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN: $vgpr0 = COPY %28
|
||||
; GCN: $vgpr0 = COPY %21
|
||||
; GCN: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
|
||||
; GCN: S_SETPC_B64_return [[COPY4]], implicit $vgpr0
|
||||
entry:
|
||||
|
|
|
@ -31,7 +31,7 @@ define hidden void @func() #1 {
|
|||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
|
||||
|
||||
; GCN: ; NumSgprs: 37
|
||||
; GCN: ; NumVgprs: 32
|
||||
; GCN: ; NumVgprs: 9
|
||||
define amdgpu_kernel void @kernel_call() #0 {
|
||||
%vgpr = load volatile i32, i32 addrspace(1)* undef
|
||||
tail call void @func()
|
||||
|
@ -53,7 +53,7 @@ define amdgpu_kernel void @kernel_call() #0 {
|
|||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
|
||||
|
||||
; GCN: ; NumSgprs: 32
|
||||
; GCN: ; NumVgprs: 32
|
||||
; GCN: ; NumVgprs: 9
|
||||
define void @func_regular_call() #1 {
|
||||
%vgpr = load volatile i32, i32 addrspace(1)* undef
|
||||
tail call void @func()
|
||||
|
@ -63,13 +63,13 @@ define void @func_regular_call() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}func_tail_call:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16,
|
||||
; GCN-NEXT: s_addc_u32 s17,
|
||||
; GCN-NEXT: s_setpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4,
|
||||
; GCN-NEXT: s_addc_u32 s5,
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
|
||||
; GCN: ; NumSgprs: 32
|
||||
; GCN: ; NumVgprs: 32
|
||||
; GCN: ; NumVgprs: 8
|
||||
define void @func_tail_call() #1 {
|
||||
tail call void @func()
|
||||
ret void
|
||||
|
@ -82,7 +82,7 @@ define void @func_tail_call() #1 {
|
|||
; GCN: s_setpc_b64
|
||||
|
||||
; GCN: ; NumSgprs: 32
|
||||
; GCN: ; NumVgprs: 32
|
||||
; GCN: ; NumVgprs: 9
|
||||
define void @func_call_tail_call() #1 {
|
||||
%vgpr = load volatile i32, i32 addrspace(1)* undef
|
||||
tail call void @func()
|
||||
|
|
|
@ -13,9 +13,9 @@ define void @func_use_lds_global() {
|
|||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NEXT: ds_write_b32 v0, v0
|
||||
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NEXT: s_trap 2
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -37,7 +37,7 @@ define void @func_use_lds_global_constexpr_cast() {
|
|||
; GFX8-LABEL: func_use_lds_global_constexpr_cast:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NEXT: s_trap 2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
|
|
@ -59,8 +59,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}func_implicitarg_ptr:
|
||||
; GCN: s_waitcnt
|
||||
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
|
||||
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @func_implicitarg_ptr() #0 {
|
||||
|
@ -72,8 +71,7 @@ define void @func_implicitarg_ptr() #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
|
||||
; GCN: s_waitcnt
|
||||
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
|
||||
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @opencl_func_implicitarg_ptr() #0 {
|
||||
|
@ -114,11 +112,10 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
|
|||
; HSA: kernarg_segment_byte_size = 112
|
||||
; MESA: kernarg_segment_byte_size = 128
|
||||
|
||||
; HSA: s_add_u32 s8, s8, 0x70
|
||||
; HSA: s_add_u32 s4, s4, 0x70
|
||||
; MESA: s_add_u32 s4, s4, 0x70
|
||||
|
||||
; HSA: s_addc_u32 s9, s9, 0{{$}}
|
||||
; MESA: s_addc_u32 s5, s5, 0{{$}}
|
||||
; GCN: s_addc_u32 s5, s5, 0{{$}}
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
|
||||
call void @func_implicitarg_ptr()
|
||||
|
@ -130,10 +127,8 @@ define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
|
|||
; HSA: kernarg_segment_byte_size = 160
|
||||
; MESA: kernarg_segment_byte_size = 128
|
||||
|
||||
; HSA: s_add_u32 s8, s8, 0x70
|
||||
; HSA: s_addc_u32 s9, s9, 0{{$}}
|
||||
; MESA: s_add_u32 s4, s4, 0x70
|
||||
; MESA: s_addc_u32 s5, s5, 0{{$}}
|
||||
; GCN: s_add_u32 s4, s4, 0x70
|
||||
; GCN: s_addc_u32 s5, s5, 0{{$}}
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
|
||||
call void @func_implicitarg_ptr()
|
||||
|
@ -141,24 +136,18 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
|
||||
; HSA-NOT: s8
|
||||
; HSA-NOT: s9
|
||||
; HSA-NOT: s[8:9]
|
||||
; MESA-NOT: s4
|
||||
; MESA-NOT: s5
|
||||
; MESA-NOT: s[4:5]
|
||||
; GCN-NOT: s4
|
||||
; GCN-NOT: s5
|
||||
; GCN-NOT: s[4:5]
|
||||
define void @func_call_implicitarg_ptr_func() #0 {
|
||||
call void @func_implicitarg_ptr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
|
||||
; HSA-NOT: s8
|
||||
; HSA-NOT: s9
|
||||
; HSA-NOT: s[8:9]
|
||||
; MESA-NOT: s4
|
||||
; MESA-NOT: s5
|
||||
; MESA-NOT: s[4:5]
|
||||
; GCN-NOT: s4
|
||||
; GCN-NOT: s5
|
||||
; GCN-NOT: s[4:5]
|
||||
define void @opencl_func_call_implicitarg_ptr_func() #0 {
|
||||
call void @func_implicitarg_ptr()
|
||||
ret void
|
||||
|
@ -168,8 +157,7 @@ define void @opencl_func_call_implicitarg_ptr_func() #0 {
|
|||
; GCN: s_waitcnt
|
||||
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
|
||||
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
|
||||
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
|
||||
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
define void @func_kernarg_implicitarg_ptr() #0 {
|
||||
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
|
||||
|
@ -185,8 +173,7 @@ define void @func_kernarg_implicitarg_ptr() #0 {
|
|||
; GCN: s_waitcnt
|
||||
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
|
||||
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
|
||||
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
|
||||
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; GCN: s_waitcnt lgkmcnt(0)
|
||||
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
|
||||
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
|
||||
|
@ -199,10 +186,8 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
|
||||
; HSA: s_add_u32 s8, s8, 0x70
|
||||
; HSA: s_addc_u32 s9, s9, 0
|
||||
; MESA: s_add_u32 s4, s4, 0x70
|
||||
; MESA: s_addc_u32 s5, s5, 0
|
||||
; GCN: s_add_u32 s4, s4, 0x70
|
||||
; GCN: s_addc_u32 s5, s5, 0
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
|
||||
call void @func_kernarg_implicitarg_ptr()
|
||||
|
|
|
@ -187,98 +187,49 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
|
|||
define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
|
||||
; GFX9-LABEL: slsr1_1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s33, 15
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s34, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s35, 1
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s36, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s37, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s38, 4
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s39, 5
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s40, 6
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s41, 7
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s42, 8
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s43, 9
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s44, 10
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: s_mov_b64 s[40:41], s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s46, 11
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s47, 12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_mov_b32_e32 v41, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v42, v0
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s30, 13
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s31, 14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v40, v31
|
||||
; GFX9-NEXT: s_mov_b32 s42, s14
|
||||
; GFX9-NEXT: s_mov_b32 s43, s13
|
||||
; GFX9-NEXT: s_mov_b32 s44, s12
|
||||
; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11]
|
||||
; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9]
|
||||
; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7]
|
||||
; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
|
||||
; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
|
||||
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
|
||||
; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37]
|
||||
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
|
||||
; GFX9-NEXT: s_mov_b32 s12, s44
|
||||
; GFX9-NEXT: s_mov_b32 s13, s43
|
||||
; GFX9-NEXT: s_mov_b32 s14, s42
|
||||
; GFX9-NEXT: v_mov_b32_e32 v31, v40
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v41
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v41, v43
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[40:41]
|
||||
; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39]
|
||||
; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37]
|
||||
; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
|
||||
; GFX9-NEXT: s_mov_b32 s12, s44
|
||||
; GFX9-NEXT: s_mov_b32 s13, s43
|
||||
; GFX9-NEXT: s_mov_b32 s14, s42
|
||||
; GFX9-NEXT: v_mov_b32_e32 v31, v40
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[46:47]
|
||||
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v44, 13
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v44, 14
|
||||
; GFX9-NEXT: v_readlane_b32 s47, v44, 12
|
||||
; GFX9-NEXT: v_readlane_b32 s46, v44, 11
|
||||
; GFX9-NEXT: v_readlane_b32 s44, v44, 10
|
||||
; GFX9-NEXT: v_readlane_b32 s43, v44, 9
|
||||
; GFX9-NEXT: v_readlane_b32 s42, v44, 8
|
||||
; GFX9-NEXT: v_readlane_b32 s41, v44, 7
|
||||
; GFX9-NEXT: v_readlane_b32 s40, v44, 6
|
||||
; GFX9-NEXT: v_readlane_b32 s39, v44, 5
|
||||
; GFX9-NEXT: v_readlane_b32 s38, v44, 4
|
||||
; GFX9-NEXT: v_readlane_b32 s37, v44, 3
|
||||
; GFX9-NEXT: v_readlane_b32 s36, v44, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s35, v44, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v44, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v44, 15
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s33, 4
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s34, 0
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s35, 1
|
||||
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: v_mov_b32_e32 v40, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v41, v0
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s30, 2
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
|
||||
; GFX9-NEXT: v_writelane_b32 v43, s31, 3
|
||||
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v40
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v40, v42
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v43, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v43, 3
|
||||
; GFX9-NEXT: v_readlane_b32 s35, v43, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v43, 0
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v43, 4
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
%b = and i32 %b.arg, 16777215
|
||||
%s = and i32 %s.arg, 16777215
|
||||
|
||||
|
|
|
@ -27,23 +27,23 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
|
|||
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s15, s33
|
||||
; CHECK-NEXT: s_mov_b32 s8, s33
|
||||
; CHECK-NEXT: s_mov_b32 s33, s32
|
||||
; CHECK-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; CHECK-NEXT: s_getpc_b64 s[18:19]
|
||||
; CHECK-NEXT: s_add_u32 s18, s18, callee_has_fp@rel32@lo+4
|
||||
; CHECK-NEXT: s_addc_u32 s19, s19, callee_has_fp@rel32@hi+12
|
||||
; CHECK-NEXT: s_getpc_b64 s[4:5]
|
||||
; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
|
||||
; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
|
||||
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_mov_b64 s[16:17], s[30:31]
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; CHECK-NEXT: s_mov_b64 s[6:7], s[30:31]
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; clobber csr v40
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; CHECK-NEXT: s_mov_b32 s33, s15
|
||||
; CHECK-NEXT: s_mov_b32 s33, s8
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: s_setpc_b64 s[16:17]
|
||||
; CHECK-NEXT: s_setpc_b64 s[6:7]
|
||||
bb:
|
||||
call fastcc void @callee_has_fp()
|
||||
call void asm sideeffect "; clobber csr v40", "~{v40}"()
|
||||
|
@ -53,15 +53,15 @@ bb:
|
|||
define amdgpu_kernel void @kernel_call() {
|
||||
; CHECK-LABEL: kernel_call:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, s17
|
||||
; CHECK-DAG: s_addc_u32 s1, s1, 0
|
||||
; CHECK-DAG: s_getpc_b64 s[18:19]
|
||||
; CHECK-DAG: s_add_u32 s18, s18, csr_vgpr_spill_fp_callee@rel32@lo+4
|
||||
; CHECK-DAG: s_addc_u32 s19, s19, csr_vgpr_spill_fp_callee@rel32@hi+12
|
||||
; CHECK-DAG: s_mov_b32 s32, 0
|
||||
; CHECK-DAG: s_swappc_b64 s[30:31], s[18:19]
|
||||
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
||||
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, s7
|
||||
; CHECK-NEXT: s_addc_u32 s1, s1, 0
|
||||
; CHECK-NEXT: s_getpc_b64 s[4:5]
|
||||
; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_callee@rel32@lo+4
|
||||
; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_callee@rel32@hi+12
|
||||
; CHECK-NEXT: s_mov_b32 s32, 0
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; CHECK-NEXT: s_endpgm
|
||||
bb:
|
||||
tail call fastcc void @csr_vgpr_spill_fp_callee()
|
||||
|
@ -73,23 +73,23 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
|
|||
; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; clobber csr v40
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: v_writelane_b32 v1, s33, 0
|
||||
; CHECK-NEXT: s_getpc_b64 s[16:17]
|
||||
; CHECK-NEXT: s_add_u32 s16, s16, callee_has_fp@rel32@lo+4
|
||||
; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
|
||||
; CHECK-NEXT: s_getpc_b64 s[4:5]
|
||||
; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
|
||||
; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
|
||||
; CHECK-NEXT: v_readlane_b32 s33, v1, 0
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
|
||||
; CHECK-NEXT: s_setpc_b64 s[16:17]
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; CHECK-NEXT: s_setpc_b64 s[4:5]
|
||||
bb:
|
||||
call void asm sideeffect "; clobber csr v40", "~{v40}"()
|
||||
tail call fastcc void @callee_has_fp()
|
||||
|
@ -99,15 +99,15 @@ bb:
|
|||
define amdgpu_kernel void @kernel_tailcall() {
|
||||
; CHECK-LABEL: kernel_tailcall:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
|
||||
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, s17
|
||||
; CHECK-DAG: s_addc_u32 s1, s1, 0
|
||||
; CHECK-DAG: s_getpc_b64 s[18:19]
|
||||
; CHECK-NEXT: s_add_u32 s18, s18, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
|
||||
; CHECK-NEXT: s_addc_u32 s19, s19, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
|
||||
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
|
||||
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, s7
|
||||
; CHECK-NEXT: s_addc_u32 s1, s1, 0
|
||||
; CHECK-NEXT: s_getpc_b64 s[4:5]
|
||||
; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
|
||||
; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
|
||||
; CHECK-NEXT: s_mov_b32 s32, 0
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; CHECK-NEXT: s_endpgm
|
||||
bb:
|
||||
tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
|
||||
|
|
|
@ -238,7 +238,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
|
|||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
|
||||
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5
|
||||
; MUBUF-NEXT: s_mov_b32 s32, s6
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
|
||||
|
@ -275,7 +275,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
|
|||
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
|
||||
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4
|
||||
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
|
||||
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
|
||||
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5
|
||||
; FLATSCR-NEXT: s_mov_b32 s32, s4
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
|
||||
|
@ -331,13 +331,13 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
|
|||
; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
|
||||
; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v4, s6
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v5, s6
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
|
||||
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4
|
||||
; MUBUF-NEXT: s_mov_b32 s32, s6
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
|
||||
|
@ -364,12 +364,12 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
|
|||
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
|
||||
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
|
||||
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v5, 0
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v6, 1
|
||||
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
|
||||
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
|
||||
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2
|
||||
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
|
||||
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
|
||||
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4
|
||||
; FLATSCR-NEXT: s_mov_b32 s32, s2
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
|
||||
|
|
|
@ -10,7 +10,7 @@ define void @child_function() #0 {
|
|||
; GCN: v_writelane_b32 v255, s33, 2
|
||||
; GCN: v_writelane_b32 v255, s30, 0
|
||||
; GCN: v_writelane_b32 v255, s31, 1
|
||||
; GCN: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN: v_readlane_b32 s30, v255, 0
|
||||
; GCN: v_readlane_b32 s31, v255, 1
|
||||
; GCN: v_readlane_b32 s33, v255, 2
|
||||
|
@ -56,7 +56,7 @@ define void @reserve_vgpr_with_no_lower_vgpr_available() #0 {
|
|||
; GCN: v_writelane_b32 v254, s33, 2
|
||||
; GCN: v_writelane_b32 v254, s30, 0
|
||||
; GCN: v_writelane_b32 v254, s31, 1
|
||||
; GCN: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN: v_readlane_b32 s30, v254, 0
|
||||
; GCN: v_readlane_b32 s31, v254, 1
|
||||
; GCN: v_readlane_b32 s33, v254, 2
|
||||
|
@ -150,7 +150,7 @@ ret:
|
|||
; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call
|
||||
; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32
|
||||
; GCN-NOT: v_writelane
|
||||
; GCN: s_setpc_b64 s[16:17]
|
||||
; GCN: s_setpc_b64 s[4:5]
|
||||
|
||||
define void @reserve_vgpr_with_tail_call() #0 {
|
||||
%alloca = alloca i32, align 4, addrspace(5)
|
||||
|
|
|
@ -128,12 +128,12 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
|
||||
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:8
|
||||
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
|
||||
|
||||
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
|
||||
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
|
||||
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
|
||||
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
|
||||
|
||||
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
|
||||
|
@ -155,9 +155,12 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
|
|||
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
|
||||
|
||||
; GCN-NOT: s32
|
||||
|
||||
; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
|
||||
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
|
||||
|
||||
; GCN-NOT: s32
|
||||
; GCN: s_setpc_b64
|
||||
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
||||
entry:
|
||||
|
@ -167,7 +170,7 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
|
||||
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
|
||||
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32
|
||||
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28
|
||||
; GCN: s_setpc_b64
|
||||
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
||||
entry:
|
||||
|
@ -194,14 +197,15 @@ entry:
|
|||
; Have another non-tail in the function
|
||||
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
|
||||
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec
|
||||
; GCN: s_mov_b32 s33, s32
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0x800
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0x400
|
||||
|
||||
; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-DAG: v_writelane_b32 v43, s46, 12
|
||||
; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-DAG: v_writelane_b32 v42, s34, 0
|
||||
; GCN-DAG: v_writelane_b32 v42, s35, 1
|
||||
|
||||
; GCN-DAG: s_getpc_b64 s[4:5]
|
||||
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
|
||||
|
@ -210,22 +214,22 @@ entry:
|
|||
|
||||
; GCN: s_swappc_b64
|
||||
|
||||
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
|
||||
; GCN: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
|
||||
; GCN: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
|
||||
|
||||
; GCN-DAG: v_readlane_b32 s35, v43, 1
|
||||
; GCN-DAG: v_readlane_b32 s34, v43, 0
|
||||
; GCN-DAG: v_readlane_b32 s34, v42, 0
|
||||
; GCN-DAG: v_readlane_b32 s35, v42, 1
|
||||
|
||||
; GCN: s_sub_u32 s32, s32, 0x800
|
||||
; GCN: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33,
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
|
||||
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[18:19]
|
||||
; GCN-NEXT: s_setpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
|
||||
entry:
|
||||
%other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
|
||||
|
@ -243,7 +247,7 @@ entry:
|
|||
; GCN-NOT: s33
|
||||
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
|
||||
; GCN: s_setpc_b64 s[16:17]
|
||||
; GCN: s_setpc_b64 s[4:5]
|
||||
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
|
||||
entry:
|
||||
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
||||
|
@ -255,10 +259,10 @@ entry:
|
|||
|
||||
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
|
||||
; GCN-NOT: s33
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
|
||||
|
||||
; GCN-NOT: s33
|
||||
; GCN: s_setpc_b64 s[16:17]
|
||||
; GCN: s_setpc_b64 s[4:5]
|
||||
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
|
||||
entry:
|
||||
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: define internal void @indirect() #0 {
|
||||
define internal void @indirect() {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: define amdgpu_kernel void @test_simple_indirect_call() #1 {
|
||||
define amdgpu_kernel void @test_simple_indirect_call() {
|
||||
%fptr = alloca void()*
|
||||
store void()* @indirect, void()** %fptr
|
||||
%fp = load void()*, void()** %fptr
|
||||
call void %fp()
|
||||
ret void
|
||||
}
|
||||
|
||||
; attributes #0 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
|
||||
; attributes #1 = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
|
|
@ -1,16 +1,16 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=7 < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}spill_csr_s5_copy:
|
||||
; GCN: s_or_saveexec_b64
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec
|
||||
; GCN: v_writelane_b32 v40, s33, 5
|
||||
; GCN: v_writelane_b32 v40, s33, 2
|
||||
; GCN: s_swappc_b64
|
||||
|
||||
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
|
||||
; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
|
||||
|
||||
; GCN: v_readlane_b32 s33, v40, 5
|
||||
; GCN: v_readlane_b32 s33, v40, 2
|
||||
; GCN: s_or_saveexec_b64
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GCN: s_mov_b64 exec
|
||||
|
|
|
@ -157,21 +157,19 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
|
|||
|
||||
; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
|
||||
; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
|
||||
; GCN-DAG: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
|
||||
; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
|
||||
; GCN: v_mov_b32_e32 v32, 0
|
||||
; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
|
||||
; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
|
||||
; GCN: s_mov_b32 s34, s32
|
||||
; GCN: v_mov_b32_e32 v32, 0
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
|
||||
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 offset:4
|
||||
; GCN-DAG: s_add_u32 s32, s32, 0x30000
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
|
||||
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
|
||||
; GCN: s_sub_u32 s32, s32, 0x30000
|
||||
; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2
|
||||
|
|
|
@ -5,61 +5,61 @@
|
|||
define hidden void @widget() {
|
||||
; GCN-LABEL: widget:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, vcc
|
||||
; GCN-NEXT: s_cbranch_vccz BB0_3
|
||||
; GCN-NEXT: ; %bb.1: ; %bb4
|
||||
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, vcc
|
||||
; GCN-NEXT: s_cbranch_vccnz BB0_4
|
||||
; GCN-NEXT: ; %bb.2: ; %bb7
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: s_branch BB0_7
|
||||
; GCN-NEXT: BB0_3: ; %bb2
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, vcc
|
||||
; GCN-NEXT: s_cbranch_vccnz BB0_6
|
||||
; GCN-NEXT: BB0_4: ; %bb9
|
||||
; GCN-NEXT: s_getpc_b64 s[16:17]
|
||||
; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GCN-NEXT: s_cbranch_execnz BB0_7
|
||||
; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: BB0_6: ; %bb12
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v2
|
||||
; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, vcc
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GCN-NEXT: s_cbranch_vccz BB0_3
|
||||
; GCN-NEXT: ; %bb.1: ; %bb4
|
||||
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, vcc
|
||||
; GCN-NEXT: s_cbranch_vccnz BB0_4
|
||||
; GCN-NEXT: ; %bb.2: ; %bb7
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: s_branch BB0_7
|
||||
; GCN-NEXT: BB0_3: ; %bb2
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, vcc
|
||||
; GCN-NEXT: s_cbranch_vccnz BB0_6
|
||||
; GCN-NEXT: BB0_4: ; %bb9
|
||||
; GCN-NEXT: s_getpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GCN-NEXT: s_cbranch_execnz BB0_7
|
||||
; GCN-NEXT: ; %bb.5: ; %bb9.bb12_crit_edge
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: BB0_6: ; %bb12
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v2
|
||||
; GCN-NEXT: BB0_7: ; %UnifiedReturnBlock
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
; SI-OPT-LABEL: @widget(
|
||||
; SI-OPT-NEXT: bb:
|
||||
; SI-OPT-NEXT: [[TMP:%.*]] = load i32, i32 addrspace(1)* null, align 16
|
||||
|
@ -186,124 +186,95 @@ define hidden void @blam() {
|
|||
; GCN-LABEL: blam:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
|
||||
; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[16:17]
|
||||
; GCN-NEXT: v_writelane_b32 v44, s33, 15
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v43, s33, 4
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: v_writelane_b32 v44, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v44, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v44, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v44, s38, 3
|
||||
; GCN-NEXT: v_writelane_b32 v44, s39, 4
|
||||
; GCN-NEXT: v_writelane_b32 v44, s40, 5
|
||||
; GCN-NEXT: v_writelane_b32 v44, s41, 6
|
||||
; GCN-NEXT: v_writelane_b32 v44, s42, 7
|
||||
; GCN-NEXT: v_writelane_b32 v44, s43, 8
|
||||
; GCN-NEXT: v_writelane_b32 v44, s44, 9
|
||||
; GCN-NEXT: v_writelane_b32 v44, s45, 10
|
||||
; GCN-NEXT: v_writelane_b32 v44, s46, 11
|
||||
; GCN-NEXT: v_writelane_b32 v44, s47, 12
|
||||
; GCN-NEXT: v_writelane_b32 v44, s48, 13
|
||||
; GCN-NEXT: v_writelane_b32 v44, s49, 14
|
||||
; GCN-NEXT: v_mov_b32_e32 v40, v31
|
||||
; GCN-NEXT: s_mov_b32 s34, s14
|
||||
; GCN-NEXT: s_mov_b32 s35, s13
|
||||
; GCN-NEXT: s_mov_b32 s36, s12
|
||||
; GCN-NEXT: s_mov_b64 s[38:39], s[10:11]
|
||||
; GCN-NEXT: s_mov_b64 s[40:41], s[8:9]
|
||||
; GCN-NEXT: s_mov_b64 s[42:43], s[6:7]
|
||||
; GCN-NEXT: s_mov_b64 s[44:45], s[4:5]
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: v_writelane_b32 v43, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v43, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v43, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v43, s37, 3
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40
|
||||
; GCN-NEXT: flat_load_dword v41, v[0:1]
|
||||
; GCN-NEXT: v_mov_b32_e32 v43, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[48:49]
|
||||
; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GCN-NEXT: flat_load_dword v40, v[1:2]
|
||||
; GCN-NEXT: v_mov_b32_e32 v42, 0
|
||||
; GCN-NEXT: s_getpc_b64 s[36:37]
|
||||
; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_cmp_eq_f32_e64 s[46:47], 0, v41
|
||||
; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v40
|
||||
; GCN-NEXT: s_branch BB1_3
|
||||
; GCN-NEXT: BB1_1: ; %bb10
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: BB1_1: ; %bb10
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: BB1_2: ; %bb18
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: BB1_2: ; %bb18
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: BB1_3: ; %bb2
|
||||
; GCN-NEXT: ; =>This Loop Header: Depth=1
|
||||
; GCN-NEXT: ; Child Loop BB1_4 Depth 2
|
||||
; GCN-NEXT: BB1_3: ; %bb2
|
||||
; GCN-NEXT: ; =>This Loop Header: Depth=1
|
||||
; GCN-NEXT: ; Child Loop BB1_4 Depth 2
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GCN-NEXT: BB1_4: ; %bb2
|
||||
; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
|
||||
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GCN-NEXT: flat_load_dword v0, v[42:43]
|
||||
; GCN-NEXT: BB1_4: ; %bb2
|
||||
; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
|
||||
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; GCN-NEXT: flat_load_dword v0, v[41:42]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
||||
; GCN-NEXT: s_cbranch_execz BB1_6
|
||||
; GCN-NEXT: %bb.5: ; %bb8
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
|
||||
; GCN-NEXT: ; %bb.5: ; %bb8
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
|
||||
; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
|
||||
; GCN-NEXT: s_cbranch_execnz BB1_4
|
||||
; GCN-NEXT: s_branch BB1_1
|
||||
; GCN-NEXT: BB1_6: ; %bb6
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
|
||||
; GCN-NEXT: BB1_6: ; %bb6
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
|
||||
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: s_cbranch_execnz BB1_4
|
||||
; GCN-NEXT: %bb.7: ; %bb11
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
|
||||
; GCN-NEXT: _or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[44:45]
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[42:43]
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[40:41]
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[38:39]
|
||||
; GCN-NEXT: s_mov_b32 s12, s36
|
||||
; GCN-NEXT: s_mov_b32 s13, s35
|
||||
; GCN-NEXT: s_mov_b32 s14, s34
|
||||
; GCN-NEXT: v_mov_b32_e32 v31, v40
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49]
|
||||
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
||||
; GCN-NEXT: s_cbranch_execnz BB1_4
|
||||
; GCN-NEXT: ; %bb.8: ; %bb14
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[46:47]
|
||||
; GCN-NEXT: s_cbranch_execnz BB1_10
|
||||
; GCN-NEXT: ; %bb.9: ; %bb16
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: BB1_10: ; %bb17
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0
|
||||
; GCN-NEXT: s_branch BB1_2
|
||||
|
||||
; GCN-NEXT: ; %bb.7: ; %bb11
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[36:37]
|
||||
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
||||
; GCN-NEXT: s_cbranch_execnz BB1_4
|
||||
; GCN-NEXT: ; %bb.8: ; %bb14
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[34:35]
|
||||
; GCN-NEXT: s_cbranch_execnz BB1_10
|
||||
; GCN-NEXT: ; %bb.9: ; %bb16
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: BB1_10: ; %bb17
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], 0
|
||||
; GCN-NEXT: s_branch BB1_2
|
||||
bb:
|
||||
%tmp = load float, float* null, align 16
|
||||
br label %bb2
|
||||
|
|
|
@ -21,14 +21,14 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
|
|||
; GFX9-NEXT: v_mov_b32_e32 v32, v12
|
||||
; GFX9: ;;#ASMSTART
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1
|
||||
; GFX9-NEXT: s_getpc_b64 s[16:17]
|
||||
; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
|
||||
; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
; GFX9-NEXT: v_writelane_b32 v44, s30, 0
|
||||
; GFX9: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
|
||||
; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
|
@ -53,14 +53,14 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
|
|||
; GFX10: ;;#ASMSTART
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
|
||||
; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[16:23], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
|
||||
; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_getpc_b64 s[16:17]
|
||||
; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
|
||||
; GFX10: s_load_dwordx2 s[16:17], s[16:17], 0x0
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
|
||||
; GFX10: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
; GFX10: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
|
||||
; GFX10: buffer_load_dword v43, off, s[0:3], s33
|
||||
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
|
||||
|
@ -100,14 +100,14 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
|
|||
; GFX9-NEXT: v_mov_b32_e32 v40, v12
|
||||
|
||||
; GFX9: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
|
||||
; GFX9-NEXT: s_getpc_b64 s[16:17]
|
||||
; GFX9-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
; GFX9: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
|
||||
|
||||
; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
|
@ -127,29 +127,22 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
|
|||
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
|
||||
; GFX10: s_getpc_b64 s[16:17]
|
||||
; GFX10-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_mov_b32 s37, s36
|
||||
; GFX10-NEXT: s_mov_b32 s38, s36
|
||||
; GFX10-NEXT: s_mov_b32 s39, s36
|
||||
; GFX10-NEXT: s_mov_b32 s40, s36
|
||||
; GFX10-NEXT: s_mov_b32 s41, s36
|
||||
; GFX10-NEXT: s_mov_b32 s42, s36
|
||||
; GFX10-NEXT: s_mov_b32 s43, s36
|
||||
; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
|
||||
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_writelane_b32 v45, s30, 8
|
||||
|
||||
; GFX10: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: v_mov_b32_e32 v40, v16
|
||||
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v41, v15
|
||||
; GFX10-NEXT: v_mov_b32_e32 v42, v14
|
||||
; GFX10-NEXT: v_mov_b32_e32 v43, v13
|
||||
; GFX10-NEXT: v_writelane_b32 v45, s31, 9
|
||||
; GFX10-NEXT: v_mov_b32_e32 v44, v12
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
|
||||
|
||||
; GFX10: buffer_load_dword v44, off, s[0:3], s33
|
||||
|
|
|
@ -20,18 +20,10 @@
|
|||
# FULL-NEXT: stackPtrOffsetReg: '$sgpr13'
|
||||
# FULL-NEXT: argumentInfo:
|
||||
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# FULL-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' }
|
||||
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' }
|
||||
# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
|
||||
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# FULL-NEXT: workItemIDX: { reg: '$vgpr0' }
|
||||
# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
||||
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# FULL-NEXT: workItemIDX: { reg: '$vgpr0' }
|
||||
# FULL-NEXT: mode:
|
||||
# FULL-NEXT: ieee: true
|
||||
# FULL-NEXT: dx10-clamp: true
|
||||
|
@ -55,18 +47,10 @@
|
|||
# SIMPLE-NEXT: stackPtrOffsetReg: '$sgpr13'
|
||||
# SIMPLE-NEXT: argumentInfo:
|
||||
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# SIMPLE-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' }
|
||||
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' }
|
||||
# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
|
||||
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' }
|
||||
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
||||
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' }
|
||||
# SIMPLE-NEXT: occupancy: 10
|
||||
# SIMPLE-NEXT: body:
|
||||
name: kernel0
|
||||
|
@ -112,16 +96,6 @@ body: |
|
|||
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
|
||||
# FULL-NEXT: argumentInfo:
|
||||
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
||||
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
||||
# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
||||
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# FULL-NEXT: mode:
|
||||
# FULL-NEXT: ieee: true
|
||||
# FULL-NEXT: dx10-clamp: true
|
||||
|
@ -137,16 +111,6 @@ body: |
|
|||
# SIMPLE-NEXT: maxKernArgAlign: 1
|
||||
# SIMPLE-NEXT: argumentInfo:
|
||||
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
||||
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
||||
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
||||
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# SIMPLE-NEXT: occupancy: 10
|
||||
# SIMPLE-NEXT: body:
|
||||
|
||||
|
@ -175,16 +139,6 @@ body: |
|
|||
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
|
||||
# FULL-NEXT: argumentInfo:
|
||||
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
||||
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
||||
# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
||||
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# FULL-NEXT: mode:
|
||||
# FULL-NEXT: ieee: true
|
||||
# FULL-NEXT: dx10-clamp: true
|
||||
|
@ -200,16 +154,6 @@ body: |
|
|||
# SIMPLE-NEXT: maxKernArgAlign: 1
|
||||
# SIMPLE-NEXT: argumentInfo:
|
||||
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
||||
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
||||
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
||||
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# SIMPLE-NEXT: occupancy: 10
|
||||
# SIMPLE-NEXT: body:
|
||||
|
||||
|
@ -239,16 +183,6 @@ body: |
|
|||
# FULL-NEXT: stackPtrOffsetReg: '$sp_reg'
|
||||
# FULL-NEXT: argumentInfo:
|
||||
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
||||
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
||||
# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
||||
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# FULL-NEXT: mode:
|
||||
# FULL-NEXT: ieee: true
|
||||
# FULL-NEXT: dx10-clamp: true
|
||||
|
@ -265,16 +199,6 @@ body: |
|
|||
# SIMPLE-NEXT: isEntryFunction: true
|
||||
# SIMPLE-NEXT: argumentInfo:
|
||||
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
||||
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
||||
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
||||
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# SIMPLE-NEXT: occupancy: 10
|
||||
# SIMPLE-NEXT: body:
|
||||
|
||||
|
@ -311,31 +235,13 @@ body: |
|
|||
|
||||
# FULL: argumentInfo:
|
||||
# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# FULL-NEXT: flatScratchInit: { offset: 4 }
|
||||
# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
||||
# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
||||
# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
|
||||
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
|
||||
|
||||
# SIMPLE: argumentInfo:
|
||||
# SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
||||
# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
||||
# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
||||
# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
||||
# SIMPLE-NEXT: flatScratchInit: { offset: 4 }
|
||||
# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
||||
# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
||||
# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
||||
# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
||||
# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
||||
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
|
||||
# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
||||
# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 }
|
||||
name: fake_stack_arginfo
|
||||
machineFunctionInfo:
|
||||
argumentInfo:
|
||||
|
|
Loading…
Reference in New Issue