forked from OSchip/llvm-project
[AMDGPU] Changes the AMDGPU_Gfx calling convention by making the SGPRs 4..29 callee-save. This is to avoid superfluous s_movs when executing amdgpu_gfx function calls as the callee is likely not going to change the argument values.
This patch changes the AMDGPU_Gfx calling convention. It defines the SGPR registers s[4:29] as callee-save and leaves some SGPRs usable for callers. The intention is to avoid unnecessary s_mov instructions for arguments the caller would otherwise save and restore in these registers. Reviewed By: sebastian-ne Differential Revision: https://reviews.llvm.org/D111637
This commit is contained in:
parent
d788c44f5c
commit
76cbe62262
|
@ -355,14 +355,23 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
|
|||
|
||||
auto const &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
|
||||
unsigned ReturnOpc =
|
||||
IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
|
||||
unsigned ReturnOpc = 0;
|
||||
if (IsShader)
|
||||
ReturnOpc = AMDGPU::SI_RETURN_TO_EPILOG;
|
||||
else if (CC == CallingConv::AMDGPU_Gfx)
|
||||
ReturnOpc = AMDGPU::S_SETPC_B64_return_gfx;
|
||||
else
|
||||
ReturnOpc = AMDGPU::S_SETPC_B64_return;
|
||||
|
||||
auto Ret = B.buildInstrNoInsert(ReturnOpc);
|
||||
Register ReturnAddrVReg;
|
||||
if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
|
||||
ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
|
||||
Ret.addUse(ReturnAddrVReg);
|
||||
} else if (ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
|
||||
ReturnAddrVReg =
|
||||
MRI.createVirtualRegister(&AMDGPU::Gfx_CCR_SGPR_64RegClass);
|
||||
Ret.addUse(ReturnAddrVReg);
|
||||
}
|
||||
|
||||
if (!FLI.CanLowerReturn)
|
||||
|
@ -370,7 +379,8 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
|
|||
else if (!lowerReturnVal(B, Val, VRegs, Ret))
|
||||
return false;
|
||||
|
||||
if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
|
||||
if (ReturnOpc == AMDGPU::S_SETPC_B64_return ||
|
||||
ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) {
|
||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
|
||||
&AMDGPU::SGPR_64RegClass);
|
||||
|
|
|
@ -20,11 +20,13 @@ def CC_SI_Gfx : CallingConv<[
|
|||
// 0-3 are reserved for the stack buffer descriptor
|
||||
// 30-31 are reserved for the return address
|
||||
// 32 is reserved for the stack pointer
|
||||
// 33 is reserved for the frame pointer
|
||||
// 34 is reserved for the base pointer
|
||||
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
SGPR4, SGPR5, SGPR6, SGPR7,
|
||||
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
|
||||
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
|
||||
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29,
|
||||
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29
|
||||
]>>>,
|
||||
|
||||
CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
|
@ -41,17 +43,6 @@ def RetCC_SI_Gfx : CallingConv<[
|
|||
CCIfType<[i1], CCPromoteToType<i32>>,
|
||||
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
|
||||
|
||||
// 0-3 are reserved for the stack buffer descriptor
|
||||
// 32 is reserved for the stack pointer
|
||||
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
SGPR4, SGPR5, SGPR6, SGPR7,
|
||||
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
|
||||
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
|
||||
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
|
||||
SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
|
||||
SGPR40, SGPR41, SGPR42, SGPR43
|
||||
]>>>,
|
||||
|
||||
CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
|
||||
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
|
||||
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
|
||||
|
@ -165,6 +156,14 @@ def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
|
|||
(sequence "SGPR%u", 32, 105)
|
||||
>;
|
||||
|
||||
def CSR_AMDGPU_SI_Gfx_SGPRs_4_29 : CalleeSavedRegs<
|
||||
(sequence "SGPR%u", 4, 29)
|
||||
>;
|
||||
|
||||
def CSR_AMDGPU_SI_Gfx_SGPRs_64_105 : CalleeSavedRegs<
|
||||
(sequence "SGPR%u", 64, 105)
|
||||
>;
|
||||
|
||||
// Just to get the regmask, not for calling convention purposes.
|
||||
def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs<
|
||||
(sequence "VGPR%u", 0, 255)
|
||||
|
@ -190,6 +189,14 @@ def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs<
|
|||
(add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255)
|
||||
>;
|
||||
|
||||
def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs<
|
||||
(add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SI_Gfx_SGPRs_64_105)
|
||||
>;
|
||||
|
||||
def CSR_AMDGPU_SI_Gfx_With_AGPRs : CalleeSavedRegs<
|
||||
(add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs_32_255)
|
||||
>;
|
||||
|
||||
def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
|
||||
|
||||
// Calling convention for leaf functions
|
||||
|
|
|
@ -4352,6 +4352,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
NODE_NAME_CASE(TC_RETURN)
|
||||
NODE_NAME_CASE(TRAP)
|
||||
NODE_NAME_CASE(RET_FLAG)
|
||||
NODE_NAME_CASE(RET_GFX_FLAG)
|
||||
NODE_NAME_CASE(RETURN_TO_EPILOG)
|
||||
NODE_NAME_CASE(ENDPGM)
|
||||
NODE_NAME_CASE(DWORDADDR)
|
||||
|
|
|
@ -343,7 +343,7 @@ namespace AMDGPUISD {
|
|||
enum NodeType : unsigned {
|
||||
// AMDIL ISD Opcodes
|
||||
FIRST_NUMBER = ISD::BUILTIN_OP_END,
|
||||
UMUL, // 32bit unsigned multiplication
|
||||
UMUL, // 32bit unsigned multiplication
|
||||
BRANCH_COND,
|
||||
// End AMDIL ISD Opcodes
|
||||
|
||||
|
@ -366,6 +366,9 @@ enum NodeType : unsigned {
|
|||
// Return with values from a non-entry function.
|
||||
RET_FLAG,
|
||||
|
||||
// Return with values from a non-entry function (AMDGPU_Gfx CC).
|
||||
RET_GFX_FLAG,
|
||||
|
||||
DWORDADDR,
|
||||
FRACT,
|
||||
|
||||
|
@ -422,10 +425,10 @@ enum NodeType : unsigned {
|
|||
DOT4,
|
||||
CARRY,
|
||||
BORROW,
|
||||
BFE_U32, // Extract range of bits with zero extension to 32-bits.
|
||||
BFE_I32, // Extract range of bits with sign extension to 32-bits.
|
||||
BFI, // (src0 & src1) | (~src0 & src2)
|
||||
BFM, // Insert a range of bits into a 32-bit word.
|
||||
BFE_U32, // Extract range of bits with zero extension to 32-bits.
|
||||
BFE_I32, // Extract range of bits with sign extension to 32-bits.
|
||||
BFI, // (src0 & src1) | (~src0 & src2)
|
||||
BFM, // Insert a range of bits into a 32-bit word.
|
||||
FFBH_U32, // ctlz with -1 if input is zero.
|
||||
FFBH_I32,
|
||||
FFBL_B32, // cttz with -1 if input is zero.
|
||||
|
@ -534,7 +537,6 @@ enum NodeType : unsigned {
|
|||
LAST_AMDGPU_ISD_NUMBER
|
||||
};
|
||||
|
||||
|
||||
} // End namespace AMDGPUISD
|
||||
|
||||
} // End namespace llvm
|
||||
|
|
|
@ -359,6 +359,10 @@ def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPt
|
|||
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
|
||||
>;
|
||||
|
||||
def AMDGPUret_gfx_flag : SDNode<"AMDGPUISD::RET_GFX_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
|
||||
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
|
||||
>;
|
||||
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Intrinsic/Custom node compatibility PatFrags
|
||||
|
|
|
@ -120,7 +120,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
|
|||
// FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
|
||||
// need to select it to the subtarget specific version, and there's no way to
|
||||
// do that with a single pseudo source operation.
|
||||
if (Opcode == AMDGPU::S_SETPC_B64_return)
|
||||
if (Opcode == AMDGPU::S_SETPC_B64_return ||
|
||||
Opcode == AMDGPU::S_SETPC_B64_return_gfx)
|
||||
Opcode = AMDGPU::S_SETPC_B64;
|
||||
else if (Opcode == AMDGPU::SI_CALL) {
|
||||
// SI_CALL is just S_SWAPPC_B64 with an additional operand to track the
|
||||
|
|
|
@ -2616,9 +2616,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
|||
SDValue ReturnAddrReg = CreateLiveInRegister(
|
||||
DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
|
||||
|
||||
SDValue ReturnAddrVirtualReg = DAG.getRegister(
|
||||
MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
|
||||
MVT::i64);
|
||||
SDValue ReturnAddrVirtualReg =
|
||||
DAG.getRegister(MF.getRegInfo().createVirtualRegister(
|
||||
CallConv != CallingConv::AMDGPU_Gfx
|
||||
? &AMDGPU::CCR_SGPR_64RegClass
|
||||
: &AMDGPU::Gfx_CCR_SGPR_64RegClass),
|
||||
MVT::i64);
|
||||
Chain =
|
||||
DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
|
||||
Flag = Chain.getValue(1);
|
||||
|
@ -2681,8 +2684,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
|||
RetOps.push_back(Flag);
|
||||
|
||||
unsigned Opc = AMDGPUISD::ENDPGM;
|
||||
if (!IsWaveEnd)
|
||||
Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
|
||||
if (!IsWaveEnd) {
|
||||
if (IsShader)
|
||||
Opc = AMDGPUISD::RETURN_TO_EPILOG;
|
||||
else if (CallConv == CallingConv::AMDGPU_Gfx)
|
||||
Opc = AMDGPUISD::RET_GFX_FLAG;
|
||||
else
|
||||
Opc = AMDGPUISD::RET_FLAG;
|
||||
}
|
||||
|
||||
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
|
||||
}
|
||||
|
||||
|
|
|
@ -963,6 +963,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
|
|||
// with knowledge of the called routines.
|
||||
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
|
||||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
|
||||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx ||
|
||||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
|
||||
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
|
||||
}
|
||||
|
|
|
@ -360,10 +360,13 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
|
|||
case CallingConv::C:
|
||||
case CallingConv::Fast:
|
||||
case CallingConv::Cold:
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
|
||||
? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
|
||||
: CSR_AMDGPU_HighRegs_SaveList;
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
|
||||
? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList
|
||||
: CSR_AMDGPU_SI_Gfx_SaveList;
|
||||
default: {
|
||||
// Dummy to not crash RegisterClassInfo.
|
||||
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
|
||||
|
@ -383,10 +386,13 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
|
|||
case CallingConv::C:
|
||||
case CallingConv::Fast:
|
||||
case CallingConv::Cold:
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
|
||||
? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
|
||||
: CSR_AMDGPU_HighRegs_RegMask;
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
|
||||
? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask
|
||||
: CSR_AMDGPU_SI_Gfx_RegMask;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
@ -691,6 +691,14 @@ def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
|
|||
let AllocationPriority = SGPR_64.AllocationPriority;
|
||||
}
|
||||
|
||||
// Call-clobbered 64-bit SGPR pairs for the AMDGPU_Gfx calling convention.
|
||||
def Gfx_CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
|
||||
(add (trunc (shl SGPR_64, 15), 1), // s[30:31]
|
||||
(trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[62:63]
|
||||
let CopyCost = SGPR_64.CopyCost;
|
||||
let AllocationPriority = SGPR_64.AllocationPriority;
|
||||
}
|
||||
|
||||
def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
|
||||
(add TTMP_64Regs)> {
|
||||
let isAllocatable = 0;
|
||||
|
|
|
@ -265,6 +265,7 @@ def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
|
|||
let isReturn = 1 in {
|
||||
// Define variant marked as return rather than branch.
|
||||
def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>;
|
||||
def S_SETPC_B64_return_gfx : SOP1_1<"", Gfx_CCR_SGPR_64, [(AMDGPUret_gfx_flag i64:$src0)]>;
|
||||
}
|
||||
} // End isTerminator = 1, isBarrier = 1
|
||||
|
||||
|
|
|
@ -18,10 +18,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
|
|||
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY2]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_void()
|
||||
ret void
|
||||
}
|
||||
|
@ -39,10 +39,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
|
|||
; CHECK-NEXT: $vgpr0 = COPY [[C]](s32)
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]]
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY3]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
|
||||
ret void
|
||||
}
|
||||
|
@ -60,10 +60,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
|
|||
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]]
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY3]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
|
||||
ret void
|
||||
}
|
||||
|
@ -88,10 +88,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
|
|||
; CHECK-NEXT: $vgpr1 = COPY [[LOAD2]](s32)
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY2]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]]
|
||||
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
|
||||
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
|
||||
call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
|
||||
|
@ -118,10 +118,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
|
|||
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY2]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]]
|
||||
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
|
||||
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
|
||||
call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)
|
||||
|
|
|
@ -141,12 +141,12 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_i32_imm(i32 addrspace(1)
|
|||
; GCN-NEXT: $vgpr0 = COPY [[C]](s32)
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
|
||||
; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
|
||||
; GCN-NEXT: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
|
||||
; GCN-NEXT: S_SETPC_B64_return [[COPY5]]
|
||||
; GCN-NEXT: [[COPY5:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY2]]
|
||||
; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY5]]
|
||||
%val = call amdgpu_gfx i32 @external_gfx_i32_func_i32(i32 42)
|
||||
store volatile i32 %val, i32 addrspace(1)* %out
|
||||
ret void
|
||||
|
@ -219,13 +219,13 @@ define amdgpu_gfx void @test_gfx_call_external_i1_func_void() #0 {
|
|||
; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i1_func_void
|
||||
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
|
||||
; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
|
||||
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `i1 addrspace(1)* undef`, addrspace 1)
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN-NEXT: S_SETPC_B64_return [[COPY3]]
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY3]]
|
||||
%val = call amdgpu_gfx i1 @external_gfx_i1_func_void()
|
||||
store volatile i1 %val, i1 addrspace(1)* undef
|
||||
ret void
|
||||
|
@ -415,14 +415,14 @@ define amdgpu_gfx void @test_gfx_call_external_i8_func_void() #0 {
|
|||
; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i8_func_void
|
||||
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
|
||||
; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
|
||||
; GCN-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16)
|
||||
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `i8 addrspace(1)* undef`, addrspace 1)
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN-NEXT: S_SETPC_B64_return [[COPY3]]
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY3]]
|
||||
%val = call amdgpu_gfx i8 @external_gfx_i8_func_void()
|
||||
store volatile i8 %val, i8 addrspace(1)* undef
|
||||
ret void
|
||||
|
@ -784,12 +784,12 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_void() #0 {
|
|||
; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_void
|
||||
; GCN-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
|
||||
; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: G_STORE [[COPY2]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN-NEXT: S_SETPC_B64_return [[COPY3]]
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY3]]
|
||||
%val = call amdgpu_gfx i32 @external_gfx_i32_func_void()
|
||||
store volatile i32 %val, i32 addrspace(1)* undef
|
||||
ret void
|
||||
|
@ -2480,7 +2480,7 @@ define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 {
|
|||
; GCN-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_i32_i64_func_void
|
||||
; GCN-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2
|
||||
; GCN-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
|
||||
; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1
|
||||
; GCN-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr2
|
||||
|
@ -2488,8 +2488,8 @@ define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 {
|
|||
; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; GCN-NEXT: G_STORE [[COPY3]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
|
||||
; GCN-NEXT: G_STORE [[MV]](s64), [[COPY1]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
|
||||
; GCN-NEXT: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN-NEXT: S_SETPC_B64_return [[COPY6]]
|
||||
; GCN-NEXT: [[COPY6:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY6]]
|
||||
%val = call amdgpu_gfx { i32, i64 } @external_gfx_i32_i64_func_void()
|
||||
%val.0 = extractvalue { i32, i64 } %val, 0
|
||||
%val.1 = extractvalue { i32, i64 } %val, 1
|
||||
|
|
|
@ -156,10 +156,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 {
|
|||
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY2]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_void()
|
||||
ret void
|
||||
}
|
||||
|
@ -899,10 +899,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 {
|
|||
; CHECK-NEXT: $vgpr0 = COPY [[C]](s32)
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]]
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY3]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_i32(i32 42)
|
||||
ret void
|
||||
}
|
||||
|
@ -920,10 +920,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
|
|||
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY3]]
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY3]]
|
||||
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
|
||||
ret void
|
||||
}
|
||||
|
@ -3893,10 +3893,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 {
|
|||
; CHECK-NEXT: $vgpr1 = COPY [[LOAD2]](s32)
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY2]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]]
|
||||
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
|
||||
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
|
||||
call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val)
|
||||
|
@ -3923,10 +3923,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
|
|||
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY2]]
|
||||
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]]
|
||||
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
|
||||
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
|
||||
call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val)
|
||||
|
|
|
@ -66,10 +66,10 @@ define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(void()* %fptr) {
|
|||
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
|
||||
; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return [[COPY4]]
|
||||
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY2]]
|
||||
; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY4]]
|
||||
call amdgpu_gfx void %fptr()
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -153,14 +153,14 @@ attributes #0 = { nounwind }
|
|||
; GCN-NEXT: .shader_functions:
|
||||
; GCN-NEXT: dynamic_stack:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
|
||||
; SDAG-NEXT: .vgpr_count: 0x2{{$}}
|
||||
; GISEL-NEXT: .vgpr_count: 0x3{{$}}
|
||||
; GCN-NEXT: dynamic_stack_loop:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; SDAG-NEXT: .sgpr_count: 0x22{{$}}
|
||||
; GISEL-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; SDAG-NEXT: .sgpr_count: 0x25{{$}}
|
||||
; GISEL-NEXT: .sgpr_count: 0x26{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
|
||||
; SDAG-NEXT: .vgpr_count: 0x3{{$}}
|
||||
; GISEL-NEXT: .vgpr_count: 0x4{{$}}
|
||||
|
@ -176,26 +176,26 @@ attributes #0 = { nounwind }
|
|||
; GCN-NEXT: .vgpr_count: 0x1{{$}}
|
||||
; GCN-NEXT: no_stack_call:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x21{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x26{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x2{{$}}
|
||||
; GCN-NEXT: no_stack_extern_call:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x29{{$}}
|
||||
; GCN-NEXT: no_stack_extern_call_many_args:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
|
||||
; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
|
||||
; GISEL-NEXT: .vgpr_count: 0x34{{$}}
|
||||
; GCN-NEXT: no_stack_indirect_call:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
|
||||
; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
|
||||
; GISEL-NEXT: .vgpr_count: 0x34{{$}}
|
||||
|
@ -206,7 +206,7 @@ attributes #0 = { nounwind }
|
|||
; GCN-NEXT: .vgpr_count: 0x1{{$}}
|
||||
; GCN-NEXT: simple_lds_recurse:
|
||||
; GCN-NEXT: .lds_size: 0x100{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x26{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x29{{$}}
|
||||
; GCN-NEXT: simple_stack:
|
||||
|
@ -216,25 +216,25 @@ attributes #0 = { nounwind }
|
|||
; GCN-NEXT: .vgpr_count: 0x2{{$}}
|
||||
; GCN-NEXT: simple_stack_call:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x22{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x26{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x3{{$}}
|
||||
; GCN-NEXT: simple_stack_extern_call:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x2a{{$}}
|
||||
; GCN-NEXT: simple_stack_indirect_call:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
|
||||
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
|
||||
; SDAG-NEXT: .vgpr_count: 0x2b{{$}}
|
||||
; GISEL-NEXT: .vgpr_count: 0x34{{$}}
|
||||
; GCN-NEXT: simple_stack_recurse:
|
||||
; GCN-NEXT: .lds_size: 0{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x24{{$}}
|
||||
; GCN-NEXT: .sgpr_count: 0x26{{$}}
|
||||
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
|
||||
; GCN-NEXT: .vgpr_count: 0x2a{{$}}
|
||||
; GCN-NEXT: ...
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -8,69 +8,69 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
|
|||
; GFX9-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 4
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 3
|
||||
; GFX9-NEXT: s_getpc_b64 s[34:35]
|
||||
; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 3
|
||||
; GFX9-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 3
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 4
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 4
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GFX10-NEXT: s_getpc_b64 s[34:35]
|
||||
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 3
|
||||
; GFX10-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 3
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 4
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "", ""() #0
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
|
@ -81,21 +81,21 @@ define amdgpu_gfx void @void_func_void_clobber_s30_s31() #1 {
|
|||
; GFX9-LABEL: void_func_void_clobber_s30_s31:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; clobber
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[36:37]
|
||||
;
|
||||
; GFX10-LABEL: void_func_void_clobber_s30_s31:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; clobber
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[36:37]
|
||||
call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
|
||||
ret void
|
||||
}
|
||||
|
@ -104,75 +104,75 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)
|
|||
; GFX9-LABEL: test_call_void_func_void_mayclobber_s31:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s31
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_mov_b32 s34, s31
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX9-NEXT: s_mov_b32 s31, s34
|
||||
; GFX9-NEXT: s_mov_b32 s4, s31
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: s_mov_b32 s31, s4
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s31
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_mayclobber_s31:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX10-NEXT: s_getpc_b64 s[34:35]
|
||||
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s31
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: s_mov_b32 s34, s31
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX10-NEXT: s_mov_b32 s31, s34
|
||||
; GFX10-NEXT: s_mov_b32 s4, s31
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX10-NEXT: s_mov_b32 s31, s4
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s31
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%s31 = call i32 asm sideeffect "; def $0", "={s31}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
|
||||
|
@ -183,9 +183,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
|
|||
; GFX9-LABEL: test_call_void_func_void_mayclobber_v31:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
|
@ -196,33 +196,33 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
|
|||
; GFX9-NEXT: ; def v31
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_mov_b32_e32 v41, v31
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v31, v41
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use v31
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_mayclobber_v31:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
|
@ -232,26 +232,26 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)
|
|||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v41, v31
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: v_mov_b32_e32 v31, v41
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use v31
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
|
||||
|
@ -263,67 +263,75 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(i32 addrspace(1)*
|
|||
; GFX9-LABEL: test_call_void_func_void_preserves_s33:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s33
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: s_mov_b32 s4, s33
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: s_mov_b32 s33, s4
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s33
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_preserves_s33:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s33
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX10-NEXT: s_mov_b32 s4, s33
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: s_mov_b32 s33, s4
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s33
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%s33 = call i32 asm sideeffect "; def $0", "={s33}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
|
||||
|
@ -334,11 +342,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
|
|||
; GFX9-LABEL: test_call_void_func_void_preserves_s34:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
|
@ -346,59 +354,63 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(i32 addrspace(1)*
|
|||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s34
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX9-NEXT: s_mov_b32 s4, s34
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX9-NEXT: s_mov_b32 s34, s4
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s34
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_preserves_s34:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s34
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX10-NEXT: s_mov_b32 s4, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX10-NEXT: s_mov_b32 s34, s4
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s34
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%s34 = call i32 asm sideeffect "; def $0", "={s34}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{s34}"(i32 %s34)
|
||||
|
@ -409,9 +421,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
|
|||
; GFX9-LABEL: test_call_void_func_void_preserves_v40:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v41, s33, 2
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
|
@ -421,32 +433,32 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
|
|||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def v40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use v40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v41, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v41, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v41, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v41, 1
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v41, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_preserves_v40:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
|
@ -455,25 +467,25 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(i32 addrspace(1)*
|
|||
; GFX10-NEXT: ; def v40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s30, 0
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v41, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use v40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v41, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v41, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v41, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v41, 1
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v41, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%v40 = call i32 asm sideeffect "; def $0", "={v40}"()
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
|
||||
|
@ -568,55 +580,55 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
|
|||
; GFX9-LABEL: test_call_void_func_void_clobber_s33:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, void_func_void_clobber_s33@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, void_func_void_clobber_s33@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_clobber_s33:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, void_func_void_clobber_s33@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, void_func_void_clobber_s33@rel32@hi+12
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
call amdgpu_gfx void @void_func_void_clobber_s33()
|
||||
ret void
|
||||
}
|
||||
|
@ -625,55 +637,55 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
|
|||
; GFX9-LABEL: test_call_void_func_void_clobber_s34:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, void_func_void_clobber_s34@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, void_func_void_clobber_s34@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: test_call_void_func_void_clobber_s34:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, void_func_void_clobber_s34@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, void_func_void_clobber_s34@rel32@hi+12
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
call amdgpu_gfx void @void_func_void_clobber_s34()
|
||||
ret void
|
||||
}
|
||||
|
@ -682,11 +694,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
|
|||
; GFX9-LABEL: callee_saved_sgpr_kernel:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s40, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
|
@ -694,59 +706,61 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
|
|||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX9-NEXT: s_mov_b32 s4, s40
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s40
|
||||
; GFX9-NEXT: ; use s4
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s40, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: callee_saved_sgpr_kernel:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s40, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX10-NEXT: s_mov_b32 s4, s40
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s40
|
||||
; GFX10-NEXT: ; use s4
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s40, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
|
||||
|
@ -757,11 +771,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
|
|||
; GFX9-LABEL: callee_saved_sgpr_vgpr_kernel:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s40, 0
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
|
||||
|
@ -770,76 +784,78 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
|
|||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def s40
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: s_mov_b32 s4, s40
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; def v32
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: v_mov_b32_e32 v41, v32
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use s40
|
||||
; GFX9-NEXT: ; use s4
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: ;;#ASMSTART
|
||||
; GFX9-NEXT: ; use v41
|
||||
; GFX9-NEXT: ;;#ASMEND
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s40, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: callee_saved_sgpr_vgpr_kernel:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 3
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0x200
|
||||
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s40, 0
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def s40
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
|
||||
; GFX10-NEXT: s_mov_b32 s4, s40
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; def v32
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: v_mov_b32_e32 v41, v32
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use s40
|
||||
; GFX10-NEXT: ; use s4
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: ;;#ASMSTART
|
||||
; GFX10-NEXT: ; use v41
|
||||
; GFX10-NEXT: ;;#ASMEND
|
||||
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s40, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
|
||||
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 3
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s34
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
|
||||
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
|
||||
call amdgpu_gfx void @external_void_func_void()
|
||||
|
|
|
@ -23,27 +23,27 @@ define amdgpu_gfx void @call_i1() #0 {
|
|||
; GFX9-LABEL: call_i1:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, return_i1@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, return_i1@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, return_i1@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, return_i1@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: s_setpc_b64 s[36:37]
|
||||
;
|
||||
; GFX10-LABEL: call_i1:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX10-NEXT: s_add_u32 s6, s6, return_i1@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s7, s7, return_i1@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, return_i1@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, return_i1@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: s_setpc_b64 s[36:37]
|
||||
entry:
|
||||
call amdgpu_gfx i1 @return_i1()
|
||||
ret void
|
||||
|
@ -70,27 +70,27 @@ define amdgpu_gfx void @call_i16() #0 {
|
|||
; GFX9-LABEL: call_i16:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, return_i16@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, return_i16@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, return_i16@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, return_i16@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: s_setpc_b64 s[36:37]
|
||||
;
|
||||
; GFX10-LABEL: call_i16:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX10-NEXT: s_add_u32 s6, s6, return_i16@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s7, s7, return_i16@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, return_i16@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, return_i16@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: s_setpc_b64 s[36:37]
|
||||
entry:
|
||||
call amdgpu_gfx i16 @return_i16()
|
||||
ret void
|
||||
|
@ -117,27 +117,27 @@ define amdgpu_gfx void @call_2xi16() #0 {
|
|||
; GFX9-LABEL: call_2xi16:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, return_2xi16@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, return_2xi16@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, return_2xi16@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, return_2xi16@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: s_setpc_b64 s[36:37]
|
||||
;
|
||||
; GFX10-LABEL: call_2xi16:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX10-NEXT: s_add_u32 s6, s6, return_2xi16@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s7, s7, return_2xi16@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, return_2xi16@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, return_2xi16@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: s_setpc_b64 s[36:37]
|
||||
entry:
|
||||
call amdgpu_gfx <2 x i16> @return_2xi16()
|
||||
ret void
|
||||
|
@ -166,27 +166,27 @@ define amdgpu_gfx void @call_3xi16() #0 {
|
|||
; GFX9-LABEL: call_3xi16:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, return_3xi16@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, return_3xi16@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, return_3xi16@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, return_3xi16@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: s_setpc_b64 s[36:37]
|
||||
;
|
||||
; GFX10-LABEL: call_3xi16:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX10-NEXT: s_add_u32 s6, s6, return_3xi16@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s7, s7, return_3xi16@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, return_3xi16@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, return_3xi16@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: s_setpc_b64 s[36:37]
|
||||
entry:
|
||||
call amdgpu_gfx <3 x i16> @return_3xi16()
|
||||
ret void
|
||||
|
@ -1241,41 +1241,41 @@ define amdgpu_gfx void @call_512xi32() #0 {
|
|||
; GFX9-LABEL: call_512xi32:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s8, s33
|
||||
; GFX9-NEXT: s_mov_b32 s34, s33
|
||||
; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0
|
||||
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000
|
||||
; GFX9-NEXT: s_add_i32 s32, s32, 0x60000
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, return_512xi32@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, return_512xi32@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX9-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-NEXT: s_add_u32 s30, s30, return_512xi32@gotpcrel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s31, s31, return_512xi32@gotpcrel32@hi+12
|
||||
; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000
|
||||
; GFX9-NEXT: s_mov_b32 s33, s8
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_mov_b32 s33, s34
|
||||
; GFX9-NEXT: s_setpc_b64 s[36:37]
|
||||
;
|
||||
; GFX10-LABEL: call_512xi32:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_mov_b32 s8, s33
|
||||
; GFX10-NEXT: s_mov_b32 s34, s33
|
||||
; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000
|
||||
; GFX10-NEXT: s_add_i32 s32, s32, 0x30000
|
||||
; GFX10-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX10-NEXT: s_add_u32 s6, s6, return_512xi32@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s7, s7, return_512xi32@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX10-NEXT: s_add_u32 s30, s30, return_512xi32@gotpcrel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s31, s31, return_512xi32@gotpcrel32@hi+12
|
||||
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000
|
||||
; GFX10-NEXT: s_mov_b32 s33, s8
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_mov_b32 s33, s34
|
||||
; GFX10-NEXT: s_setpc_b64 s[36:37]
|
||||
entry:
|
||||
call amdgpu_gfx <512 x i32> @return_512xi32()
|
||||
ret void
|
||||
|
|
|
@ -1074,41 +1074,90 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
|
|||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 6
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 30
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_addk_i32 s32, 0x400
|
||||
; GCN-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v40, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s37, 3
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 4
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 5
|
||||
; GCN-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s7, v1
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GCN-NEXT: v_writelane_b32 v40, s38, 4
|
||||
; GCN-NEXT: v_writelane_b32 v40, s39, 5
|
||||
; GCN-NEXT: v_writelane_b32 v40, s40, 6
|
||||
; GCN-NEXT: v_writelane_b32 v40, s41, 7
|
||||
; GCN-NEXT: v_writelane_b32 v40, s42, 8
|
||||
; GCN-NEXT: v_writelane_b32 v40, s43, 9
|
||||
; GCN-NEXT: v_writelane_b32 v40, s44, 10
|
||||
; GCN-NEXT: v_writelane_b32 v40, s45, 11
|
||||
; GCN-NEXT: v_writelane_b32 v40, s46, 12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s47, 13
|
||||
; GCN-NEXT: v_writelane_b32 v40, s48, 14
|
||||
; GCN-NEXT: v_writelane_b32 v40, s49, 15
|
||||
; GCN-NEXT: v_writelane_b32 v40, s50, 16
|
||||
; GCN-NEXT: v_writelane_b32 v40, s51, 17
|
||||
; GCN-NEXT: v_writelane_b32 v40, s52, 18
|
||||
; GCN-NEXT: v_writelane_b32 v40, s53, 19
|
||||
; GCN-NEXT: v_writelane_b32 v40, s54, 20
|
||||
; GCN-NEXT: v_writelane_b32 v40, s55, 21
|
||||
; GCN-NEXT: v_writelane_b32 v40, s56, 22
|
||||
; GCN-NEXT: v_writelane_b32 v40, s57, 23
|
||||
; GCN-NEXT: v_writelane_b32 v40, s58, 24
|
||||
; GCN-NEXT: v_writelane_b32 v40, s59, 25
|
||||
; GCN-NEXT: v_writelane_b32 v40, s60, 26
|
||||
; GCN-NEXT: v_writelane_b32 v40, s61, 27
|
||||
; GCN-NEXT: v_writelane_b32 v40, s62, 28
|
||||
; GCN-NEXT: v_writelane_b32 v40, s63, 29
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], s[30:31]
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], exec
|
||||
; GCN-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s12, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s13, v1
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[12:13], v[0:1]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[12:13]
|
||||
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
|
||||
; GCN-NEXT: s_cbranch_execnz BB6_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 5
|
||||
; GCN-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GCN-NEXT: v_readlane_b32 s63, v40, 29
|
||||
; GCN-NEXT: v_readlane_b32 s62, v40, 28
|
||||
; GCN-NEXT: v_readlane_b32 s61, v40, 27
|
||||
; GCN-NEXT: v_readlane_b32 s60, v40, 26
|
||||
; GCN-NEXT: v_readlane_b32 s59, v40, 25
|
||||
; GCN-NEXT: v_readlane_b32 s58, v40, 24
|
||||
; GCN-NEXT: v_readlane_b32 s57, v40, 23
|
||||
; GCN-NEXT: v_readlane_b32 s56, v40, 22
|
||||
; GCN-NEXT: v_readlane_b32 s55, v40, 21
|
||||
; GCN-NEXT: v_readlane_b32 s54, v40, 20
|
||||
; GCN-NEXT: v_readlane_b32 s53, v40, 19
|
||||
; GCN-NEXT: v_readlane_b32 s52, v40, 18
|
||||
; GCN-NEXT: v_readlane_b32 s51, v40, 17
|
||||
; GCN-NEXT: v_readlane_b32 s50, v40, 16
|
||||
; GCN-NEXT: v_readlane_b32 s49, v40, 15
|
||||
; GCN-NEXT: v_readlane_b32 s48, v40, 14
|
||||
; GCN-NEXT: v_readlane_b32 s47, v40, 13
|
||||
; GCN-NEXT: v_readlane_b32 s46, v40, 12
|
||||
; GCN-NEXT: v_readlane_b32 s45, v40, 11
|
||||
; GCN-NEXT: v_readlane_b32 s44, v40, 10
|
||||
; GCN-NEXT: v_readlane_b32 s43, v40, 9
|
||||
; GCN-NEXT: v_readlane_b32 s42, v40, 8
|
||||
; GCN-NEXT: v_readlane_b32 s41, v40, 7
|
||||
; GCN-NEXT: v_readlane_b32 s40, v40, 6
|
||||
; GCN-NEXT: v_readlane_b32 s39, v40, 5
|
||||
; GCN-NEXT: v_readlane_b32 s38, v40, 4
|
||||
; GCN-NEXT: v_readlane_b32 s37, v40, 3
|
||||
; GCN-NEXT: v_readlane_b32 s36, v40, 2
|
||||
; GCN-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GCN-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GCN-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 6
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 30
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[4:5]
|
||||
; GCN-NEXT: s_setpc_b64 s[6:7]
|
||||
;
|
||||
; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
|
||||
; GISEL: ; %bb.0:
|
||||
|
@ -1116,41 +1165,90 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(void(i32)* %fptr) {
|
|||
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s33, 6
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s33, 30
|
||||
; GISEL-NEXT: s_mov_b32 s33, s32
|
||||
; GISEL-NEXT: s_addk_i32 s32, 0x400
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s36, 2
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s37, 3
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s30, 4
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s31, 5
|
||||
; GISEL-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GISEL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s7, v1
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s38, 4
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s39, 5
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s40, 6
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s41, 7
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s42, 8
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s43, 9
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s44, 10
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s45, 11
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s46, 12
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s47, 13
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s48, 14
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s49, 15
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s50, 16
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s51, 17
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s52, 18
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s53, 19
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s54, 20
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s55, 21
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s56, 22
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s57, 23
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s58, 24
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s59, 25
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s60, 26
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s61, 27
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s62, 28
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s63, 29
|
||||
; GISEL-NEXT: s_mov_b64 s[6:7], s[30:31]
|
||||
; GISEL-NEXT: s_movk_i32 s4, 0x7b
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; GISEL-NEXT: s_mov_b64 s[8:9], exec
|
||||
; GISEL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s10, v0
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s11, v1
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[10:11]
|
||||
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[12:13]
|
||||
; GISEL-NEXT: s_cbranch_execnz BB6_1
|
||||
; GISEL-NEXT: ; %bb.2:
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GISEL-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; GISEL-NEXT: v_readlane_b32 s5, v40, 5
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GISEL-NEXT: v_readlane_b32 s63, v40, 29
|
||||
; GISEL-NEXT: v_readlane_b32 s62, v40, 28
|
||||
; GISEL-NEXT: v_readlane_b32 s61, v40, 27
|
||||
; GISEL-NEXT: v_readlane_b32 s60, v40, 26
|
||||
; GISEL-NEXT: v_readlane_b32 s59, v40, 25
|
||||
; GISEL-NEXT: v_readlane_b32 s58, v40, 24
|
||||
; GISEL-NEXT: v_readlane_b32 s57, v40, 23
|
||||
; GISEL-NEXT: v_readlane_b32 s56, v40, 22
|
||||
; GISEL-NEXT: v_readlane_b32 s55, v40, 21
|
||||
; GISEL-NEXT: v_readlane_b32 s54, v40, 20
|
||||
; GISEL-NEXT: v_readlane_b32 s53, v40, 19
|
||||
; GISEL-NEXT: v_readlane_b32 s52, v40, 18
|
||||
; GISEL-NEXT: v_readlane_b32 s51, v40, 17
|
||||
; GISEL-NEXT: v_readlane_b32 s50, v40, 16
|
||||
; GISEL-NEXT: v_readlane_b32 s49, v40, 15
|
||||
; GISEL-NEXT: v_readlane_b32 s48, v40, 14
|
||||
; GISEL-NEXT: v_readlane_b32 s47, v40, 13
|
||||
; GISEL-NEXT: v_readlane_b32 s46, v40, 12
|
||||
; GISEL-NEXT: v_readlane_b32 s45, v40, 11
|
||||
; GISEL-NEXT: v_readlane_b32 s44, v40, 10
|
||||
; GISEL-NEXT: v_readlane_b32 s43, v40, 9
|
||||
; GISEL-NEXT: v_readlane_b32 s42, v40, 8
|
||||
; GISEL-NEXT: v_readlane_b32 s41, v40, 7
|
||||
; GISEL-NEXT: v_readlane_b32 s40, v40, 6
|
||||
; GISEL-NEXT: v_readlane_b32 s39, v40, 5
|
||||
; GISEL-NEXT: v_readlane_b32 s38, v40, 4
|
||||
; GISEL-NEXT: v_readlane_b32 s37, v40, 3
|
||||
; GISEL-NEXT: v_readlane_b32 s36, v40, 2
|
||||
; GISEL-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GISEL-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GISEL-NEXT: v_readlane_b32 s33, v40, 6
|
||||
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GISEL-NEXT: v_readlane_b32 s33, v40, 30
|
||||
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-NEXT: s_setpc_b64 s[4:5]
|
||||
; GISEL-NEXT: s_setpc_b64 s[6:7]
|
||||
call amdgpu_gfx void %fptr(i32 inreg 123)
|
||||
ret void
|
||||
}
|
||||
|
@ -1162,7 +1260,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
|
|||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 6
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 30
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_addk_i32 s32, 0x400
|
||||
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
|
@ -1170,32 +1268,81 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
|
|||
; GCN-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v40, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s37, 3
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 4
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 5
|
||||
; GCN-NEXT: v_writelane_b32 v40, s38, 4
|
||||
; GCN-NEXT: v_writelane_b32 v40, s39, 5
|
||||
; GCN-NEXT: v_writelane_b32 v40, s40, 6
|
||||
; GCN-NEXT: v_writelane_b32 v40, s41, 7
|
||||
; GCN-NEXT: v_writelane_b32 v40, s42, 8
|
||||
; GCN-NEXT: v_writelane_b32 v40, s43, 9
|
||||
; GCN-NEXT: v_writelane_b32 v40, s44, 10
|
||||
; GCN-NEXT: v_writelane_b32 v40, s45, 11
|
||||
; GCN-NEXT: v_writelane_b32 v40, s46, 12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s47, 13
|
||||
; GCN-NEXT: v_writelane_b32 v40, s48, 14
|
||||
; GCN-NEXT: v_writelane_b32 v40, s49, 15
|
||||
; GCN-NEXT: v_writelane_b32 v40, s50, 16
|
||||
; GCN-NEXT: v_writelane_b32 v40, s51, 17
|
||||
; GCN-NEXT: v_writelane_b32 v40, s52, 18
|
||||
; GCN-NEXT: v_writelane_b32 v40, s53, 19
|
||||
; GCN-NEXT: v_writelane_b32 v40, s54, 20
|
||||
; GCN-NEXT: v_writelane_b32 v40, s55, 21
|
||||
; GCN-NEXT: v_writelane_b32 v40, s56, 22
|
||||
; GCN-NEXT: v_writelane_b32 v40, s57, 23
|
||||
; GCN-NEXT: v_writelane_b32 v40, s58, 24
|
||||
; GCN-NEXT: v_writelane_b32 v40, s59, 25
|
||||
; GCN-NEXT: v_writelane_b32 v40, s60, 26
|
||||
; GCN-NEXT: v_writelane_b32 v40, s61, 27
|
||||
; GCN-NEXT: v_writelane_b32 v40, s62, 28
|
||||
; GCN-NEXT: v_writelane_b32 v40, s63, 29
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GCN-NEXT: v_mov_b32_e32 v41, v0
|
||||
; GCN-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s4, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s5, v2
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GCN-NEXT: v_readfirstlane_b32 s10, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s11, v2
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[1:2]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v41
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[10:11]
|
||||
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[8:9]
|
||||
; GCN-NEXT: s_cbranch_execnz BB7_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v41
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 5
|
||||
; GCN-NEXT: v_readlane_b32 s63, v40, 29
|
||||
; GCN-NEXT: v_readlane_b32 s62, v40, 28
|
||||
; GCN-NEXT: v_readlane_b32 s61, v40, 27
|
||||
; GCN-NEXT: v_readlane_b32 s60, v40, 26
|
||||
; GCN-NEXT: v_readlane_b32 s59, v40, 25
|
||||
; GCN-NEXT: v_readlane_b32 s58, v40, 24
|
||||
; GCN-NEXT: v_readlane_b32 s57, v40, 23
|
||||
; GCN-NEXT: v_readlane_b32 s56, v40, 22
|
||||
; GCN-NEXT: v_readlane_b32 s55, v40, 21
|
||||
; GCN-NEXT: v_readlane_b32 s54, v40, 20
|
||||
; GCN-NEXT: v_readlane_b32 s53, v40, 19
|
||||
; GCN-NEXT: v_readlane_b32 s52, v40, 18
|
||||
; GCN-NEXT: v_readlane_b32 s51, v40, 17
|
||||
; GCN-NEXT: v_readlane_b32 s50, v40, 16
|
||||
; GCN-NEXT: v_readlane_b32 s49, v40, 15
|
||||
; GCN-NEXT: v_readlane_b32 s48, v40, 14
|
||||
; GCN-NEXT: v_readlane_b32 s47, v40, 13
|
||||
; GCN-NEXT: v_readlane_b32 s46, v40, 12
|
||||
; GCN-NEXT: v_readlane_b32 s45, v40, 11
|
||||
; GCN-NEXT: v_readlane_b32 s44, v40, 10
|
||||
; GCN-NEXT: v_readlane_b32 s43, v40, 9
|
||||
; GCN-NEXT: v_readlane_b32 s42, v40, 8
|
||||
; GCN-NEXT: v_readlane_b32 s41, v40, 7
|
||||
; GCN-NEXT: v_readlane_b32 s40, v40, 6
|
||||
; GCN-NEXT: v_readlane_b32 s39, v40, 5
|
||||
; GCN-NEXT: v_readlane_b32 s38, v40, 4
|
||||
; GCN-NEXT: v_readlane_b32 s37, v40, 3
|
||||
; GCN-NEXT: v_readlane_b32 s36, v40, 2
|
||||
; GCN-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GCN-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 6
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 30
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
|
@ -1208,7 +1355,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
|
|||
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s33, 6
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s33, 30
|
||||
; GISEL-NEXT: s_mov_b32 s33, s32
|
||||
; GISEL-NEXT: s_addk_i32 s32, 0x400
|
||||
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
|
||||
|
@ -1216,32 +1363,81 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr)
|
|||
; GISEL-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s36, 2
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s37, 3
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s38, 4
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s39, 5
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s40, 6
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s41, 7
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s42, 8
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s43, 9
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s44, 10
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s45, 11
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s46, 12
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s47, 13
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s48, 14
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s49, 15
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s50, 16
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s51, 17
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s52, 18
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s53, 19
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s54, 20
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s55, 21
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s56, 22
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s57, 23
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s58, 24
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s59, 25
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s60, 26
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s61, 27
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s62, 28
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s63, 29
|
||||
; GISEL-NEXT: v_mov_b32_e32 v41, v0
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s30, 4
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s31, 5
|
||||
; GISEL-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GISEL-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GISEL-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GISEL-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s4, v1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s5, v2
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s9, v2
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, v41
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
|
||||
; GISEL-NEXT: ; implicit-def: $vgpr1_vgpr2
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]
|
||||
; GISEL-NEXT: s_cbranch_execnz BB7_1
|
||||
; GISEL-NEXT: ; %bb.2:
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, v41
|
||||
; GISEL-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; GISEL-NEXT: v_readlane_b32 s5, v40, 5
|
||||
; GISEL-NEXT: v_readlane_b32 s63, v40, 29
|
||||
; GISEL-NEXT: v_readlane_b32 s62, v40, 28
|
||||
; GISEL-NEXT: v_readlane_b32 s61, v40, 27
|
||||
; GISEL-NEXT: v_readlane_b32 s60, v40, 26
|
||||
; GISEL-NEXT: v_readlane_b32 s59, v40, 25
|
||||
; GISEL-NEXT: v_readlane_b32 s58, v40, 24
|
||||
; GISEL-NEXT: v_readlane_b32 s57, v40, 23
|
||||
; GISEL-NEXT: v_readlane_b32 s56, v40, 22
|
||||
; GISEL-NEXT: v_readlane_b32 s55, v40, 21
|
||||
; GISEL-NEXT: v_readlane_b32 s54, v40, 20
|
||||
; GISEL-NEXT: v_readlane_b32 s53, v40, 19
|
||||
; GISEL-NEXT: v_readlane_b32 s52, v40, 18
|
||||
; GISEL-NEXT: v_readlane_b32 s51, v40, 17
|
||||
; GISEL-NEXT: v_readlane_b32 s50, v40, 16
|
||||
; GISEL-NEXT: v_readlane_b32 s49, v40, 15
|
||||
; GISEL-NEXT: v_readlane_b32 s48, v40, 14
|
||||
; GISEL-NEXT: v_readlane_b32 s47, v40, 13
|
||||
; GISEL-NEXT: v_readlane_b32 s46, v40, 12
|
||||
; GISEL-NEXT: v_readlane_b32 s45, v40, 11
|
||||
; GISEL-NEXT: v_readlane_b32 s44, v40, 10
|
||||
; GISEL-NEXT: v_readlane_b32 s43, v40, 9
|
||||
; GISEL-NEXT: v_readlane_b32 s42, v40, 8
|
||||
; GISEL-NEXT: v_readlane_b32 s41, v40, 7
|
||||
; GISEL-NEXT: v_readlane_b32 s40, v40, 6
|
||||
; GISEL-NEXT: v_readlane_b32 s39, v40, 5
|
||||
; GISEL-NEXT: v_readlane_b32 s38, v40, 4
|
||||
; GISEL-NEXT: v_readlane_b32 s37, v40, 3
|
||||
; GISEL-NEXT: v_readlane_b32 s36, v40, 2
|
||||
; GISEL-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GISEL-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GISEL-NEXT: v_readlane_b32 s33, v40, 6
|
||||
; GISEL-NEXT: v_readlane_b32 s33, v40, 30
|
||||
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
|
||||
|
@ -1262,38 +1458,87 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
|
|||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 6
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 30
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_addk_i32 s32, 0x400
|
||||
; GCN-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v40, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s37, 3
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 4
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 5
|
||||
; GCN-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GCN-NEXT: v_writelane_b32 v40, s38, 4
|
||||
; GCN-NEXT: v_writelane_b32 v40, s39, 5
|
||||
; GCN-NEXT: v_writelane_b32 v40, s40, 6
|
||||
; GCN-NEXT: v_writelane_b32 v40, s41, 7
|
||||
; GCN-NEXT: v_writelane_b32 v40, s42, 8
|
||||
; GCN-NEXT: v_writelane_b32 v40, s43, 9
|
||||
; GCN-NEXT: v_writelane_b32 v40, s44, 10
|
||||
; GCN-NEXT: v_writelane_b32 v40, s45, 11
|
||||
; GCN-NEXT: v_writelane_b32 v40, s46, 12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s47, 13
|
||||
; GCN-NEXT: v_writelane_b32 v40, s48, 14
|
||||
; GCN-NEXT: v_writelane_b32 v40, s49, 15
|
||||
; GCN-NEXT: v_writelane_b32 v40, s50, 16
|
||||
; GCN-NEXT: v_writelane_b32 v40, s51, 17
|
||||
; GCN-NEXT: v_writelane_b32 v40, s52, 18
|
||||
; GCN-NEXT: v_writelane_b32 v40, s53, 19
|
||||
; GCN-NEXT: v_writelane_b32 v40, s54, 20
|
||||
; GCN-NEXT: v_writelane_b32 v40, s55, 21
|
||||
; GCN-NEXT: v_writelane_b32 v40, s56, 22
|
||||
; GCN-NEXT: v_writelane_b32 v40, s57, 23
|
||||
; GCN-NEXT: v_writelane_b32 v40, s58, 24
|
||||
; GCN-NEXT: v_writelane_b32 v40, s59, 25
|
||||
; GCN-NEXT: v_writelane_b32 v40, s60, 26
|
||||
; GCN-NEXT: v_writelane_b32 v40, s61, 27
|
||||
; GCN-NEXT: v_writelane_b32 v40, s62, 28
|
||||
; GCN-NEXT: v_writelane_b32 v40, s63, 29
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GCN-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s4, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s5, v2
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: v_readfirstlane_b32 s10, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s11, v2
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[1:2]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[10:11]
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2
|
||||
; GCN-NEXT: ; implicit-def: $vgpr0
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[8:9]
|
||||
; GCN-NEXT: s_cbranch_execnz BB8_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 5
|
||||
; GCN-NEXT: v_readlane_b32 s63, v40, 29
|
||||
; GCN-NEXT: v_readlane_b32 s62, v40, 28
|
||||
; GCN-NEXT: v_readlane_b32 s61, v40, 27
|
||||
; GCN-NEXT: v_readlane_b32 s60, v40, 26
|
||||
; GCN-NEXT: v_readlane_b32 s59, v40, 25
|
||||
; GCN-NEXT: v_readlane_b32 s58, v40, 24
|
||||
; GCN-NEXT: v_readlane_b32 s57, v40, 23
|
||||
; GCN-NEXT: v_readlane_b32 s56, v40, 22
|
||||
; GCN-NEXT: v_readlane_b32 s55, v40, 21
|
||||
; GCN-NEXT: v_readlane_b32 s54, v40, 20
|
||||
; GCN-NEXT: v_readlane_b32 s53, v40, 19
|
||||
; GCN-NEXT: v_readlane_b32 s52, v40, 18
|
||||
; GCN-NEXT: v_readlane_b32 s51, v40, 17
|
||||
; GCN-NEXT: v_readlane_b32 s50, v40, 16
|
||||
; GCN-NEXT: v_readlane_b32 s49, v40, 15
|
||||
; GCN-NEXT: v_readlane_b32 s48, v40, 14
|
||||
; GCN-NEXT: v_readlane_b32 s47, v40, 13
|
||||
; GCN-NEXT: v_readlane_b32 s46, v40, 12
|
||||
; GCN-NEXT: v_readlane_b32 s45, v40, 11
|
||||
; GCN-NEXT: v_readlane_b32 s44, v40, 10
|
||||
; GCN-NEXT: v_readlane_b32 s43, v40, 9
|
||||
; GCN-NEXT: v_readlane_b32 s42, v40, 8
|
||||
; GCN-NEXT: v_readlane_b32 s41, v40, 7
|
||||
; GCN-NEXT: v_readlane_b32 s40, v40, 6
|
||||
; GCN-NEXT: v_readlane_b32 s39, v40, 5
|
||||
; GCN-NEXT: v_readlane_b32 s38, v40, 4
|
||||
; GCN-NEXT: v_readlane_b32 s37, v40, 3
|
||||
; GCN-NEXT: v_readlane_b32 s36, v40, 2
|
||||
; GCN-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GCN-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GCN-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 6
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 30
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
|
@ -1306,38 +1551,87 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr)
|
|||
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s33, 6
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s33, 30
|
||||
; GISEL-NEXT: s_mov_b32 s33, s32
|
||||
; GISEL-NEXT: s_addk_i32 s32, 0x400
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s36, 2
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s37, 3
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s30, 4
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s31, 5
|
||||
; GISEL-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s38, 4
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s39, 5
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s40, 6
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s41, 7
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s42, 8
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s43, 9
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s44, 10
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s45, 11
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s46, 12
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s47, 13
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s48, 14
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s49, 15
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s50, 16
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s51, 17
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s52, 18
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s53, 19
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s54, 20
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s55, 21
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s56, 22
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s57, 23
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s58, 24
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s59, 25
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s60, 26
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s61, 27
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s62, 28
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s63, 29
|
||||
; GISEL-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GISEL-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GISEL-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s4, v1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s5, v2
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s9, v2
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GISEL-NEXT: ; implicit-def: $vgpr1_vgpr2
|
||||
; GISEL-NEXT: ; implicit-def: $vgpr0
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]
|
||||
; GISEL-NEXT: s_cbranch_execnz BB8_1
|
||||
; GISEL-NEXT: ; %bb.2:
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GISEL-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; GISEL-NEXT: v_readlane_b32 s5, v40, 5
|
||||
; GISEL-NEXT: v_readlane_b32 s63, v40, 29
|
||||
; GISEL-NEXT: v_readlane_b32 s62, v40, 28
|
||||
; GISEL-NEXT: v_readlane_b32 s61, v40, 27
|
||||
; GISEL-NEXT: v_readlane_b32 s60, v40, 26
|
||||
; GISEL-NEXT: v_readlane_b32 s59, v40, 25
|
||||
; GISEL-NEXT: v_readlane_b32 s58, v40, 24
|
||||
; GISEL-NEXT: v_readlane_b32 s57, v40, 23
|
||||
; GISEL-NEXT: v_readlane_b32 s56, v40, 22
|
||||
; GISEL-NEXT: v_readlane_b32 s55, v40, 21
|
||||
; GISEL-NEXT: v_readlane_b32 s54, v40, 20
|
||||
; GISEL-NEXT: v_readlane_b32 s53, v40, 19
|
||||
; GISEL-NEXT: v_readlane_b32 s52, v40, 18
|
||||
; GISEL-NEXT: v_readlane_b32 s51, v40, 17
|
||||
; GISEL-NEXT: v_readlane_b32 s50, v40, 16
|
||||
; GISEL-NEXT: v_readlane_b32 s49, v40, 15
|
||||
; GISEL-NEXT: v_readlane_b32 s48, v40, 14
|
||||
; GISEL-NEXT: v_readlane_b32 s47, v40, 13
|
||||
; GISEL-NEXT: v_readlane_b32 s46, v40, 12
|
||||
; GISEL-NEXT: v_readlane_b32 s45, v40, 11
|
||||
; GISEL-NEXT: v_readlane_b32 s44, v40, 10
|
||||
; GISEL-NEXT: v_readlane_b32 s43, v40, 9
|
||||
; GISEL-NEXT: v_readlane_b32 s42, v40, 8
|
||||
; GISEL-NEXT: v_readlane_b32 s41, v40, 7
|
||||
; GISEL-NEXT: v_readlane_b32 s40, v40, 6
|
||||
; GISEL-NEXT: v_readlane_b32 s39, v40, 5
|
||||
; GISEL-NEXT: v_readlane_b32 s38, v40, 4
|
||||
; GISEL-NEXT: v_readlane_b32 s37, v40, 3
|
||||
; GISEL-NEXT: v_readlane_b32 s36, v40, 2
|
||||
; GISEL-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GISEL-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GISEL-NEXT: v_readlane_b32 s33, v40, 6
|
||||
; GISEL-NEXT: v_readlane_b32 s33, v40, 30
|
||||
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
|
||||
|
@ -1355,35 +1649,84 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
|
|||
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 6
|
||||
; GCN-NEXT: v_writelane_b32 v40, s33, 30
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_addk_i32 s32, 0x400
|
||||
; GCN-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GCN-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GCN-NEXT: v_writelane_b32 v40, s36, 2
|
||||
; GCN-NEXT: v_writelane_b32 v40, s37, 3
|
||||
; GCN-NEXT: v_writelane_b32 v40, s30, 4
|
||||
; GCN-NEXT: v_writelane_b32 v40, s31, 5
|
||||
; GCN-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GCN-NEXT: v_writelane_b32 v40, s38, 4
|
||||
; GCN-NEXT: v_writelane_b32 v40, s39, 5
|
||||
; GCN-NEXT: v_writelane_b32 v40, s40, 6
|
||||
; GCN-NEXT: v_writelane_b32 v40, s41, 7
|
||||
; GCN-NEXT: v_writelane_b32 v40, s42, 8
|
||||
; GCN-NEXT: v_writelane_b32 v40, s43, 9
|
||||
; GCN-NEXT: v_writelane_b32 v40, s44, 10
|
||||
; GCN-NEXT: v_writelane_b32 v40, s45, 11
|
||||
; GCN-NEXT: v_writelane_b32 v40, s46, 12
|
||||
; GCN-NEXT: v_writelane_b32 v40, s47, 13
|
||||
; GCN-NEXT: v_writelane_b32 v40, s48, 14
|
||||
; GCN-NEXT: v_writelane_b32 v40, s49, 15
|
||||
; GCN-NEXT: v_writelane_b32 v40, s50, 16
|
||||
; GCN-NEXT: v_writelane_b32 v40, s51, 17
|
||||
; GCN-NEXT: v_writelane_b32 v40, s52, 18
|
||||
; GCN-NEXT: v_writelane_b32 v40, s53, 19
|
||||
; GCN-NEXT: v_writelane_b32 v40, s54, 20
|
||||
; GCN-NEXT: v_writelane_b32 v40, s55, 21
|
||||
; GCN-NEXT: v_writelane_b32 v40, s56, 22
|
||||
; GCN-NEXT: v_writelane_b32 v40, s57, 23
|
||||
; GCN-NEXT: v_writelane_b32 v40, s58, 24
|
||||
; GCN-NEXT: v_writelane_b32 v40, s59, 25
|
||||
; GCN-NEXT: v_writelane_b32 v40, s60, 26
|
||||
; GCN-NEXT: v_writelane_b32 v40, s61, 27
|
||||
; GCN-NEXT: v_writelane_b32 v40, s62, 28
|
||||
; GCN-NEXT: v_writelane_b32 v40, s63, 29
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GCN-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s5, v1
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GCN-NEXT: v_readfirstlane_b32 s10, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s11, v1
|
||||
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[10:11]
|
||||
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GCN-NEXT: s_xor_b64 exec, exec, s[8:9]
|
||||
; GCN-NEXT: s_cbranch_execnz BB9_1
|
||||
; GCN-NEXT: ; %bb.2:
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; GCN-NEXT: v_readlane_b32 s5, v40, 5
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN-NEXT: v_readlane_b32 s63, v40, 29
|
||||
; GCN-NEXT: v_readlane_b32 s62, v40, 28
|
||||
; GCN-NEXT: v_readlane_b32 s61, v40, 27
|
||||
; GCN-NEXT: v_readlane_b32 s60, v40, 26
|
||||
; GCN-NEXT: v_readlane_b32 s59, v40, 25
|
||||
; GCN-NEXT: v_readlane_b32 s58, v40, 24
|
||||
; GCN-NEXT: v_readlane_b32 s57, v40, 23
|
||||
; GCN-NEXT: v_readlane_b32 s56, v40, 22
|
||||
; GCN-NEXT: v_readlane_b32 s55, v40, 21
|
||||
; GCN-NEXT: v_readlane_b32 s54, v40, 20
|
||||
; GCN-NEXT: v_readlane_b32 s53, v40, 19
|
||||
; GCN-NEXT: v_readlane_b32 s52, v40, 18
|
||||
; GCN-NEXT: v_readlane_b32 s51, v40, 17
|
||||
; GCN-NEXT: v_readlane_b32 s50, v40, 16
|
||||
; GCN-NEXT: v_readlane_b32 s49, v40, 15
|
||||
; GCN-NEXT: v_readlane_b32 s48, v40, 14
|
||||
; GCN-NEXT: v_readlane_b32 s47, v40, 13
|
||||
; GCN-NEXT: v_readlane_b32 s46, v40, 12
|
||||
; GCN-NEXT: v_readlane_b32 s45, v40, 11
|
||||
; GCN-NEXT: v_readlane_b32 s44, v40, 10
|
||||
; GCN-NEXT: v_readlane_b32 s43, v40, 9
|
||||
; GCN-NEXT: v_readlane_b32 s42, v40, 8
|
||||
; GCN-NEXT: v_readlane_b32 s41, v40, 7
|
||||
; GCN-NEXT: v_readlane_b32 s40, v40, 6
|
||||
; GCN-NEXT: v_readlane_b32 s39, v40, 5
|
||||
; GCN-NEXT: v_readlane_b32 s38, v40, 4
|
||||
; GCN-NEXT: v_readlane_b32 s37, v40, 3
|
||||
; GCN-NEXT: v_readlane_b32 s36, v40, 2
|
||||
; GCN-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GCN-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GCN-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 6
|
||||
; GCN-NEXT: v_readlane_b32 s33, v40, 30
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[6:7]
|
||||
|
@ -1396,35 +1739,84 @@ define void @test_indirect_tail_call_vgpr_ptr(void()* %fptr) {
|
|||
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s33, 6
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s33, 30
|
||||
; GISEL-NEXT: s_mov_b32 s33, s32
|
||||
; GISEL-NEXT: s_addk_i32 s32, 0x400
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s34, 0
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s35, 1
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s36, 2
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s37, 3
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s30, 4
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s31, 5
|
||||
; GISEL-NEXT: s_mov_b64 s[34:35], exec
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s38, 4
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s39, 5
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s40, 6
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s41, 7
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s42, 8
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s43, 9
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s44, 10
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s45, 11
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s46, 12
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s47, 13
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s48, 14
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s49, 15
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s50, 16
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s51, 17
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s52, 18
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s53, 19
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s54, 20
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s55, 21
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s56, 22
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s57, 23
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s58, 24
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s59, 25
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s60, 26
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s61, 27
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s62, 28
|
||||
; GISEL-NEXT: v_writelane_b32 v40, s63, 29
|
||||
; GISEL-NEXT: s_mov_b64 s[4:5], s[30:31]
|
||||
; GISEL-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GISEL-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s5, v1
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[36:37], vcc
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
|
||||
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
|
||||
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
|
||||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
|
||||
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[36:37]
|
||||
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]
|
||||
; GISEL-NEXT: s_cbranch_execnz BB9_1
|
||||
; GISEL-NEXT: ; %bb.2:
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GISEL-NEXT: v_readlane_b32 s4, v40, 4
|
||||
; GISEL-NEXT: v_readlane_b32 s5, v40, 5
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GISEL-NEXT: v_readlane_b32 s63, v40, 29
|
||||
; GISEL-NEXT: v_readlane_b32 s62, v40, 28
|
||||
; GISEL-NEXT: v_readlane_b32 s61, v40, 27
|
||||
; GISEL-NEXT: v_readlane_b32 s60, v40, 26
|
||||
; GISEL-NEXT: v_readlane_b32 s59, v40, 25
|
||||
; GISEL-NEXT: v_readlane_b32 s58, v40, 24
|
||||
; GISEL-NEXT: v_readlane_b32 s57, v40, 23
|
||||
; GISEL-NEXT: v_readlane_b32 s56, v40, 22
|
||||
; GISEL-NEXT: v_readlane_b32 s55, v40, 21
|
||||
; GISEL-NEXT: v_readlane_b32 s54, v40, 20
|
||||
; GISEL-NEXT: v_readlane_b32 s53, v40, 19
|
||||
; GISEL-NEXT: v_readlane_b32 s52, v40, 18
|
||||
; GISEL-NEXT: v_readlane_b32 s51, v40, 17
|
||||
; GISEL-NEXT: v_readlane_b32 s50, v40, 16
|
||||
; GISEL-NEXT: v_readlane_b32 s49, v40, 15
|
||||
; GISEL-NEXT: v_readlane_b32 s48, v40, 14
|
||||
; GISEL-NEXT: v_readlane_b32 s47, v40, 13
|
||||
; GISEL-NEXT: v_readlane_b32 s46, v40, 12
|
||||
; GISEL-NEXT: v_readlane_b32 s45, v40, 11
|
||||
; GISEL-NEXT: v_readlane_b32 s44, v40, 10
|
||||
; GISEL-NEXT: v_readlane_b32 s43, v40, 9
|
||||
; GISEL-NEXT: v_readlane_b32 s42, v40, 8
|
||||
; GISEL-NEXT: v_readlane_b32 s41, v40, 7
|
||||
; GISEL-NEXT: v_readlane_b32 s40, v40, 6
|
||||
; GISEL-NEXT: v_readlane_b32 s39, v40, 5
|
||||
; GISEL-NEXT: v_readlane_b32 s38, v40, 4
|
||||
; GISEL-NEXT: v_readlane_b32 s37, v40, 3
|
||||
; GISEL-NEXT: v_readlane_b32 s36, v40, 2
|
||||
; GISEL-NEXT: v_readlane_b32 s35, v40, 1
|
||||
; GISEL-NEXT: v_readlane_b32 s34, v40, 0
|
||||
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GISEL-NEXT: v_readlane_b32 s33, v40, 6
|
||||
; GISEL-NEXT: v_readlane_b32 s33, v40, 30
|
||||
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
|
||||
|
|
|
@ -8,15 +8,15 @@
|
|||
define amdgpu_kernel void @s_input_output_i128() {
|
||||
; GFX908-LABEL: name: s_input_output_i128
|
||||
; GFX908: bb.0 (%ir-block.0):
|
||||
; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:SGPR_128 */, def %4
|
||||
; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:SGPR_128 */, def %4
|
||||
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
|
||||
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:SGPR_128 */, [[COPY]]
|
||||
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:SGPR_128 */, [[COPY]]
|
||||
; GFX908-NEXT: S_ENDPGM 0
|
||||
; GFX90A-LABEL: name: s_input_output_i128
|
||||
; GFX90A: bb.0 (%ir-block.0):
|
||||
; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:SGPR_128 */, def %4
|
||||
; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:SGPR_128 */, def %4
|
||||
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
|
||||
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:SGPR_128 */, [[COPY]]
|
||||
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:SGPR_128 */, [[COPY]]
|
||||
; GFX90A-NEXT: S_ENDPGM 0
|
||||
%val = tail call i128 asm sideeffect "; def $0", "=s"()
|
||||
call void asm sideeffect "; use $0", "s"(i128 %val)
|
||||
|
@ -26,15 +26,15 @@ define amdgpu_kernel void @s_input_output_i128() {
|
|||
define amdgpu_kernel void @v_input_output_i128() {
|
||||
; GFX908-LABEL: name: v_input_output_i128
|
||||
; GFX908: bb.0 (%ir-block.0):
|
||||
; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4718602 /* regdef:VReg_128 */, def %4
|
||||
; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5046282 /* regdef:VReg_128 */, def %4
|
||||
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4
|
||||
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4718601 /* reguse:VReg_128 */, [[COPY]]
|
||||
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_128 */, [[COPY]]
|
||||
; GFX908-NEXT: S_ENDPGM 0
|
||||
; GFX90A-LABEL: name: v_input_output_i128
|
||||
; GFX90A: bb.0 (%ir-block.0):
|
||||
; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4849674 /* regdef:VReg_128_Align2 */, def %4
|
||||
; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5177354 /* regdef:VReg_128_Align2 */, def %4
|
||||
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
|
||||
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4849673 /* reguse:VReg_128_Align2 */, [[COPY]]
|
||||
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5177353 /* reguse:VReg_128_Align2 */, [[COPY]]
|
||||
; GFX90A-NEXT: S_ENDPGM 0
|
||||
%val = tail call i128 asm sideeffect "; def $0", "=v"()
|
||||
call void asm sideeffect "; use $0", "v"(i128 %val)
|
||||
|
@ -44,15 +44,15 @@ define amdgpu_kernel void @v_input_output_i128() {
|
|||
define amdgpu_kernel void @a_input_output_i128() {
|
||||
; GFX908-LABEL: name: a_input_output_i128
|
||||
; GFX908: bb.0 (%ir-block.0):
|
||||
; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4653066 /* regdef:AReg_128 */, def %4
|
||||
; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4980746 /* regdef:AReg_128 */, def %4
|
||||
; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4
|
||||
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4653065 /* reguse:AReg_128 */, [[COPY]]
|
||||
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4980745 /* reguse:AReg_128 */, [[COPY]]
|
||||
; GFX908-NEXT: S_ENDPGM 0
|
||||
; GFX90A-LABEL: name: a_input_output_i128
|
||||
; GFX90A: bb.0 (%ir-block.0):
|
||||
; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:AReg_128_Align2 */, def %4
|
||||
; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:AReg_128_Align2 */, def %4
|
||||
; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
|
||||
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:AReg_128_Align2 */, [[COPY]]
|
||||
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:AReg_128_Align2 */, [[COPY]]
|
||||
; GFX90A-NEXT: S_ENDPGM 0
|
||||
%val = call i128 asm sideeffect "; def $0", "=a"()
|
||||
call void asm sideeffect "; use $0", "a"(i128 %val)
|
||||
|
|
|
@ -28,31 +28,32 @@ define amdgpu_cs void @test_simple_indirect_call() {
|
|||
; Attributor adds work-group-size attribute. This should be ok.
|
||||
; GFX9-LABEL: test_simple_indirect_call:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_getpc_b64 s[36:37]
|
||||
; GFX9-NEXT: s_mov_b32 s36, s0
|
||||
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x10
|
||||
; GFX9-NEXT: s_getpc_b64 s[8:9]
|
||||
; GFX9-NEXT: s_mov_b32 s8, s0
|
||||
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_mov_b32 s32, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s36, s36, s0
|
||||
; GFX9-NEXT: s_addc_u32 s37, s37, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; GFX9-NEXT: s_add_u32 s8, s8, s0
|
||||
; GFX9-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: test_simple_indirect_call:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_getpc_b64 s[36:37]
|
||||
; GFX10-NEXT: s_mov_b32 s36, s0
|
||||
; GFX10-NEXT: s_getpc_b64 s[8:9]
|
||||
; GFX10-NEXT: s_mov_b32 s8, s0
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x10
|
||||
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
|
||||
; GFX10-NEXT: s_mov_b32 s32, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_bitset0_b32 s39, 21
|
||||
; GFX10-NEXT: s_add_u32 s36, s36, s0
|
||||
; GFX10-NEXT: s_addc_u32 s37, s37, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37]
|
||||
; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; GFX10-NEXT: s_bitset0_b32 s11, 21
|
||||
; GFX10-NEXT: s_add_u32 s8, s8, s0
|
||||
; GFX10-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
|
||||
; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
|
||||
|
|
|
@ -17,12 +17,28 @@ define amdgpu_gfx float @caller(float %arg0) {
|
|||
; GCN-LABEL: caller:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: v_writelane_b32 v1, s33, 1
|
||||
; GCN-NEXT: s_mov_b32 s33, s32
|
||||
; GCN-NEXT: s_addk_i32 s32, 0x400
|
||||
; GCN-NEXT: v_writelane_b32 v1, s4, 0
|
||||
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
|
||||
; GCN-NEXT: s_mov_b32 s4, 2.0
|
||||
; GCN-NEXT: s_getpc_b64 s[6:7]
|
||||
; GCN-NEXT: s_add_u32 s6, s6, callee@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s7, s7, callee@rel32@hi+12
|
||||
; GCN-NEXT: s_setpc_b64 s[6:7]
|
||||
; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GCN-NEXT: s_getpc_b64 s[30:31]
|
||||
; GCN-NEXT: s_add_u32 s30, s30, callee@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s31, s31, callee@rel32@hi+12
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GCN-NEXT: v_readlane_b32 s4, v1, 0
|
||||
; GCN-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GCN-NEXT: v_readlane_b32 s33, v1, 1
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[30:31], -1
|
||||
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[30:31]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[36:37]
|
||||
%add = fadd float %arg0, 1.0
|
||||
%call = tail call amdgpu_gfx float @callee(float %add, float inreg 2.0)
|
||||
ret float %call
|
||||
|
|
|
@ -261,7 +261,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
|
|||
; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
|
||||
; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
|
||||
; SI-NEXT: $vgpr0 = COPY killed [[PHI5]]
|
||||
; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
|
||||
; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
|
||||
; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
|
||||
; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
|
||||
; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc
|
||||
|
@ -294,7 +294,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
|
|||
; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
|
||||
; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
|
||||
; SI-NEXT: $vgpr0 = COPY killed [[PHI7]]
|
||||
; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
|
||||
; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
|
||||
; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
|
||||
; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
|
||||
; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc
|
||||
|
@ -374,7 +374,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
|
|||
; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
|
||||
; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
|
||||
; SI-NEXT: $vgpr0 = COPY [[COPY4]]
|
||||
; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
|
||||
; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
|
||||
; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
|
||||
; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
|
||||
; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc
|
||||
|
@ -406,7 +406,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
|
|||
; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
|
||||
; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
|
||||
; SI-NEXT: $vgpr0 = COPY [[COPY4]]
|
||||
; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
|
||||
; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
|
||||
; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
|
||||
; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
|
||||
; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc
|
||||
|
|
|
@ -158,61 +158,61 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
|
|||
; SI-LABEL: loop:
|
||||
; SI: ; %bb.0: ; %main_body
|
||||
; SI-NEXT: v_mov_b32_e32 v6, v0
|
||||
; SI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
|
||||
; SI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
|
||||
; SI-NEXT: s_mov_b32 s38, -1
|
||||
; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
||||
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
||||
; SI-NEXT: s_mov_b32 s14, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, v1
|
||||
; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6
|
||||
; SI-NEXT: s_mov_b32 s39, 0x31c16000
|
||||
; SI-NEXT: s_add_u32 s36, s36, s1
|
||||
; SI-NEXT: s_addc_u32 s37, s37, 0
|
||||
; SI-NEXT: s_mov_b32 s15, 0x31c16000
|
||||
; SI-NEXT: s_add_u32 s12, s12, s1
|
||||
; SI-NEXT: s_addc_u32 s13, s13, 0
|
||||
; SI-NEXT: s_mov_b32 s32, 0
|
||||
; SI-NEXT: ; implicit-def: $vgpr1
|
||||
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
|
||||
; SI-NEXT: s_xor_b32 s33, exec_lo, s0
|
||||
; SI-NEXT: s_xor_b32 s4, exec_lo, s0
|
||||
; SI-NEXT: s_cbranch_execz BB3_4
|
||||
; SI-NEXT: ; %bb.1: ; %else
|
||||
; SI-NEXT: s_mov_b32 s34, exec_lo
|
||||
; SI-NEXT: s_mov_b32 s5, exec_lo
|
||||
; SI-NEXT: BB3_2: ; =>This Inner Loop Header: Depth=1
|
||||
; SI-NEXT: v_readfirstlane_b32 s4, v4
|
||||
; SI-NEXT: v_readfirstlane_b32 s5, v5
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
|
||||
; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[36:37]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; SI-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; SI-NEXT: v_readfirstlane_b32 s6, v4
|
||||
; SI-NEXT: v_readfirstlane_b32 s7, v5
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[4:5]
|
||||
; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[12:13]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
|
||||
; SI-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v0
|
||||
; SI-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8
|
||||
; SI-NEXT: s_cbranch_execnz BB3_2
|
||||
; SI-NEXT: ; %bb.3:
|
||||
; SI-NEXT: s_mov_b32 exec_lo, s34
|
||||
; SI-NEXT: s_mov_b32 exec_lo, s5
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: ; implicit-def: $vgpr2
|
||||
; SI-NEXT: BB3_4: ; %Flow
|
||||
; SI-NEXT: s_or_saveexec_b32 s33, s33
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s33
|
||||
; SI-NEXT: s_or_saveexec_b32 s4, s4
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s4
|
||||
; SI-NEXT: s_cbranch_execz BB3_8
|
||||
; SI-NEXT: ; %bb.5: ; %if
|
||||
; SI-NEXT: s_mov_b32 s34, exec_lo
|
||||
; SI-NEXT: s_mov_b32 s5, exec_lo
|
||||
; SI-NEXT: BB3_6: ; =>This Inner Loop Header: Depth=1
|
||||
; SI-NEXT: v_readfirstlane_b32 s4, v2
|
||||
; SI-NEXT: v_readfirstlane_b32 s5, v3
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
|
||||
; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[36:37]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; SI-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; SI-NEXT: v_readfirstlane_b32 s6, v2
|
||||
; SI-NEXT: v_readfirstlane_b32 s7, v3
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3]
|
||||
; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[12:13]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
|
||||
; SI-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v0
|
||||
; SI-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8
|
||||
; SI-NEXT: s_cbranch_execnz BB3_6
|
||||
; SI-NEXT: ; %bb.7:
|
||||
; SI-NEXT: s_mov_b32 exec_lo, s34
|
||||
; SI-NEXT: s_mov_b32 exec_lo, s5
|
||||
; SI-NEXT: BB3_8: ; %end
|
||||
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s33
|
||||
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v0, v1
|
||||
; SI-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
|
@ -236,58 +236,58 @@ end:
|
|||
define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 {
|
||||
; SI-LABEL: loop_with_use:
|
||||
; SI: ; %bb.0: ; %main_body
|
||||
; SI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
|
||||
; SI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
|
||||
; SI-NEXT: s_mov_b32 s38, -1
|
||||
; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
||||
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
||||
; SI-NEXT: s_mov_b32 s14, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v40, v1
|
||||
; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
|
||||
; SI-NEXT: s_mov_b32 s39, 0x31c16000
|
||||
; SI-NEXT: s_add_u32 s36, s36, s1
|
||||
; SI-NEXT: s_addc_u32 s37, s37, 0
|
||||
; SI-NEXT: s_mov_b32 s15, 0x31c16000
|
||||
; SI-NEXT: s_add_u32 s12, s12, s1
|
||||
; SI-NEXT: s_addc_u32 s13, s13, 0
|
||||
; SI-NEXT: s_mov_b32 s32, 0
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
|
||||
; SI-NEXT: s_xor_b32 s33, exec_lo, s0
|
||||
; SI-NEXT: s_xor_b32 s4, exec_lo, s0
|
||||
; SI-NEXT: s_cbranch_execz BB4_4
|
||||
; SI-NEXT: ; %bb.1: ; %else
|
||||
; SI-NEXT: s_mov_b32 s34, exec_lo
|
||||
; SI-NEXT: s_mov_b32 s5, exec_lo
|
||||
; SI-NEXT: BB4_2: ; =>This Inner Loop Header: Depth=1
|
||||
; SI-NEXT: v_readfirstlane_b32 s4, v4
|
||||
; SI-NEXT: v_readfirstlane_b32 s5, v5
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
|
||||
; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo
|
||||
; SI-NEXT: v_readfirstlane_b32 s6, v4
|
||||
; SI-NEXT: v_readfirstlane_b32 s7, v5
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[4:5]
|
||||
; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo
|
||||
; SI-NEXT: v_mov_b32_e32 v0, v40
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[36:37]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; SI-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[12:13]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
|
||||
; SI-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; SI-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8
|
||||
; SI-NEXT: s_cbranch_execnz BB4_2
|
||||
; SI-NEXT: ; %bb.3:
|
||||
; SI-NEXT: s_mov_b32 exec_lo, s34
|
||||
; SI-NEXT: s_mov_b32 exec_lo, s5
|
||||
; SI-NEXT: ; implicit-def: $vgpr2
|
||||
; SI-NEXT: BB4_4: ; %Flow
|
||||
; SI-NEXT: s_or_saveexec_b32 s33, s33
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s33
|
||||
; SI-NEXT: s_or_saveexec_b32 s4, s4
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s4
|
||||
; SI-NEXT: s_cbranch_execz BB4_8
|
||||
; SI-NEXT: ; %bb.5: ; %if
|
||||
; SI-NEXT: s_mov_b32 s34, exec_lo
|
||||
; SI-NEXT: s_mov_b32 s5, exec_lo
|
||||
; SI-NEXT: BB4_6: ; =>This Inner Loop Header: Depth=1
|
||||
; SI-NEXT: v_readfirstlane_b32 s4, v2
|
||||
; SI-NEXT: v_readfirstlane_b32 s5, v3
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
|
||||
; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo
|
||||
; SI-NEXT: v_readfirstlane_b32 s6, v2
|
||||
; SI-NEXT: v_readfirstlane_b32 s7, v3
|
||||
; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3]
|
||||
; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo
|
||||
; SI-NEXT: v_mov_b32_e32 v0, v40
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[36:37]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; SI-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[12:13]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
|
||||
; SI-NEXT: s_swappc_b64 s[30:31], s[6:7]
|
||||
; SI-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35
|
||||
; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8
|
||||
; SI-NEXT: s_cbranch_execnz BB4_6
|
||||
; SI-NEXT: ; %bb.7:
|
||||
; SI-NEXT: s_mov_b32 exec_lo, s34
|
||||
; SI-NEXT: s_mov_b32 exec_lo, s5
|
||||
; SI-NEXT: BB4_8: ; %end
|
||||
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s33
|
||||
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s4
|
||||
; SI-NEXT: v_add_f32_e32 v0, v0, v40
|
||||
; SI-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
|
|
|
@ -6,77 +6,75 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
|
|||
; GFX9-O0-LABEL: strict_wwm_no_cfg:
|
||||
; GFX9-O0: ; %bb.0:
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s10, s5
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, s10
|
||||
; GFX9-O0-NEXT: s_mov_b32 s6, s9
|
||||
; GFX9-O0-NEXT: s_mov_b32 s7, s8
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, 0
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[5:6], off, s[4:7], s8
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s36, s4
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
|
||||
; GFX9-O0-NEXT: s_mov_b32 s37, s5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s38, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s39, s7
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 0
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[5:6], off, s[36:39], s34
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
|
||||
; GFX9-O0-NEXT: s_nop 1
|
||||
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v0, v0, v2
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
|
||||
; GFX9-O0-NEXT: s_nop 1
|
||||
; GFX9-O0-NEXT: v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v0, v1, v0
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, v4
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[10:11]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s9, v3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, 2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[40:41], v3, v4
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[40:41]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s35, v3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, 2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s35
|
||||
; GFX9-O0-NEXT: v_and_b32_e32 v3, v3, v4
|
||||
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[4:7], s8 offset:4
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-O3-LABEL: strict_wwm_no_cfg:
|
||||
; GFX9-O3: ; %bb.0:
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
|
||||
|
@ -84,12 +82,12 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
|
|||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-O3-NEXT: v_add_u32_e32 v0, v3, v0
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
|
||||
|
@ -97,14 +95,14 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
|
|||
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 1, v4
|
||||
; GFX9-O3-NEXT: v_and_b32_e32 v4, 2, v4
|
||||
; GFX9-O3-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:4
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_nop 0
|
||||
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_nop 0
|
||||
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
|
||||
%tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
|
||||
|
@ -136,52 +134,51 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
|
|||
; GFX9-O0-LABEL: strict_wwm_cfg:
|
||||
; GFX9-O0: ; %bb.0: ; %entry
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s30, 0
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s31, 1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s4
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s10, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s11, s7
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[10:11]
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s4, 2
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s5, 3
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s6, 4
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s7, 5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s4, 0
|
||||
; GFX9-O0-NEXT: s_nop 0
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], s4
|
||||
; GFX9-O0-NEXT: s_mov_b32 s36, s4
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
|
||||
; GFX9-O0-NEXT: s_mov_b32 s37, s5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s38, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s39, s7
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37]
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s40, 2
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s41, 3
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s42, 4
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s43, 5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s30, 0
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s30
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s30
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s30
|
||||
; GFX9-O0-NEXT: s_nop 1
|
||||
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, s30
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s30
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s4, 6
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s5, 7
|
||||
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[30:31], exec
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s30, 6
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v5, s31, 7
|
||||
; GFX9-O0-NEXT: s_and_b64 s[30:31], s[30:31], s[34:35]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31]
|
||||
; GFX9-O0-NEXT: s_cbranch_execz BB1_2
|
||||
; GFX9-O0-NEXT: ; %bb.1: ; %if
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
||||
|
@ -189,100 +186,100 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
|
|||
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1
|
||||
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: BB1_2: ; %merge
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s6, v5, 6
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s7, v5, 7
|
||||
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[6:7]
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s4, v5, 0
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s5, v5, 1
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s8, v5, 2
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s9, v5, 3
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s10, v5, 4
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s11, v5, 5
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s34, v5, 6
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s35, v5, 7
|
||||
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35]
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s30, v5, 0
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s31, v5, 1
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s36, v5, 2
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s37, v5, 3
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s38, v5, 4
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s39, v5, 5
|
||||
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, v3
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s6, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s6, v0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s6, 2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34
|
||||
; GFX9-O0-NEXT: v_and_b32_e32 v0, v0, v3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s6, 0
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[8:11], s6 offset:4
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 0
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: s_setpc_b64 s[4:5]
|
||||
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-O3-LABEL: strict_wwm_cfg:
|
||||
; GFX9-O3: ; %bb.0: ; %entry
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX9-O3-NEXT: s_and_saveexec_b64 s[8:9], vcc
|
||||
; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc
|
||||
; GFX9-O3-NEXT: ; %bb.1: ; %if
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
|
||||
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-O3-NEXT: v_add_u32_e32 v1, v3, v1
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX9-O3-NEXT: ; %bb.2: ; %merge
|
||||
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35]
|
||||
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
|
||||
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0
|
||||
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_nop 0
|
||||
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_nop 0
|
||||
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
|
@ -343,92 +340,89 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
|
|||
; GFX9-O0-LABEL: strict_wwm_call:
|
||||
; GFX9-O0: ; %bb.0:
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2
|
||||
; GFX9-O0-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s8
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s10, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s11, s5
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, s11
|
||||
; GFX9-O0-NEXT: s_mov_b32 s6, s10
|
||||
; GFX9-O0-NEXT: s_mov_b32 s7, s8
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15 killed $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, 0
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX9-O0-NEXT: s_mov_b32 s36, s4
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
|
||||
; GFX9-O0-NEXT: s_mov_b32 s37, s5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s38, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s39, s7
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 0
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O0-NEXT: s_getpc_b64 s[12:13]
|
||||
; GFX9-O0-NEXT: s_add_u32 s12, s12, strict_wwm_called@rel32@lo+4
|
||||
; GFX9-O0-NEXT: s_addc_u32 s13, s13, strict_wwm_called@rel32@hi+12
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[18:19], s[2:3]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[0:1]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[16:17]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[18:19]
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
|
||||
; GFX9-O0-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-O0-NEXT: s_add_u32 s30, s30, strict_wwm_called@rel32@lo+4
|
||||
; GFX9-O0-NEXT: s_addc_u32 s31, s31, strict_wwm_called@rel32@hi+12
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[46:47], s[2:3]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[44:45], s[0:1]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[12:13]
|
||||
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[4:7], s8 offset:4
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4
|
||||
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-O3-LABEL: strict_wwm_call:
|
||||
; GFX9-O3: ; %bb.0:
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O3-NEXT: s_mov_b32 s14, s33
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: s_mov_b32 s38, s33
|
||||
; GFX9-O3-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
|
||||
; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[30:31]
|
||||
; GFX9-O3-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-O3-NEXT: s_getpc_b64 s[12:13]
|
||||
; GFX9-O3-NEXT: s_add_u32 s12, s12, strict_wwm_called@rel32@lo+4
|
||||
; GFX9-O3-NEXT: s_addc_u32 s13, s13, strict_wwm_called@rel32@hi+12
|
||||
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[12:13]
|
||||
; GFX9-O3-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-O3-NEXT: s_add_u32 s30, s30, strict_wwm_called@rel32@lo+4
|
||||
; GFX9-O3-NEXT: s_addc_u32 s31, s31, strict_wwm_called@rel32@hi+12
|
||||
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
|
||||
; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00
|
||||
; GFX9-O3-NEXT: s_mov_b32 s33, s14
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O3-NEXT: s_mov_b32 s33, s38
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[30:31], -1
|
||||
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[30:31]
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: s_setpc_b64 s[10:11]
|
||||
; GFX9-O3-NEXT: s_setpc_b64 s[36:37]
|
||||
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
|
||||
%tmp134 = call amdgpu_gfx i32 @strict_wwm_called(i32 %tmp107)
|
||||
%tmp136 = add i32 %tmp134, %tmp107
|
||||
|
@ -449,32 +443,32 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
|
|||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
|
||||
; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[4:5], v2, v3
|
||||
; GFX9-O0-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v1, s[4:5]
|
||||
; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[34:35], v2, v3
|
||||
; GFX9-O0-NEXT: v_addc_co_u32_e64 v0, s[34:35], v0, v1, s[34:35]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s4, 32
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 32
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s34, v[0:1]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX9-O0-NEXT: v_mul_lo_u32 v2, v0, v1
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: v_mul_hi_u32 v1, v0, v6
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[4:5]
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s34, v[4:5]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
|
||||
; GFX9-O0-NEXT: v_mul_lo_u32 v3, v3, v6
|
||||
; GFX9-O0-NEXT: v_add3_u32 v1, v1, v2, v3
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr36
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s35
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[1:2]
|
||||
; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s34, v[1:2]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX9-O0-NEXT: v_mul_lo_u32 v6, v0, v6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, 0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, 0
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
|
||||
|
@ -489,12 +483,12 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
|
|||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
|
||||
; GFX9-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v3
|
||||
; GFX9-O0-NEXT: v_subb_co_u32_e64 v0, s[6:7], v0, v2, s[6:7]
|
||||
; GFX9-O0-NEXT: v_sub_co_u32_e64 v1, s[36:37], v1, v3
|
||||
; GFX9-O0-NEXT: v_subb_co_u32_e64 v0, s[36:37], v0, v2, s[36:37]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2]
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s34, v[1:2]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
|
||||
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -521,7 +515,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
|
|||
; GFX9-O0-LABEL: strict_wwm_call_i64:
|
||||
; GFX9-O0: ; %bb.0:
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -536,78 +530,75 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
|
|||
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s33, 9
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s33, 8
|
||||
; GFX9-O0-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s30, 0
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s31, 1
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s9, 2
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s6
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s6, v11, 3
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 4
|
||||
; GFX9-O0-NEXT: s_mov_b32 s12, s5
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s5, v11, 4
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s4
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s4, v11, 2
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s12
|
||||
; GFX9-O0-NEXT: s_mov_b32 s10, s5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s11, s7
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 5
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s9, 6
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s10, 7
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s11, 8
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s7, s4
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, s8
|
||||
; GFX9-O0-NEXT: s_mov_b32 s36, s4
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
|
||||
; GFX9-O0-NEXT: s_mov_b32 s37, s5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s38, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s39, s7
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s36, 2
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s37, 3
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s38, 4
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s39, 5
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, s9
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr30_sgpr31 killed $sgpr34_sgpr35
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[30:31], 0
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s35
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, s4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, s30
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s31
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s30, 6
|
||||
; GFX9-O0-NEXT: v_writelane_b32 v11, s31, 7
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9
|
||||
; GFX9-O0-NEXT: s_mov_b32 s4, 32
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[9:10]
|
||||
; GFX9-O0-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-O0-NEXT: s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4
|
||||
; GFX9-O0-NEXT: s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[2:3]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[0:1]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[12:13]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[14:15]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s30, 32
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s30, v[9:10]
|
||||
; GFX9-O0-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-O0-NEXT: s_add_u32 s30, s30, strict_wwm_called_i64@gotpcrel32@lo+4
|
||||
; GFX9-O0-NEXT: s_addc_u32 s31, s31, strict_wwm_called_i64@gotpcrel32@hi+12
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[38:39], s[2:3]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s4, v11, 5
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s5, v11, 6
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s6, v11, 7
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s7, v11, 8
|
||||
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s34, v11, 6
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s35, v11, 7
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s36, v11, 2
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s37, v11, 3
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s38, v11, 4
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s39, v11, 5
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s30, v11, 0
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s31, v11, 1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
|
||||
; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[10:11], v2, v4
|
||||
; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v5, s[10:11]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4
|
||||
; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, 0
|
||||
; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], s8 offset:4
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 0
|
||||
; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
|
||||
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s33, v11, 9
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s33, v11, 8
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_nop 0
|
||||
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
|
@ -628,14 +619,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
|
|||
; GFX9-O0-NEXT: s_nop 0
|
||||
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-O3-LABEL: strict_wwm_call_i64:
|
||||
; GFX9-O3: ; %bb.0:
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
|
@ -644,37 +635,37 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
|
|||
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GFX9-O3-NEXT: s_mov_b32 s14, s33
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: s_mov_b32 s38, s33
|
||||
; GFX9-O3-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
|
||||
; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[30:31]
|
||||
; GFX9-O3-NEXT: s_mov_b64 s[36:37], s[30:31]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_getpc_b64 s[12:13]
|
||||
; GFX9-O3-NEXT: s_add_u32 s12, s12, strict_wwm_called_i64@gotpcrel32@lo+4
|
||||
; GFX9-O3-NEXT: s_addc_u32 s13, s13, strict_wwm_called_i64@gotpcrel32@hi+12
|
||||
; GFX9-O3-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: s_getpc_b64 s[30:31]
|
||||
; GFX9-O3-NEXT: s_add_u32 s30, s30, strict_wwm_called_i64@gotpcrel32@lo+4
|
||||
; GFX9-O3-NEXT: s_addc_u32 s31, s31, strict_wwm_called_i64@gotpcrel32@hi+12
|
||||
; GFX9-O3-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
|
||||
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[12:13]
|
||||
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[30:31]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
|
||||
; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
|
||||
; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800
|
||||
; GFX9-O3-NEXT: s_mov_b32 s33, s14
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O3-NEXT: s_mov_b32 s33, s38
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[30:31], -1
|
||||
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_nop 0
|
||||
; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
|
@ -685,9 +676,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
|
|||
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_nop 0
|
||||
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[30:31]
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: s_setpc_b64 s[10:11]
|
||||
; GFX9-O3-NEXT: s_setpc_b64 s[36:37]
|
||||
%tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
|
||||
%tmp134 = call amdgpu_gfx i64 @strict_wwm_called_i64(i64 %tmp107)
|
||||
%tmp136 = add i64 %tmp134, %tmp107
|
||||
|
@ -701,38 +692,36 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
|||
; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main:
|
||||
; GFX9-O0: ; %bb.0:
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s10, s5
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, s10
|
||||
; GFX9-O0-NEXT: s_mov_b32 s6, s9
|
||||
; GFX9-O0-NEXT: s_mov_b32 s7, s8
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, 5
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s8, v0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, 0
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], s8 offen
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[4:7], s8 offen offset:16
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s36, s4
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
|
||||
; GFX9-O0-NEXT: s_mov_b32 s37, s5
|
||||
; GFX9-O0-NEXT: s_mov_b32 s38, s6
|
||||
; GFX9-O0-NEXT: s_mov_b32 s39, s7
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 5
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 0
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[36:39], s34 offen
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, 0x7fffffff
|
||||
; GFX9-O0-NEXT: s_mov_b32 s10, -1
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: s_mov_b32 s11, s9
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff
|
||||
; GFX9-O0-NEXT: s_mov_b32 s40, -1
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
|
||||
; GFX9-O0-NEXT: s_mov_b32 s41, s35
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
|
||||
|
@ -743,8 +732,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
|||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
|
||||
|
@ -752,8 +741,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
|||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
|
||||
; GFX9-O0-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
|
||||
|
@ -765,20 +754,20 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
|||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
|
||||
; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[4:7], s8 offen
|
||||
; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[4:7], s8 offen offset:16
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[36:39], s34 offen
|
||||
; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_nop 0
|
||||
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
|
||||
; GFX9-O3: ; %bb.0:
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
|
@ -787,25 +776,25 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
|||
; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
|
||||
; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen
|
||||
; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16
|
||||
; GFX9-O3-NEXT: s_mov_b32 s8, -1
|
||||
; GFX9-O3-NEXT: s_brev_b32 s9, -2
|
||||
; GFX9-O3-NEXT: s_mov_b32 s34, -1
|
||||
; GFX9-O3-NEXT: s_brev_b32 s35, -2
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v2, s9
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v3, s8
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v5, s8
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s9
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35
|
||||
; GFX9-O3-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
|
||||
|
@ -815,7 +804,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
|||
; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
|
||||
; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
|
||||
; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_nop 0
|
||||
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
|
@ -827,7 +816,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
|||
; GFX9-O3-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_nop 0
|
||||
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
|
||||
%tmp17 = shl i32 %index, 5
|
||||
|
|
Loading…
Reference in New Issue