[AMDGPU] Divergence-driven compare operations instruction selection

Description: This change enables compare operations to be selected to their SALU or VALU form
             depending on the SDNode divergence flag.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D106079
This commit is contained in:
alex-t 2021-07-15 19:43:56 +03:00
parent 6b94777be5
commit ed0f4415f0
67 changed files with 2573 additions and 2234 deletions

View File

@ -585,10 +585,30 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SOFT_WQM:
case AMDGPU::STRICT_WWM: {
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
if (MI.isCopy()) {
Register SrcReg = MI.getOperand(1).getReg();
if (SrcReg == AMDGPU::SCC) {
Register SCCCopy = MRI->createVirtualRegister(
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
I = BuildMI(*MI.getParent(),
std::next(MachineBasicBlock::iterator(MI)),
MI.getDebugLoc(),
TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64),
SCCCopy)
.addImm(-1)
.addImm(0);
BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
TII->get(AMDGPU::COPY), DstReg)
.addReg(SCCCopy);
MI.eraseFromParent();
continue;
}
}
if (!DstReg.isVirtual()) {
// If the destination register is a physical register there isn't
// really much we can do to fix this.

View File

@ -4482,20 +4482,20 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@ -4973,13 +4973,13 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
continue;
}
if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
!isOperandLegal(MI, Idx, &MO)) {
legalizeOpWithMove(MI, Idx);
continue;
}
if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
continue; // VGPRs are legal
// We can use one SGPR in each VOP3 instruction prior to GFX10
@ -5907,18 +5907,18 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
case AMDGPU::S_CBRANCH_SCC0:
case AMDGPU::S_CBRANCH_SCC1:
// Clear unused bits of vcc
if (ST.isWave32())
BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
AMDGPU::VCC_LO)
.addReg(AMDGPU::EXEC_LO)
.addReg(AMDGPU::VCC_LO);
else
BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
AMDGPU::VCC)
.addReg(AMDGPU::EXEC)
.addReg(AMDGPU::VCC);
case AMDGPU::S_CBRANCH_SCC1: {
// Clear unused bits of vcc
Register CondReg = Inst.getOperand(1).getReg();
bool IsSCC = CondReg == AMDGPU::SCC;
Register VCC = RI.getVCC();
Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
.addReg(EXEC)
.addReg(IsSCC ? VCC : CondReg);
Inst.RemoveOperand(1);
}
break;
case AMDGPU::S_BFE_U64:
@ -6030,8 +6030,36 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
lowerSelect(Worklist, Inst, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_CMP_EQ_I32:
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMP_GT_I32:
case AMDGPU::S_CMP_GE_I32:
case AMDGPU::S_CMP_LT_I32:
case AMDGPU::S_CMP_LE_I32:
case AMDGPU::S_CMP_EQ_U32:
case AMDGPU::S_CMP_LG_U32:
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMP_GE_U32:
case AMDGPU::S_CMP_LT_U32:
case AMDGPU::S_CMP_LE_U32:
case AMDGPU::S_CMP_EQ_U64:
case AMDGPU::S_CMP_LG_U64: {
const MCInstrDesc &NewDesc = get(NewOpcode);
Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
MachineInstr *NewInstr =
BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
.add(Inst.getOperand(0))
.add(Inst.getOperand(1));
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
MachineOperand SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
}
continue;
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
@ -6191,47 +6219,51 @@ void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
MachineOperand &Cond = Inst.getOperand(3);
Register SCCSource = Cond.getReg();
// Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
if (!Cond.isUndef()) {
bool IsSCC = (SCCSource == AMDGPU::SCC);
// If this is a trivial select where the condition is effectively not SCC
// (SCCSource is a source of copy to SCC), then the select is semantically
// equivalent to copying SCCSource. Hence, there is no need to create
// V_CNDMASK, we can just use that and bail out.
if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
(Src1.getImm() == 0)) {
MRI.replaceRegWith(Dest.getReg(), SCCSource);
return;
}
const TargetRegisterClass *TC =
RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register CopySCC = MRI.createVirtualRegister(TC);
if (IsSCC) {
// Now look for the closest SCC def if it is a copy
// replacing the SCCSource with the COPY source register
bool CopyFound = false;
for (MachineInstr &CandI :
make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
Inst.getParent()->rend())) {
if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
-1) {
if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
SCCSource = CandI.getOperand(1).getReg();
BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
.addReg(CandI.getOperand(1).getReg());
CopyFound = true;
}
break;
}
}
}
// If this is a trivial select where the condition is effectively not SCC
// (SCCSource is a source of copy to SCC), then the select is semantically
// equivalent to copying SCCSource. Hence, there is no need to create
// V_CNDMASK, we can just use that and bail out.
if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
Src1.isImm() && (Src1.getImm() == 0)) {
MRI.replaceRegWith(Dest.getReg(), SCCSource);
return;
}
const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
? &AMDGPU::SReg_64_XEXECRegClass
: &AMDGPU::SReg_32_XM0_XEXECRegClass;
Register CopySCC = MRI.createVirtualRegister(TC);
if (SCCSource == AMDGPU::SCC) {
// Insert a trivial select instead of creating a copy, because a copy from
// SCC would semantically mean just copying a single bit, but we may need
// the result to be a vector condition mask that needs preserving.
unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
: AMDGPU::S_CSELECT_B32;
auto NewSelect =
BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
} else {
BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
if (!CopyFound) {
// SCC def is not a copy
// Insert a trivial select instead of creating a copy, because a copy from
// SCC would semantically mean just copying a single bit, but we may need
// the result to be a vector condition mask that needs preserving.
unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
: AMDGPU::S_CSELECT_B32;
auto NewSelect =
BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
}
}
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@ -6242,7 +6274,7 @@ void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
.add(Src1) // False
.addImm(0)
.add(Src0) // True
.addReg(CopySCC);
.addReg(IsSCC ? CopySCC : SCCSource);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
legalizeOperands(*UpdatedInst, MDT);
@ -6833,8 +6865,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const {
bool SCCUsedImplicitly = false;
SetVectorType &Worklist,
Register NewCond) const {
// Ensure that def inst defines SCC, which is still live.
assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
@ -6846,33 +6878,18 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
SCCDefInst.getParent()->end())) {
// Check if SCC is used first.
if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI);
if (SCCIdx != -1) {
if (MI.isCopy()) {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
Register DestReg = MI.getOperand(0).getReg();
for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
(User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
User.getOperand(4).setReg(RI.getVCC());
Worklist.insert(&User);
} else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
User.getOperand(5).setReg(RI.getVCC());
// No need to add to Worklist.
}
}
MRI.replaceRegWith(DestReg, NewCond);
CopyToDelete.push_back(&MI);
} else {
if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
// This is an implicit use of SCC and it is really expected by
// the SCC users to handle.
// We cannot preserve the edge to the user so add the explicit
// copy: SCC = COPY VCC.
// The copy will be cleaned up during the processing of the user
// in lowerSelect.
SCCUsedImplicitly = true;
}
if (NewCond.isValid())
MI.getOperand(SCCIdx).setReg(NewCond);
Worklist.insert(&MI);
}
@ -6883,12 +6900,6 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
}
for (auto &Copy : CopyToDelete)
Copy->eraseFromParent();
if (SCCUsedImplicitly) {
BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
.addReg(RI.getVCC());
}
}
// Instructions that use SCC may be converted to VALU instructions. When that

View File

@ -122,7 +122,8 @@ private:
void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const;
SetVectorType &Worklist,
Register NewCond = Register()) const;
void addSCCDefsToVALUWorklist(MachineOperand &Op,
SetVectorType &Worklist) const;

View File

@ -527,15 +527,7 @@ def atomic_store_local_64_m0 : PatFrag <
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
(setcc node:$lhs, node:$rhs, node:$cond), [{
for (SDNode *Use : N->uses()) {
if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
return false;
unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
if (Reg != AMDGPU::SCC)
return false;
}
return true;
return !N->isDivergent();
}]>;
//===----------------------------------------------------------------------===//

View File

@ -10,7 +10,8 @@
; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
; CI-DAG: s_cselect_b64 vcc, -1, 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@ -22,7 +23,8 @@
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: s_cmp_lg_u32 [[PTR]], -1
; GFX9: s_cselect_b64 vcc, -1, 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@ -76,7 +78,8 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
; CI-DAG: s_cselect_b64 vcc, -1, 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@ -89,7 +92,8 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: s_cmp_lg_u32 [[PTR]], -1
; GFX9: s_cselect_b64 vcc, -1, 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

View File

@ -18,7 +18,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; GCN-ALLOCA: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v{{[0-9]+}}, v0
; GCN-PROMOTE: v_cmp_eq_u32_e64 vcc, [[IN]], 1
; GCN-PROMOTE: s_cmp_eq_u32 [[IN]], 1
; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-PROMOTE-NEXT: v_addc_u32_e32 [[RESULT:v[0-9]+]], vcc, 0, v0, vcc
; GCN: buffer_store_dword [[RESULT]]

File diff suppressed because it is too large Load Diff

View File

@ -14,7 +14,8 @@ define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) #0 {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], s0, 0
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
@ -26,7 +27,8 @@ define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) #0 {
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
@ -40,7 +42,8 @@ define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) #0 {
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 0
; GFX9-NEXT: s_cmp_eq_u32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0

View File

@ -320,7 +320,8 @@ loop:
; GCN: s_load_dword
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_{{eq|ne}}_u32_e64
; GCN-NEXT: s_cmp_lg_u32
; GCN-NEXT: s_cselect_b64
; GCN: s_cbranch_vccz [[BB2:BB[0-9]_[0-9]+]]
; GCN-NEXT: {{BB[0-9]+_[0-9]+}}:
@ -496,8 +497,8 @@ ret:
; GCN: [[LONG_BR_DEST0]]:
; GCN-DAG: v_cmp_lt_i32
; GCN-DAG: v_cmp_ge_i32
; GCN-DAG: s_cmp_lt_i32
; GCN-DAG: s_cmp_ge_i32
; GCN: s_cbranch_vccz
; GCN: s_setpc_b64

View File

@ -5,7 +5,8 @@ declare i1 @llvm.amdgcn.class.f32(float, i32)
; Produces error after adding an implicit def to v_cndmask_b32
; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
bb0:

View File

@ -10,7 +10,7 @@
; GCN: s_branch
; GCN-DAG: v_cmp_lt_i32
; GCN-DAG: v_cmp_gt_i32
; GCN-DAG: s_cmp_gt_i32
; GCN: s_and_b64
; GCN: s_mov_b64 exec

View File

@ -1439,13 +1439,13 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_min_u32_e32 v1, 32, v1
; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v0, 0xffff
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;

View File

@ -14,10 +14,11 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_or_b32 s4, s4, s5
; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: s_addc_u32 s4, s6, 0
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_cselect_b64 vcc, 1, 0
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: s_cmp_gt_u32 s6, 31
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cmp_gt_u32_e64 vcc, s6, 31
; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@ -71,15 +72,15 @@ bb:
define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-LABEL: s_add_co_br_user:
; GFX7: ; %bb.0: ; %bb
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s1, s0, s0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0
; GFX7-NEXT: s_or_b32 s1, vcc_lo, vcc_hi
; GFX7-NEXT: s_cmp_lg_u32 s1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_addc_u32 s0, s0, 0
; GFX7-NEXT: s_add_i32 s0, s2, s2
; GFX7-NEXT: s_cmp_lt_u32 s0, s2
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: s_cmp_lg_u32 s0, 0
; GFX7-NEXT: s_addc_u32 s0, s2, 0
; GFX7-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0
; GFX7-NEXT: s_and_b64 vcc, exec, vcc
; GFX7-NEXT: s_cbranch_vccnz BB1_2
@ -99,14 +100,14 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
;
; GFX9-LABEL: s_add_co_br_user:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_i32 s1, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: s_addc_u32 s0, s0, 0
; GFX9-NEXT: s_add_i32 s0, s2, s2
; GFX9-NEXT: s_cmp_lt_u32 s0, s2
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: s_addc_u32 s0, s2, 0
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: s_cbranch_vccnz BB1_2
@ -129,7 +130,8 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_i32 s1, s0, s0
; GFX10-NEXT: v_cmp_lt_u32_e64 s1, s1, s0
; GFX10-NEXT: s_cmp_lt_u32 s1, s0
; GFX10-NEXT: s_cselect_b32 s1, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX10-NEXT: s_cmpk_lg_u32 s1, 0x0
; GFX10-NEXT: s_addc_u32 s0, s0, 0

View File

@ -2,9 +2,12 @@
; GCN-LABEL: {{^}}float4_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]]
@ -19,7 +22,8 @@ entry:
; GCN-LABEL: {{^}}int4_extelt:
; GCN-NOT: buffer_
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX]], 1
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], vcc
@ -34,9 +38,12 @@ entry:
; GCN-LABEL: {{^}}double4_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
@ -50,10 +57,14 @@ entry:
; GCN-LABEL: {{^}}double5_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 4
; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
@ -83,7 +94,8 @@ entry:
; GCN-LABEL: {{^}}float2_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
; GCN: store_dword v[{{[0-9:]+}}], [[V1]]
define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) {
@ -95,7 +107,8 @@ entry:
; GCN-LABEL: {{^}}double2_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN: store_dwordx2 v[{{[0-9:]+}}]
@ -108,13 +121,20 @@ entry:
; GCN-LABEL: {{^}}half8_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
@ -132,13 +152,20 @@ entry:
; GCN-LABEL: {{^}}short8_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
@ -156,13 +183,20 @@ entry:
; GCN-LABEL: {{^}}float8_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
@ -331,21 +365,36 @@ entry:
; GCN-LABEL: {{^}}byte16_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
; GCN-DAG: v_cmp_ne_u32_e64 [[C8:[^,]+]], [[IDX]], 8
; GCN-DAG: v_cmp_ne_u32_e64 [[C9:[^,]+]], [[IDX]], 9
; GCN-DAG: v_cmp_ne_u32_e64 [[C10:[^,]+]], [[IDX]], 10
; GCN-DAG: v_cmp_ne_u32_e64 [[C11:[^,]+]], [[IDX]], 11
; GCN-DAG: v_cmp_ne_u32_e64 [[C12:[^,]+]], [[IDX]], 12
; GCN-DAG: v_cmp_ne_u32_e64 [[C13:[^,]+]], [[IDX]], 13
; GCN-DAG: v_cmp_ne_u32_e64 [[C14:[^,]+]], [[IDX]], 14
; GCN-DAG: v_cmp_ne_u32_e64 [[C15:[^,]+]], [[IDX]], 15
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 8
; GCN-DAG: s_cselect_b64 [[C8:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 9
; GCN-DAG: s_cselect_b64 [[C9:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 10
; GCN-DAG: s_cselect_b64 [[C10:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 11
; GCN-DAG: s_cselect_b64 [[C11:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 12
; GCN-DAG: s_cselect_b64 [[C12:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 13
; GCN-DAG: s_cselect_b64 [[C13:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 14
; GCN-DAG: s_cselect_b64 [[C14:[^,]+]], -1, 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 15
; GCN-DAG: s_cselect_b64 [[C15:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
@ -390,9 +439,9 @@ entry:
; GCN-LABEL: {{^}}bit128_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1
; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
; GCN-DAG: v_cmp_ne_u32_e32 [[CL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]]
; GCN: s_cmpk_lg_i32 {{s[0-9]+}}, 0x7f
; GCN: s_cselect_b64 [[CL:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]]
; GCN: v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]]
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]
define amdgpu_kernel void @bit128_extelt(i32 addrspace(1)* %out, i32 %sel) {

View File

@ -14,8 +14,10 @@ define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out,
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
; GCN-NOT: buffer_load
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
@ -29,9 +31,12 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %ou
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
; GCN-NOT: buffer_load
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]

View File

@ -31,7 +31,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
; GCN-NOT: buffer_load
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN: store_dwordx2 v[{{[0-9:]+}}]
@ -44,7 +45,8 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out,
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
; GCN: buffer_load_dwordx4
; GCN-NOT: buffer_load
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN: store_dwordx2 v[{{[0-9:]+}}]
@ -58,8 +60,10 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
; GCN-NOT: buffer_load
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
@ -73,9 +77,12 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out,
; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
; GCN-NOT: buffer_load
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]

View File

@ -147,23 +147,26 @@ define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)*
; GCN-LABEL: no_extract_volatile_load_dynextract:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dword s12, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_load_dword s12, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1
; GCN-NEXT: s_cmp_eq_u32 s12, 1
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s12, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s12, 3
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 3
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm

View File

@ -57,6 +57,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(i16 addrspace(1
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_movk_i32 s2, 0x100
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
@ -69,7 +70,7 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(i16 addrspace(1
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0x100, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccz BB1_1
; GCN-NEXT: ; %bb.2: ; %bb2

View File

@ -9,9 +9,8 @@ define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
; SI-NEXT: ; implicit-def: $sgpr6_sgpr7
; SI-NEXT: ; implicit-def: $sgpr8_sgpr9
; SI-NEXT: s_branch BB0_3
; SI-NEXT: BB0_1: ; %Flow1
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[12:13]
; SI-NEXT: BB0_1: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: ; implicit-def: $sgpr14
; SI-NEXT: BB0_2: ; %Flow
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: s_and_b64 s[12:13], exec, s[8:9]
@ -20,13 +19,14 @@ define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec
; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz BB0_6
; SI-NEXT: s_cbranch_execz BB0_7
; SI-NEXT: BB0_3: ; %for.body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_cmp_lt_u32 s14, 4
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: s_or_b64 s[8:9], s[8:9], exec
; SI-NEXT: s_cmp_gt_u32 s14, 3
; SI-NEXT: v_cmp_lt_u32_e64 s[10:11], s14, 4
; SI-NEXT: s_cbranch_scc1 BB0_2
; SI-NEXT: s_cbranch_scc1 BB0_1
; SI-NEXT: ; %bb.4: ; %mid.loop
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: v_mov_b32_e32 v1, s14
@ -36,19 +36,21 @@ define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; SI-NEXT: s_mov_b64 s[8:9], -1
; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc
; SI-NEXT: s_cbranch_execz BB0_1
; SI-NEXT: ; %bb.5: ; %end.loop
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: s_add_i32 s14, s14, 1
; SI-NEXT: s_xor_b64 s[8:9], exec, -1
; SI-NEXT: s_branch BB0_1
; SI-NEXT: BB0_6: ; %for.end
; SI-NEXT: ; %bb.6: ; %Flow1
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[12:13]
; SI-NEXT: s_branch BB0_2
; SI-NEXT: BB0_7: ; %for.end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_and_saveexec_b64 s[0:1], s[6:7]
; SI-NEXT: s_cbranch_execz BB0_8
; SI-NEXT: ; %bb.7: ; %if
; SI-NEXT: s_cbranch_execz BB0_9
; SI-NEXT: ; %bb.8: ; %if
; SI-NEXT: exp mrt0 v0, v0, v0, v0 done vm
; SI-NEXT: BB0_8: ; %end
; SI-NEXT: BB0_9: ; %end
; SI-NEXT: s_endpgm
entry:
br label %for.body

View File

@ -1,7 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; SI-LABEL: {{^}}test_i64_eq:
; GCN-LABEL: {{^}}test_i64_eq:
; VI: s_cmp_eq_u64
; SI: v_cmp_eq_u64
define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp eq i64 %a, %b
@ -10,7 +11,8 @@ define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) n
ret void
}
; SI-LABEL: {{^}}test_i64_ne:
; GCN-LABEL: {{^}}test_i64_ne:
; VI: s_cmp_lg_u64
; SI: v_cmp_ne_u64
define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp ne i64 %a, %b
@ -19,8 +21,8 @@ define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) n
ret void
}
; SI-LABEL: {{^}}test_i64_slt:
; SI: v_cmp_lt_i64
; GCN-LABEL: {{^}}test_i64_slt:
; GCN: v_cmp_lt_i64
define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp slt i64 %a, %b
%result = sext i1 %cmp to i32
@ -28,8 +30,8 @@ define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b)
ret void
}
; SI-LABEL: {{^}}test_i64_ult:
; SI: v_cmp_lt_u64
; GCN-LABEL: {{^}}test_i64_ult:
; GCN: v_cmp_lt_u64
define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp ult i64 %a, %b
%result = sext i1 %cmp to i32
@ -37,8 +39,8 @@ define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b)
ret void
}
; SI-LABEL: {{^}}test_i64_sle:
; SI: v_cmp_le_i64
; GCN-LABEL: {{^}}test_i64_sle:
; GCN: v_cmp_le_i64
define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp sle i64 %a, %b
%result = sext i1 %cmp to i32
@ -46,8 +48,8 @@ define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b)
ret void
}
; SI-LABEL: {{^}}test_i64_ule:
; SI: v_cmp_le_u64
; GCN-LABEL: {{^}}test_i64_ule:
; GCN: v_cmp_le_u64
define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp ule i64 %a, %b
%result = sext i1 %cmp to i32
@ -55,8 +57,8 @@ define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b)
ret void
}
; SI-LABEL: {{^}}test_i64_sgt:
; SI: v_cmp_gt_i64
; GCN-LABEL: {{^}}test_i64_sgt:
; GCN: v_cmp_gt_i64
define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp sgt i64 %a, %b
%result = sext i1 %cmp to i32
@ -64,8 +66,8 @@ define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b)
ret void
}
; SI-LABEL: {{^}}test_i64_ugt:
; SI: v_cmp_gt_u64
; GCN-LABEL: {{^}}test_i64_ugt:
; GCN: v_cmp_gt_u64
define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp ugt i64 %a, %b
%result = sext i1 %cmp to i32
@ -73,8 +75,8 @@ define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b)
ret void
}
; SI-LABEL: {{^}}test_i64_sge:
; SI: v_cmp_ge_i64
; GCN-LABEL: {{^}}test_i64_sge:
; GCN: v_cmp_ge_i64
define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp sge i64 %a, %b
%result = sext i1 %cmp to i32
@ -82,8 +84,8 @@ define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b)
ret void
}
; SI-LABEL: {{^}}test_i64_uge:
; SI: v_cmp_ge_u64
; GCN-LABEL: {{^}}test_i64_uge:
; GCN: v_cmp_ge_u64
define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
%cmp = icmp uge i64 %a, %b
%result = sext i1 %cmp to i32

View File

@ -5,16 +5,16 @@
define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
; GFX9-LABEL: udiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
; GFX9-NEXT: s_sub_i32 s4, 0, s5
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX9-NEXT: s_sub_i32 s5, 0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@ -23,18 +23,18 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-NEXT: v_mul_lo_u32 v2, s3, v0
; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_lo_u32 v3, s4, v2
; GFX9-NEXT: v_mul_lo_u32 v3, s5, v2
; GFX9-NEXT: v_not_b32_e32 v5, v2
; GFX9-NEXT: v_mul_lo_u32 v5, s5, v5
; GFX9-NEXT: v_mul_lo_u32 v5, s4, v5
; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
; GFX9-NEXT: v_add_u32_e32 v3, s2, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v4, s2, v5
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: s_add_u32 s2, s2, 1
; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
@ -65,13 +65,13 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
; GFX10-NEXT: v_mul_hi_u32 v3, s2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2
; GFX10-NEXT: v_mul_lo_u32 v4, s5, v2
; GFX10-NEXT: v_not_b32_e32 v3, v2
; GFX10-NEXT: v_mul_lo_u32 v4, s5, v2
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v2
; GFX10-NEXT: v_mul_lo_u32 v3, s4, v3
; GFX10-NEXT: v_add_nc_u32_e32 v4, s2, v4
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4
; GFX10-NEXT: v_add_nc_u32_e32 v3, s2, v3
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4
; GFX10-NEXT: s_add_u32 s2, s2, 1
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
@ -211,37 +211,37 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s2, s3, 31
; GFX9-NEXT: s_add_i32 s3, s3, s2
; GFX9-NEXT: s_xor_b32 s4, s3, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX9-NEXT: s_sub_i32 s3, 0, s4
; GFX9-NEXT: s_xor_b32 s3, s3, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s4, 0, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: BB2_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mul_hi_u32 v2, s3, v0
; GFX9-NEXT: v_mul_lo_u32 v3, v2, s4
; GFX9-NEXT: v_mul_hi_u32 v2, s4, v0
; GFX9-NEXT: v_mul_lo_u32 v3, v2, s3
; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s3, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2
; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2
; GFX9-NEXT: s_add_i32 s3, s3, 1
; GFX9-NEXT: s_add_i32 s4, s4, 1
; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
; GFX9-NEXT: s_cbranch_scc0 BB2_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm

View File

@ -3,13 +3,17 @@
; GCN-LABEL: {{^}}float4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
@ -61,9 +65,11 @@ entry:
; GCN-LABEL: {{^}}float2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
@ -76,21 +82,29 @@ entry:
; GCN-LABEL: {{^}}float8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
; GCN-DAG: s_cselect_b64 [[CC5:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
; GCN-DAG: s_cselect_b64 [[CC6:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
; GCN-DAG: s_cselect_b64 [[CC7:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
; GCN-DAG: s_cselect_b64 [[CC8:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
@ -156,14 +170,14 @@ entry:
; GCN-LABEL: {{^}}half8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 1
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 2
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 3
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 4
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 5
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 6
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 7
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
@ -237,8 +251,8 @@ entry:
; GCN-LABEL: {{^}}byte16_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 15
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
@ -273,10 +287,12 @@ entry:
; GCN-LABEL: {{^}}double2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: s_cmp_eq_u32 [[IDX]], 0
; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
@ -372,10 +388,12 @@ entry:
; GCN-LABEL: {{^}}bit128_inselt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
; GCN-DAG: s_cmpk_lg_i32 {{s[0-9]+}}, 0x7f
; GCN-DAG: s_cselect_b64 [[CCL:[^,]+]], -1, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
entry:

View File

@ -405,10 +405,12 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)*
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_lg_u32 s4, 1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -423,10 +425,12 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)*
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT: s_cmp_lg_u32 s4, 1
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
@ -446,13 +450,16 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s10
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT: s_cmp_lg_u32 s4, 2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s4, 1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s8
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -467,13 +474,16 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT: s_cmp_lg_u32 s4, 2
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s4, 1
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s8
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT: s_endpgm
@ -493,16 +503,20 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT: s_cmp_lg_u32 s4, 3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s4, 2
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s10
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT: s_cmp_lg_u32 s4, 1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -517,16 +531,20 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT: s_cmp_lg_u32 s4, 3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s4, 2
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT: s_cmp_lg_u32 s4, 1
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
@ -546,28 +564,36 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)*
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT: s_cmp_lg_u32 s4, 3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s4, 2
; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT: s_cmp_lg_u32 s4, 1
; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: s_cmp_lg_u32 s4, 7
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT: s_cmp_lg_u32 s4, 6
; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v5, s14
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT: s_cmp_lg_u32 s4, 5
; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v5, s13
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT: s_cmp_lg_u32 s4, 4
; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v8, s12
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@ -583,28 +609,36 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)*
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT: s_cmp_lg_u32 s4, 3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s4, 2
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT: s_cmp_lg_u32 s4, 1
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT: s_cmp_lg_u32 s4, 7
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
; VI-NEXT: s_cmp_lg_u32 s4, 6
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v5, s14
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
; VI-NEXT: s_cmp_lg_u32 s4, 5
; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
; VI-NEXT: s_cmp_lg_u32 s4, 4
; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc
; VI-NEXT: v_mov_b32_e32 v8, s12
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@ -695,10 +729,12 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_lg_u32 s4, 1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -734,13 +770,16 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT: s_cmp_lg_u32 s4, 2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s4, 1
; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -780,17 +819,21 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3
; SI-NEXT: s_cmp_eq_u32 s6, 3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: s_cmp_eq_u32 s6, 2
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2
; SI-NEXT: s_cmp_eq_u32 s6, 1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1
; SI-NEXT: s_cmp_eq_u32 s6, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -833,28 +876,36 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT: s_cmp_lg_u32 s4, 3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s4, 2
; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT: s_cmp_lg_u32 s4, 1
; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: s_cmp_lg_u32 s4, 7
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s15
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT: s_cmp_lg_u32 s4, 6
; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s14
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT: s_cmp_lg_u32 s4, 5
; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s13
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT: s_cmp_lg_u32 s4, 4
; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@ -1247,96 +1298,112 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s5, s11, 24
; SI-NEXT: s_cmp_lg_u32 s4, 15
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_lshr_b32 s5, s11, 16
; SI-NEXT: s_cmp_lg_u32 s4, 14
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_lshr_b32 s6, s11, 8
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_movk_i32 s5, 0xff
; SI-NEXT: s_cmp_lg_u32 s4, 13
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: s_lshr_b32 s6, s11, 8
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s4, 12
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v2, s11
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v2, s5, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: s_mov_b32 s6, 0xffff
; SI-NEXT: s_lshr_b32 s7, s10, 24
; SI-NEXT: s_cmp_lg_u32 s4, 11
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s6, v1
; SI-NEXT: s_lshr_b32 s7, s10, 24
; SI-NEXT: v_or_b32_e32 v3, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_lshr_b32 s7, s10, 16
; SI-NEXT: s_cmp_lg_u32 s4, 10
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_lshr_b32 s7, s10, 8
; SI-NEXT: s_cmp_lg_u32 s4, 9
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: s_lshr_b32 s7, s10, 8
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
; SI-NEXT: s_cmp_lg_u32 s4, 8
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v2, s5, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: s_lshr_b32 s7, s9, 24
; SI-NEXT: s_cmp_lg_u32 s4, 7
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s6, v1
; SI-NEXT: s_lshr_b32 s7, s9, 24
; SI-NEXT: v_or_b32_e32 v2, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_lshr_b32 s7, s9, 16
; SI-NEXT: s_cmp_lg_u32 s4, 6
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_lshr_b32 s7, s9, 8
; SI-NEXT: s_cmp_lg_u32 s4, 5
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: s_lshr_b32 s7, s9, 8
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT: s_cmp_lg_u32 s4, 4
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s9
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v4, s5, v4
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: s_lshr_b32 s7, s8, 24
; SI-NEXT: s_cmp_lg_u32 s4, 3
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s6, v1
; SI-NEXT: s_lshr_b32 s7, s8, 24
; SI-NEXT: v_or_b32_e32 v1, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_lshr_b32 s7, s8, 16
; SI-NEXT: s_cmp_lg_u32 s4, 2
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: s_lshr_b32 s7, s8, 8
; SI-NEXT: s_cmp_lg_u32 s4, 1
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v4, s5, v4
; SI-NEXT: s_lshr_b32 s7, s8, 8
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v5, s8
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_and_b32_e32 v5, s5, v5
@ -1356,81 +1423,97 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s11, 24
; VI-NEXT: s_cmp_lg_u32 s4, 15
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_lshr_b32 s5, s11, 16
; VI-NEXT: s_cmp_lg_u32 s4, 14
; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_lshr_b32 s5, s11, 8
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: s_cmp_lg_u32 s4, 13
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_cmp_lg_u32 s4, 12
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s5, s10, 24
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_cmp_lg_u32 s4, 11
; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_lshr_b32 s5, s10, 16
; VI-NEXT: s_cmp_lg_u32 s4, 10
; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_lshr_b32 s5, s10, 8
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: s_cmp_lg_u32 s4, 9
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_cmp_lg_u32 s4, 8
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s5, s9, 24
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_cmp_lg_u32 s4, 7
; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_lshr_b32 s5, s9, 16
; VI-NEXT: s_cmp_lg_u32 s4, 6
; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_lshr_b32 s5, s9, 8
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: s_cmp_lg_u32 s4, 5
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_cmp_lg_u32 s4, 4
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s5, s8, 24
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_cmp_lg_u32 s4, 3
; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_lshr_b32 s5, s8, 16
; VI-NEXT: s_cmp_lg_u32 s4, 2
; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_lshr_b32 s5, s8, 8
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT: s_cmp_lg_u32 s4, 1
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@ -1528,12 +1611,14 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_eq_u32 s4, 1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: s_cmp_eq_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@ -1550,12 +1635,14 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT: s_cmp_eq_u32 s4, 1
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@ -1569,19 +1656,21 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT: s_load_dword s6, s[4:5], 0x8
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s6, 1
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: s_cmp_eq_u32 s6, 0
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
@ -1590,19 +1679,21 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %
;
; VI-LABEL: dynamic_insertelement_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_load_dword s6, s[4:5], 0x20
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s6, 1
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: s_cmp_eq_u32 s6, 0
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
@ -1625,16 +1716,19 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
; SI-NEXT: s_cmp_eq_u32 s12, 1
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: s_cmp_eq_u32 s12, 0
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_cmp_eq_u32 s12, 2
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v4, s6
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
@ -1652,17 +1746,20 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
; VI-NEXT: s_cmp_eq_u32 s12, 1
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: s_cmp_eq_u32 s12, 0
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_cmp_eq_u32 s12, 2
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
@ -1685,22 +1782,26 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT: s_cmp_eq_u32 s4, 1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: s_cmp_eq_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_cmp_eq_u32 s4, 3
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v5, s14
; SI-NEXT: s_cmp_eq_u32 s4, 2
; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
; SI-NEXT: v_mov_b32_e32 v5, s13
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
@ -1718,22 +1819,26 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT: s_cmp_eq_u32 s4, 1
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_cmp_eq_u32 s4, 3
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v5, s14
; VI-NEXT: s_cmp_eq_u32 s4, 2
; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc

View File

@ -86,7 +86,7 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
}
; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
%cmp = icmp eq i32 %i, 0
@ -119,7 +119,8 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
; SI-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
; SI-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0{{$}}
; SI-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
; SI: s_endpgm

View File

@ -298,8 +298,10 @@ define amdgpu_kernel void @v_icmp_i16_sle(i64 addrspace(1)* %out, i16 %src) {
}
; GCN-LABEL: {{^}}v_icmp_i1_ne0:
; GCN: v_cmp_gt_u32_e64 s[[C0:\[[0-9]+:[0-9]+\]]],
; GCN: v_cmp_gt_u32_e64 s[[C1:\[[0-9]+:[0-9]+\]]],
; GCN: s_cmp_gt_u32
; GCN: s_cselect_b64 s[[C0:\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: s_cmp_gt_u32
; GCN: s_cselect_b64 s[[C1:\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: s_and_b64 s[[SRC:\[[0-9]+:[0-9]+\]]], s[[C0]], s[[C1]]
; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1
; GCN-NEXT: v_mov_b32_e32

View File

@ -29,8 +29,7 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}}
; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16
; GCN: v_mov_b32_e32 [[V_APERTURE:v[0-9]+]], [[APERTURE]]
; GCN: v_cmp_eq_u32_e32 vcc, [[PTR_HI]], [[V_APERTURE]]
; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
%val = call i1 @llvm.amdgcn.is.private(i8* %ptr)

View File

@ -30,8 +30,7 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}}
; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}}
; GCN: v_mov_b32_e32 [[V_APERTURE:v[0-9]+]], [[APERTURE]]
; GCN: v_cmp_eq_u32_e32 vcc, [[PTR_HI]], [[V_APERTURE]]
; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
%val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)

View File

@ -265,7 +265,7 @@ define amdgpu_ps void @test_non_inline_imm_sgpr(float inreg %a) #0 {
}
; GCN-LABEL: {{^}}test_scc_liveness:
; GCN: v_cmp
; GCN: s_cmp
; GCN: s_and_b64 exec
; GCN: s_cmp
; GCN: s_cbranch_scc

View File

@ -246,9 +246,10 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX9-NEXT: s_addc_u32 s5, 0, s5
; GFX9-NEXT: s_add_i32 s1, s8, s7
; GFX9-NEXT: s_add_i32 s1, s1, s6
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_mul_i32 s2, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
@ -274,11 +275,12 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX10-NEXT: s_add_u32 s4, s3, s1
; GFX10-NEXT: s_addc_u32 s5, 0, s5
; GFX10-NEXT: s_add_i32 s1, s8, s7
; GFX10-NEXT: v_cmp_ne_u64_e64 s3, s[4:5], 0
; GFX10-NEXT: s_add_i32 s1, s1, s6
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, s3
; GFX10-NEXT: s_add_i32 s1, s1, s6
; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX10-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_endpgm
bb:
@ -306,10 +308,11 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-NEXT: v_mul_hi_u32 v1, s0, v1
; SI-NEXT: v_mul_hi_i32 v3, s1, v3
; SI-NEXT: s_mul_i32 s6, s1, s3
; SI-NEXT: s_mul_i32 s8, s0, s2
; SI-NEXT: s_cmp_lt_i32 s1, 0
; SI-NEXT: s_mul_i32 s1, s0, s2
; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v1
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v6, s8
; SI-NEXT: v_mov_b32_e32 v6, s1
; SI-NEXT: v_add_i32_e32 v5, vcc, s4, v5
; SI-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@ -319,14 +322,15 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v1
; SI-NEXT: v_subrev_i32_e32 v1, vcc, s2, v2
; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lt_i32 s3, 0
; SI-NEXT: v_ashrrev_i32_e32 v0, 31, v4
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s1, 0
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: v_subrev_i32_e32 v5, vcc, s0, v2
; SI-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v3, vcc
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s3, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
@ -356,7 +360,8 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX9-NEXT: s_addc_u32 s6, 0, s6
; GFX9-NEXT: s_sub_u32 s9, s4, s2
; GFX9-NEXT: s_subb_u32 s10, s6, 0
; GFX9-NEXT: v_cmp_lt_i32_e64 vcc, s1, 0
; GFX9-NEXT: s_cmp_lt_i32 s1, 0
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s10
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
@ -364,10 +369,11 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX9-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v2
; GFX9-NEXT: s_add_i32 s1, s8, s7
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
; GFX9-NEXT: s_cmp_lt_i32 s3, 0
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_add_i32 s1, s8, s7
; GFX9-NEXT: s_add_i32 s1, s1, s5
; GFX9-NEXT: v_cmp_lt_i32_e64 vcc, s3, 0
; GFX9-NEXT: s_ashr_i32 s4, s1, 31
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
@ -401,21 +407,23 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX10-NEXT: s_addc_u32 s6, 0, s6
; GFX10-NEXT: s_sub_u32 s9, s4, s2
; GFX10-NEXT: s_subb_u32 s10, s6, 0
; GFX10-NEXT: v_cmp_lt_i32_e64 vcc_lo, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s9
; GFX10-NEXT: v_mov_b32_e32 v1, s10
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_cmp_lt_i32 s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s10
; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT: s_cmp_lt_i32 s3, 0
; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX10-NEXT: s_add_i32 s1, s8, s7
; GFX10-NEXT: s_add_i32 s1, s1, s5
; GFX10-NEXT: v_cndmask_b32_e32 v0, s4, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, s6, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT: s_ashr_i32 s4, s1, 31
; GFX10-NEXT: s_mov_b32 s5, s4
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i32_e64 vcc_lo, s3, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo

View File

@ -16,12 +16,14 @@ define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s5
; SI-NEXT: s_andn2_b64 s[2:3], s[10:11], s[0:1]
; SI-NEXT: s_and_b32 s0, s11, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s5, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s5, 51
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s5, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s5, 51
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@ -155,12 +157,14 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
; SI-NEXT: s_brev_b32 s15, 1
; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1]
; SI-NEXT: s_and_b32 s0, s11, s15
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cmp_lt_i32 s14, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s14, 0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cmp_gt_i32 s14, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s14, 51
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@ -169,23 +173,25 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT: s_add_i32 s7, s0, s7
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT: s_brev_b32 s10, -2
; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v4, s11
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7
; SI-NEXT: v_bfi_b32 v4, s10, v6, v4
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT: s_and_b32 s0, s9, s15
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_cmp_lt_i32 s7, 0
; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s7, 51
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s7, 51
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@ -250,12 +256,14 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
; SI-NEXT: s_brev_b32 s20, 1
; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1]
; SI-NEXT: s_and_b32 s0, s7, s20
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cmp_lt_i32 s19, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s17
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s19, 0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cmp_gt_i32 s19, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s19, 51
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@ -268,40 +276,44 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
; SI-NEXT: s_brev_b32 s16, -2
; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_bfi_b32 v4, s16, v12, v4
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_bfi_b32 v4, s16, v12, v4
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1]
; SI-NEXT: s_and_b32 s0, s5, s20
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_cmp_lt_i32 s17, 0
; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s17, 0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cmp_gt_i32 s17, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s17, 51
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT: s_add_i32 s6, s0, s18
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1]
; SI-NEXT: v_bfi_b32 v6, s16, v12, v6
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: s_and_b32 s0, s11, s20
; SI-NEXT: v_bfi_b32 v6, s16, v12, v6
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc
; SI-NEXT: v_mov_b32_e32 v5, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s5
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0
; SI-NEXT: v_mov_b32_e32 v5, s0
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v5, s11
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s6, 51
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1]
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
@ -309,22 +321,24 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1]
; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT: v_mov_b32_e32 v10, s11
; SI-NEXT: s_add_i32 s4, s0, s18
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: v_mov_b32_e32 v10, s11
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: v_bfi_b32 v10, s16, v12, v10
; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc
; SI-NEXT: v_mov_b32_e32 v6, 0
; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT: s_and_b32 s0, s9, s20
; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc
; SI-NEXT: v_mov_b32_e32 v6, 0
; SI-NEXT: s_cmp_lt_i32 s4, 0
; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
; SI-NEXT: v_mov_b32_e32 v5, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s4, 0
; SI-NEXT: v_mov_b32_e32 v5, s0
; SI-NEXT: s_cmp_gt_i32 s4, 51
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, 51
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1]
; SI-NEXT: v_mov_b32_e32 v4, s2
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
@ -405,15 +419,17 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014
; SI-NEXT: s_add_i32 s26, s2, s23
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26
; SI-NEXT: s_brev_b32 s27, 1
; SI-NEXT: s_brev_b32 s28, 1
; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3]
; SI-NEXT: s_and_b32 s2, s7, s27
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: s_and_b32 s2, s7, s28
; SI-NEXT: s_cmp_lt_i32 s26, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s25
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s26, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: s_cmp_gt_i32 s26, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s26, 51
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s24
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@ -421,24 +437,26 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014
; SI-NEXT: s_add_i32 s25, s2, s23
; SI-NEXT: s_add_i32 s24, s2, s23
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT: s_brev_b32 s24, -2
; SI-NEXT: v_mov_b32_e32 v18, 0x3ff00000
; SI-NEXT: s_brev_b32 s29, -2
; SI-NEXT: v_mov_b32_e32 v14, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_bfi_b32 v4, s24, v18, v4
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s25
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24
; SI-NEXT: v_bfi_b32 v4, s29, v14, v4
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3]
; SI-NEXT: s_and_b32 s2, s5, s28
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3]
; SI-NEXT: s_and_b32 s2, s5, s27
; SI-NEXT: s_cmp_lt_i32 s24, 0
; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s25, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: s_cmp_gt_i32 s24, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s25, 51
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
@ -446,22 +464,24 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3]
; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: s_add_i32 s6, s2, s23
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: v_bfi_b32 v6, s24, v18, v6
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT: v_bfi_b32 v6, s29, v14, v6
; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3]
; SI-NEXT: s_and_b32 s2, s11, s28
; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3]
; SI-NEXT: s_and_b32 s2, s11, s27
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s5
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v5, s11
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3]
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
@ -469,116 +489,126 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3]
; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014
; SI-NEXT: v_mov_b32_e32 v8, s11
; SI-NEXT: s_add_i32 s6, s2, s23
; SI-NEXT: v_mov_b32_e32 v8, s11
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: v_bfi_b32 v8, s24, v18, v8
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT: v_bfi_b32 v8, s29, v14, v8
; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3]
; SI-NEXT: s_and_b32 s2, s9, s28
; SI-NEXT: v_cndmask_b32_e32 v7, 0, v8, vcc
; SI-NEXT: v_mov_b32_e32 v6, 0
; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3]
; SI-NEXT: s_and_b32 s2, s9, s27
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v4, s5
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3]
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT: v_mov_b32_e32 v8, s8
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[2:3]
; SI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5]
; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014
; SI-NEXT: s_add_i32 s4, s2, s23
; SI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5]
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4
; SI-NEXT: v_mov_b32_e32 v10, s9
; SI-NEXT: s_add_i32 s6, s2, s23
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
; SI-NEXT: v_bfi_b32 v10, s24, v18, v10
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT: s_andn2_b64 s[24:25], s[14:15], s[2:3]
; SI-NEXT: s_and_b32 s2, s15, s28
; SI-NEXT: v_bfi_b32 v10, s29, v14, v10
; SI-NEXT: s_cmp_lt_i32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc
; SI-NEXT: v_mov_b32_e32 v8, 0
; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[2:3]
; SI-NEXT: s_and_b32 s2, s15, s27
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s4, 51
; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[8:9]
; SI-NEXT: v_mov_b32_e32 v9, s2
; SI-NEXT: v_mov_b32_e32 v8, s5
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0
; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
; SI-NEXT: v_mov_b32_e32 v9, s15
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51
; SI-NEXT: v_cndmask_b32_e64 v13, v8, v9, s[2:3]
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
; SI-NEXT: v_mov_b32_e32 v9, s14
; SI-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[2:3]
; SI-NEXT: s_bfe_u32 s2, s13, 0xb0014
; SI-NEXT: s_add_i32 s8, s2, s23
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s8
; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[2:3]
; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014
; SI-NEXT: s_add_i32 s10, s2, s23
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s10
; SI-NEXT: v_mov_b32_e32 v8, s15
; SI-NEXT: s_andn2_b64 s[6:7], s[18:19], s[2:3]
; SI-NEXT: s_and_b32 s2, s19, s27
; SI-NEXT: v_bfi_b32 v19, s24, v18, v8
; SI-NEXT: v_mov_b32_e32 v9, s2
; SI-NEXT: v_mov_b32_e32 v8, s7
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0
; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014
; SI-NEXT: s_add_i32 s6, s4, s23
; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s6
; SI-NEXT: s_andn2_b64 s[26:27], s[12:13], s[4:5]
; SI-NEXT: s_and_b32 s4, s13, s28
; SI-NEXT: v_mov_b32_e32 v8, s25
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_cndmask_b32_e32 v15, v8, v9, vcc
; SI-NEXT: v_mov_b32_e32 v9, s4
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: s_cselect_b64 s[6:7], -1, 0
; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014
; SI-NEXT: s_add_i32 s25, s8, s23
; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s25
; SI-NEXT: s_andn2_b64 s[10:11], s[18:19], s[8:9]
; SI-NEXT: s_and_b32 s8, s19, s28
; SI-NEXT: v_mov_b32_e32 v8, s27
; SI-NEXT: s_cmp_lt_i32 s25, 0
; SI-NEXT: v_cndmask_b32_e64 v17, v8, v9, s[4:5]
; SI-NEXT: v_mov_b32_e32 v9, s8
; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
; SI-NEXT: v_mov_b32_e32 v8, s11
; SI-NEXT: s_cmp_gt_i32 s25, 51
; SI-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9]
; SI-NEXT: v_mov_b32_e32 v10, s10
; SI-NEXT: v_mov_b32_e32 v9, s19
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51
; SI-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[2:3]
; SI-NEXT: v_mov_b32_e32 v8, s6
; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[10:11]
; SI-NEXT: v_cndmask_b32_e64 v8, v10, 0, s[8:9]
; SI-NEXT: v_mov_b32_e32 v10, s18
; SI-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[2:3]
; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014
; SI-NEXT: s_add_i32 s10, s2, s23
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s10
; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[2:3]
; SI-NEXT: s_and_b32 s2, s17, s27
; SI-NEXT: v_mov_b32_e32 v11, s2
; SI-NEXT: v_mov_b32_e32 v10, s7
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0
; SI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014
; SI-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[10:11]
; SI-NEXT: s_add_i32 s10, s8, s23
; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10
; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9]
; SI-NEXT: s_and_b32 s8, s17, s28
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: v_mov_b32_e32 v11, s8
; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
; SI-NEXT: v_mov_b32_e32 v10, s21
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[8:9]
; SI-NEXT: v_mov_b32_e32 v11, s17
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51
; SI-NEXT: v_cndmask_b32_e64 v15, v10, v11, s[2:3]
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[10:11]
; SI-NEXT: v_mov_b32_e32 v10, s20
; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[8:9]
; SI-NEXT: v_mov_b32_e32 v11, s16
; SI-NEXT: v_cndmask_b32_e64 v14, v10, v11, s[2:3]
; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15]
; SI-NEXT: v_mov_b32_e32 v17, s19
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
; SI-NEXT: v_cndmask_b32_e64 v12, v10, v11, s[10:11]
; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[12:13]
; SI-NEXT: v_mov_b32_e32 v19, s17
; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5
; SI-NEXT: v_mov_b32_e32 v10, s19
; SI-NEXT: v_bfi_b32 v20, s29, v14, v10
; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9]
; SI-NEXT: v_mov_b32_e32 v16, s17
; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5
; SI-NEXT: v_bfi_b32 v17, s24, v18, v17
; SI-NEXT: v_cndmask_b32_e64 v11, 0, v17, s[2:3]
; SI-NEXT: v_bfi_b32 v19, s29, v14, v19
; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
; SI-NEXT: v_mov_b32_e32 v10, 0
; SI-NEXT: v_bfi_b32 v16, s24, v18, v16
; SI-NEXT: v_cndmask_b32_e64 v11, 0, v20, s[10:11]
; SI-NEXT: v_add_f64 v[10:11], v[8:9], v[10:11]
; SI-NEXT: v_cndmask_b32_e32 v9, 0, v16, vcc
; SI-NEXT: v_cndmask_b32_e64 v9, 0, v19, s[8:9]
; SI-NEXT: v_mov_b32_e32 v8, 0
; SI-NEXT: s_and_b32 s9, s13, s27
; SI-NEXT: v_add_f64 v[8:9], v[14:15], v[8:9]
; SI-NEXT: v_mov_b32_e32 v14, s5
; SI-NEXT: v_mov_b32_e32 v15, s9
; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0
; SI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc
; SI-NEXT: v_add_f64 v[8:9], v[12:13], v[8:9]
; SI-NEXT: v_mov_b32_e32 v12, s24
; SI-NEXT: v_mov_b32_e32 v16, s15
; SI-NEXT: v_cndmask_b32_e64 v13, v15, v16, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc
; SI-NEXT: v_mov_b32_e32 v15, s14
; SI-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[2:3]
; SI-NEXT: v_mov_b32_e32 v15, s15
; SI-NEXT: v_bfi_b32 v19, s29, v14, v15
; SI-NEXT: v_mov_b32_e32 v15, s26
; SI-NEXT: v_mov_b32_e32 v18, s13
; SI-NEXT: v_cndmask_b32_e64 v15, v15, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v16, s12
; SI-NEXT: v_cndmask_b32_e64 v16, v15, v16, s[6:7]
; SI-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[6:7]
; SI-NEXT: v_mov_b32_e32 v15, s13
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s8, 51
; SI-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[2:3]
; SI-NEXT: v_mov_b32_e32 v14, s4
; SI-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc
; SI-NEXT: v_mov_b32_e32 v15, s12
; SI-NEXT: v_cndmask_b32_e64 v16, v14, v15, s[2:3]
; SI-NEXT: v_mov_b32_e32 v14, s13
; SI-NEXT: v_bfi_b32 v18, s24, v18, v14
; SI-NEXT: v_bfi_b32 v18, s29, v14, v15
; SI-NEXT: v_add_f64 v[14:15], s[12:13], -v[16:17]
; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5

View File

@ -8,7 +8,7 @@
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: v_cmp_eq_u32
; GCN: s_cmp_eq_u32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32

View File

@ -384,7 +384,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i
; FUNC-LABEL: @v_test_umin_ult_i32_multi_use
; SI-NOT: v_min
; GCN: v_cmp_lt_u32
; GCN: s_cmp_lt_u32
; SI-NOT: v_min
; SI: v_cndmask_b32
; SI-NOT: v_min

View File

@ -19,9 +19,9 @@ body: |
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
; GCN: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
; GCN: V_CMP_NE_U64_e32 0, [[REG_SEQUENCE]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -63,9 +63,9 @@ body: |
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
; GCN: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
; GCN: V_CMP_NE_U64_e32 0, [[REG_SEQUENCE1]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -107,9 +107,9 @@ body: |
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
; GCN: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
; GCN: V_CMP_NE_U64_e32 0, [[REG_SEQUENCE1]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -147,9 +147,9 @@ body: |
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
; GCN: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
; GCN: V_CMP_NE_U64_e32 0, [[REG_SEQUENCE]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -191,9 +191,9 @@ body: |
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
; GCN: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
; GCN: V_CMP_NE_U64_e32 0, [[REG_SEQUENCE1]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -234,9 +234,9 @@ body: |
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
; GCN: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
; GCN: V_CMP_NE_U64_e32 0, [[REG_SEQUENCE1]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -275,9 +275,9 @@ body: |
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
; GCN: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
; GCN: V_CMP_NE_U64_e32 0, [[REG_SEQUENCE]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -316,9 +316,9 @@ body: |
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
; GCN: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
; GCN: V_CMP_NE_U64_e32 0, [[REG_SEQUENCE]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
; GCN: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -352,9 +352,9 @@ body: |
; GCN: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %6, %bb.1
; GCN: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI]], 1, implicit $exec
; GCN: V_CMP_NE_U32_e32 0, [[V_AND_B32_e64_]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_AND_B32_e64_]], 0, implicit $exec
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0
@ -388,9 +388,9 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN: SCRATCH_STORE_DWORD [[DEF]], [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
; GCN: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI]], 1, implicit $exec
; GCN: V_CMP_NE_U32_e32 0, [[V_AND_B32_e64_]], implicit-def $vcc, implicit $exec
; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_AND_B32_e64_]], 0, implicit $exec
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]], implicit $exec
; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
; GCN: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; GCN: bb.2:
; GCN: S_ENDPGM 0

View File

@ -35,7 +35,8 @@ bb4:
}
; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
; GCN: v_cmp_ne_u32_e64 [[CC1:[^,]+]],
; GCN: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN: s_branch [[BB1:BB[0-9]+_[0-9]+]]
; GCN: [[BB0:BB[0-9]+_[0-9]+]]
; GCN-NOT: v_cndmask_b32

View File

@ -262,7 +262,11 @@ define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in
}
; FUNC-LABEL: {{^}}s_or_i1:
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
; SI: s_cmp_eq_u32
; SI: s_cselect_b64 [[C1:[^,]+]], -1, 0
; SI: s_cmp_eq_u32
; SI: s_cselect_b64 [[C2:[^,]+]], -1, 0
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], [[C1]], [[C2]]
define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d

View File

@ -133,8 +133,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out,
}
; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: s_cmp_gt_u32 s{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
@ -254,7 +254,7 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
; GCN: s_load_dword
; GCN: s_bfe_u32
; GCN-DAG: s_bfe_u32
; GCN-DAG: s_sub_i32
; GCN-DAG: s_and_b32
; GCN-DAG: s_sub_i32
@ -273,8 +273,8 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %
}
; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1:
; GCN: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {

View File

@ -111,14 +111,15 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: v_cmp_lt_i32_e64 s[10:11], s9, 0
; SI-NEXT: s_add_i32 s9, s8, s9
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_add_i32 s12, s8, s9
; SI-NEXT: s_cmp_lt_i32 s9, 0
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: s_cmp_lt_i32 s12, s8
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cmp_lt_i32_e32 vcc, s9, v0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9]
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
@ -134,13 +135,14 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s1, 0
; VI-NEXT: s_add_i32 s1, s0, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_cmp_lt_i32_e32 vcc, s1, v4
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: s_add_i32 s4, s0, s1
; VI-NEXT: s_cmp_lt_i32 s1, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: s_cmp_lt_i32 s4, s0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
@ -548,42 +550,42 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_i32 v5, v0, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_add_i32 v2, v1, v3 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: v_add_u32_e32 v5, v1, v3
; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
; GFX9-NEXT: v_add_u32_e32 v4, v0, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1
; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v5
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[4:5]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v5, v1, v3 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_add_nc_i32 v6, v0, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v3, v0, v2
; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1]
; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
%a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
%b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4

View File

@ -466,76 +466,75 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
; TONGA-NEXT: s_mov_b32 s2, s6
; TONGA-NEXT: s_mov_b32 s3, s7
; TONGA-NEXT: s_mov_b32 s10, s6
; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s0, s10
; TONGA-NEXT: s_mov_b32 s1, s11
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; TONGA-NEXT: s_mov_b32 s8, s2
; TONGA-NEXT: s_mov_b32 s9, s3
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe
; TONGA-NEXT: s_mov_b32 s4, s8
; TONGA-NEXT: s_mov_b32 s5, s9
; TONGA-NEXT: s_mov_b32 s4, s0
; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v2
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v4
; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v5, v2
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3
; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5
; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7
; TONGA-NEXT: v_xor_b32_e32 v8, v4, v5
; TONGA-NEXT: v_cvt_f32_u32_e32 v5, v2
; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v0
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; TONGA-NEXT: v_xor_b32_e32 v9, v6, v7
; TONGA-NEXT: v_cvt_f32_u32_e32 v7, v3
; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v2
; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v7
; TONGA-NEXT: v_xor_b32_e32 v4, v7, v4
; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v3
; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; TONGA-NEXT: v_mul_f32_e32 v5, s2, v5
; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
; TONGA-NEXT: v_mul_lo_u32 v6, v6, v5
; TONGA-NEXT: v_mul_hi_u32 v6, v5, v6
; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v5
; TONGA-NEXT: v_mul_hi_u32 v5, v0, v5
; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v3
; TONGA-NEXT: v_mul_lo_u32 v8, v5, v2
; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5
; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v2, v0
; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[0:1]
; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v5
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; TONGA-NEXT: s_mov_b64 s[0:1], vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v3
; TONGA-NEXT: v_xor_b32_e32 v2, v0, v6
; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v2
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v3
; TONGA-NEXT: v_xor_b32_e32 v6, v3, v6
; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1]
; TONGA-NEXT: v_mul_f32_e32 v0, s2, v0
; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
; TONGA-NEXT: v_mul_lo_u32 v9, v9, v0
; TONGA-NEXT: v_mul_hi_u32 v7, v0, v9
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; TONGA-NEXT: v_mul_hi_u32 v3, v1, v0
; TONGA-NEXT: v_xor_b32_e32 v0, v5, v4
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
; TONGA-NEXT: v_mul_lo_u32 v4, v3, v2
; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v4, v1
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v2
; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v2, v1
; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; TONGA-NEXT: v_mul_f32_e32 v7, s2, v7
; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1
; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4
; TONGA-NEXT: v_mul_lo_u32 v11, v11, v7
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6
; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v6
; TONGA-NEXT: v_mul_hi_u32 v4, v5, v10
; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11
; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4
; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v7
; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2
; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3
; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5
; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v6, v0
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v10, v1
; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0
; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1
; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4
; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v5
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; TONGA-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9
; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
@ -931,137 +930,134 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
; TONGA-NEXT: s_mov_b32 s2, s6
; TONGA-NEXT: s_mov_b32 s3, s7
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s11, 0xf000
; TONGA-NEXT: s_mov_b32 s10, -1
; TONGA-NEXT: s_mov_b32 s6, s10
; TONGA-NEXT: s_mov_b32 s7, s11
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s0, s10
; TONGA-NEXT: s_mov_b32 s1, s11
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; TONGA-NEXT: s_mov_b32 s10, 0x4f7ffffe
; TONGA-NEXT: s_mov_b32 s4, s8
; TONGA-NEXT: s_mov_b32 s5, s9
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: s_mov_b32 s4, s2
; TONGA-NEXT: s_mov_b32 s5, s3
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe
; TONGA-NEXT: s_mov_b32 s8, s0
; TONGA-NEXT: s_mov_b32 s9, s1
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5
; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11
; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9
; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9
; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v5
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
; TONGA-NEXT: v_cvt_f32_u32_e32 v4, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v2
; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v4
; TONGA-NEXT: v_mul_f32_e32 v4, s10, v4
; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v4
; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
; TONGA-NEXT: v_mul_lo_u32 v10, v4, v9
; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
; TONGA-NEXT: v_mul_hi_u32 v10, v9, v10
; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v9
; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4
; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v9
; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6
; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v4
; TONGA-NEXT: v_add_u32_e32 v4, vcc, v11, v4
; TONGA-NEXT: v_xor_b32_e32 v4, v4, v11
; TONGA-NEXT: v_mul_hi_u32 v9, v4, v9
; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8
; TONGA-NEXT: v_mul_lo_u32 v12, v9, v0
; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v9
; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v12
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0
; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v4, v0
; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[0:1]
; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v9
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0
; TONGA-NEXT: s_mov_b64 s[0:1], vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v10, v1
; TONGA-NEXT: v_xor_b32_e32 v1, v0, v10
; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v1
; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v5
; TONGA-NEXT: v_add_u32_e32 v5, vcc, v4, v5
; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
; TONGA-NEXT: v_xor_b32_e32 v5, v5, v4
; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[0:1]
; TONGA-NEXT: v_xor_b32_e32 v4, v4, v10
; TONGA-NEXT: v_mul_f32_e32 v0, s10, v0
; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v6
; TONGA-NEXT: v_mul_lo_u32 v13, v13, v0
; TONGA-NEXT: v_mul_hi_u32 v11, v0, v13
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v11, v0
; TONGA-NEXT: v_mul_hi_u32 v11, v5, v0
; TONGA-NEXT: v_xor_b32_e32 v0, v9, v8
; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0
; TONGA-NEXT: v_mul_lo_u32 v8, v11, v1
; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v11
; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v5, v8
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v5, v1
; TONGA-NEXT: v_cndmask_b32_e64 v8, v11, v9, s[0:1]
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v5, v1
; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v8
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
; TONGA-NEXT: s_mov_b64 s[0:1], vcc
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v14, v2
; TONGA-NEXT: v_xor_b32_e32 v2, v1, v14
; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v2
; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v2
; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8
; TONGA-NEXT: v_mul_f32_e32 v9, s2, v9
; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13
; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11
; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v6
; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9
; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2
; TONGA-NEXT: v_mul_f32_e32 v8, s2, v8
; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12
; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5
; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11
; TONGA-NEXT: v_mul_lo_u32 v12, v12, v9
; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10
; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4
; TONGA-NEXT: v_mul_lo_u32 v10, v10, v8
; TONGA-NEXT: v_mul_hi_u32 v12, v9, v12
; TONGA-NEXT: v_mul_f32_e32 v11, s2, v11
; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11
; TONGA-NEXT: v_mul_hi_u32 v10, v8, v10
; TONGA-NEXT: v_add_u32_e32 v9, vcc, v12, v9
; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v6
; TONGA-NEXT: v_mul_lo_u32 v12, v12, v11
; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8
; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7
; TONGA-NEXT: v_mul_hi_u32 v12, v11, v12
; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7
; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14
; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v7
; TONGA-NEXT: v_add_u32_e32 v11, vcc, v12, v11
; TONGA-NEXT: v_mul_lo_u32 v12, v8, v4
; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9
; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10
; TONGA-NEXT: v_mul_hi_u32 v11, v2, v11
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12
; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1]
; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4
; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
; TONGA-NEXT: v_mul_lo_u32 v0, v9, v5
; TONGA-NEXT: v_mul_f32_e32 v10, s2, v10
; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v10
; TONGA-NEXT: v_mul_lo_u32 v10, v11, v6
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v9
; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10
; TONGA-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3]
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v0, v5
; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v11
; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8
; TONGA-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5]
; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v2, v6
; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3]
; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v1
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
; TONGA-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc
; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1]
; TONGA-NEXT: v_xor_b32_e32 v1, v8, v15
; TONGA-NEXT: v_xor_b32_e32 v5, v0, v16
; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v1
; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v5
; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v7
; TONGA-NEXT: v_mul_lo_u32 v5, v5, v4
; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v3
; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1
; TONGA-NEXT: v_mul_f32_e32 v1, s10, v1
; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1
; TONGA-NEXT: v_mul_lo_u32 v5, v5, v1
; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v5, v1
; TONGA-NEXT: v_add_u32_e32 v5, vcc, v10, v6
; TONGA-NEXT: v_xor_b32_e32 v5, v5, v10
; TONGA-NEXT: v_mul_hi_u32 v6, v5, v1
; TONGA-NEXT: v_xor_b32_e32 v1, v8, v4
; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v4, v1
; TONGA-NEXT: v_xor_b32_e32 v10, v10, v14
; TONGA-NEXT: v_mul_lo_u32 v4, v6, v2
; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6
; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v5, v4
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v2
; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v4, v2
; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; TONGA-NEXT: s_mov_b64 s[0:1], vcc
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v9, v3
; TONGA-NEXT: v_xor_b32_e32 v3, v2, v9
; TONGA-NEXT: v_cvt_f32_u32_e32 v2, v3
; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v3
; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1]
; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v7
; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
; TONGA-NEXT: v_add_u32_e32 v7, vcc, v4, v7
; TONGA-NEXT: v_xor_b32_e32 v9, v4, v9
; TONGA-NEXT: v_xor_b32_e32 v4, v7, v4
; TONGA-NEXT: v_mul_f32_e32 v2, s10, v2
; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
; TONGA-NEXT: v_mul_lo_u32 v8, v8, v2
; TONGA-NEXT: v_mul_hi_u32 v6, v2, v8
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2
; TONGA-NEXT: v_mul_hi_u32 v6, v4, v2
; TONGA-NEXT: v_xor_b32_e32 v2, v5, v10
; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v10, v2
; TONGA-NEXT: v_mul_lo_u32 v5, v6, v3
; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v6
; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v3
; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v7, s[0:1]
; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v4, v3
; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3
; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v9, v3
; TONGA-NEXT: v_xor_b32_e32 v3, v3, v9
; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v9, v3
; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: v_mul_hi_u32 v5, v4, v5
; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5]
; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v10
; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc
; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17
; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7
; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2
; TONGA-NEXT: v_xor_b32_e32 v6, v9, v14
; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5
; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4
; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v7
; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v3, v7
; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6
; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3
; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v4i32:

View File

@ -11,12 +11,13 @@ define amdgpu_kernel void @select_constant_cttz(i32 addrspace(1)* noalias %out,
; GCN-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s0, 1, s2
; GCN-NEXT: s_ff1_i32_b32 s0, s0
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[2:3]
; GCN-NEXT: v_ffbh_i32_e32 v1, v0
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0

View File

@ -166,7 +166,8 @@ define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, |[[X]]|, [[VCC]]
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
@ -185,7 +186,8 @@ define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xc4800000
; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
@ -221,7 +223,8 @@ define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
@ -407,7 +410,7 @@ define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
; GCN-LABEL: {{^}}add_select_negk_negk_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cmp_eq_u32_e64
; GCN: s_cmp_eq_u32
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
@ -424,7 +427,7 @@ define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc5800000
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cmp_eq_u32_e64
; GCN: s_cmp_eq_u32
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
@ -455,7 +458,8 @@ define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
@ -490,7 +494,8 @@ define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
@ -632,7 +637,8 @@ define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN-DAG: v_cmp_eq_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
@ -651,7 +657,8 @@ define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
@ -688,7 +695,8 @@ define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cmp_ne_u32_e64 vcc
; GCN: s_cmp_lg_u32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {

View File

@ -5,9 +5,11 @@
; scalar compares, we don't want to use multiple condition registers.
; GCN-LABEL: {{^}}opt_select_i32_and_cmp_i32:
; GCN-DAG: v_cmp_ne_u32_e32 vcc,
; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
; GCN: s_and_b64 vcc, vcc, [[CMP1]]
; GCN-DAG: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN-DAG: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
@ -37,9 +39,11 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, fl
}
; GCN-LABEL: {{^}}opt_select_i64_and_cmp_i32:
; GCN-DAG: v_cmp_ne_u32_e32 vcc,
; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
; GCN: s_and_b64 vcc, vcc, [[CMP1]]
; GCN-DAG: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN-DAG: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]]
; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
@ -69,9 +73,11 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, fl
}
; GCN-LABEL: {{^}}opt_select_i32_or_cmp_i32:
; GCN-DAG: v_cmp_ne_u32_e32 vcc,
; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
; GCN: s_or_b64 vcc, vcc, [[CMP1]]
; GCN-DAG: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN-DAG: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
@ -102,9 +108,11 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, flo
}
; GCN-LABEL: {{^}}opt_select_i64_or_cmp_i32:
; GCN-DAG: v_cmp_ne_u32_e32 vcc,
; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
; GCN: s_or_b64 vcc, vcc, [[CMP1]]
; GCN-DAG: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN-DAG: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]]
; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}

View File

@ -183,11 +183,12 @@ define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32
; GCN-LABEL: {{^}}v_select_v4i32:
; GCN: buffer_load_dwordx4
; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
bb:
@ -221,7 +222,7 @@ define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32>
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
@ -235,7 +236,7 @@ define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x f
}
; GCN-LABEL: {{^}}s_select_v3f32:
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
@ -252,7 +253,7 @@ define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x f
; GCN-LABEL: {{^}}s_select_v4f32:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx4
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
@ -269,11 +270,12 @@ define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x f
; GCN-LABEL: {{^}}v_select_v4f32:
; GCN: buffer_load_dwordx4
; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
bb:
@ -285,7 +287,7 @@ bb:
}
; GCN-LABEL: {{^}}s_select_v5f32:
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

View File

@ -68,7 +68,7 @@ entry:
}
; FUNC-LABEL: {{^}}selectcc_bool:
; SI: v_cmp_ne_u32
; SI: s_cmp_lg_u32
; SI: v_cndmask_b32_e64
; SI-NOT: cmp
; SI-NOT: cndmask

View File

@ -1,6 +1,6 @@
; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI -check-prefix=FUNC %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}selectcc_i64:
; EG: XOR_INT
@ -9,8 +9,9 @@
; EG: CNDE_INT
; EG: CNDE_INT
; SI: v_cmp_eq_u64
; SI: v_cndmask
; SI: v_cndmask
; VI: s_cmp_eq_u64
; GCN: v_cndmask
; GCN: v_cndmask
define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
entry:
%0 = icmp eq i64 %lhs, %rhs

View File

@ -4,8 +4,9 @@
; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
; GCN-NOT: v_cmp
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN-NEXT:buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
@ -21,8 +22,9 @@ define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i3
; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0:
; GCN-NOT: v_cmp
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
@ -38,8 +40,9 @@ define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i3
; FUNC-LABEL: {{^}}sext_bool_icmp_eq_neg1:
; GCN-NOT: v_cmp
; GCN: v_cmp_eq_u32_e32 vcc,
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_eq_u32
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@ -52,8 +55,9 @@ define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a,
; FUNC-LABEL: {{^}}sext_bool_icmp_ne_neg1:
; GCN-NOT: v_cmp
; GCN: v_cmp_eq_u32_e32 vcc,
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_eq_u32
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@ -66,8 +70,9 @@ define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a,
; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0:
; GCN-NOT: v_cmp
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@ -80,8 +85,9 @@ define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i3
; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0:
; GCN-NOT: v_cmp
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_lg_u32
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@ -94,8 +100,9 @@ define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i3
; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1:
; GCN-NOT: v_cmp
; GCN: v_cmp_eq_u32_e32 vcc,
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_eq_u32
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@ -108,8 +115,9 @@ define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i3
; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1:
; GCN-NOT: v_cmp
; GCN: v_cmp_eq_u32_e32 vcc,
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_eq_u32
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN-NEXT: buffer_store_byte [[RESULT]]
define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
%icmp0 = icmp ne i32 %a, %b
@ -149,14 +157,16 @@ define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a,
; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff
; GCN-DAG: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]]
; SI: v_cmp_ne_u32_e32 vcc, [[B]], [[VK255]]
; SI: s_cmp_lg_u32 [[B]], [[K255]]
; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0
; VI-DAG: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]]
; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]]
; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]]
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; VI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
@ -200,9 +210,9 @@ define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind {
; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff
; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]]
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; GCN: v_cmp_ne_u32_e32 vcc, [[B]], [[VK]]{{$}}
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_lg_u32 [[B]], [[K]]{{$}}
; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {

View File

@ -7,8 +7,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
; GCN: v_cmp_eq_u32_e32
; GCN: v_cmp_eq_u32_e32
; GCN: s_cmp_eq_u32
; GCN: s_cmp_eq_u32
define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%result = icmp eq <2 x i32> %a, %b
%sext = sext <2 x i1> %result to <2 x i32>
@ -22,10 +22,10 @@ define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; GCN: v_cmp_eq_u32_e32
; GCN: v_cmp_eq_u32_e32
; GCN: v_cmp_eq_u32_e32
; GCN: v_cmp_eq_u32_e32
; GCN: s_cmp_eq_u32
; GCN: s_cmp_eq_u32
; GCN: s_cmp_eq_u32
; GCN: s_cmp_eq_u32
define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
@ -227,7 +227,7 @@ entry:
; FUNC-LABEL: {{^}}i32_eq:
; R600: SETE_INT
; GCN: v_cmp_eq_u32
; GCN: s_cmp_eq_u32
define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp eq i32 %a, %b
@ -238,7 +238,7 @@ entry:
; FUNC-LABEL: {{^}}i32_ne:
; R600: SETNE_INT
; GCN: v_cmp_ne_u32
; GCN: s_cmp_lg_u32
define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp ne i32 %a, %b
@ -249,7 +249,7 @@ entry:
; FUNC-LABEL: {{^}}i32_ugt:
; R600: SETGT_UINT
; GCN: v_cmp_gt_u32
; GCN: s_cmp_gt_u32
define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp ugt i32 %a, %b
@ -260,7 +260,7 @@ entry:
; FUNC-LABEL: {{^}}i32_uge:
; R600: SETGE_UINT
; GCN: v_cmp_ge_u32
; GCN: s_cmp_ge_u32
define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp uge i32 %a, %b
@ -271,7 +271,7 @@ entry:
; FUNC-LABEL: {{^}}i32_ult:
; R600: SETGT_UINT
; GCN: v_cmp_lt_u32
; GCN: s_cmp_lt_u32
define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp ult i32 %a, %b
@ -282,7 +282,7 @@ entry:
; FUNC-LABEL: {{^}}i32_ule:
; R600: SETGE_UINT
; GCN: v_cmp_le_u32
; GCN: s_cmp_le_u32
define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp ule i32 %a, %b
@ -293,7 +293,7 @@ entry:
; FUNC-LABEL: {{^}}i32_sgt:
; R600: SETGT_INT
; GCN: v_cmp_gt_i32
; GCN: s_cmp_gt_i32
define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp sgt i32 %a, %b
@ -304,7 +304,7 @@ entry:
; FUNC-LABEL: {{^}}i32_sge:
; R600: SETGE_INT
; GCN: v_cmp_ge_i32
; GCN: s_cmp_ge_i32
define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp sge i32 %a, %b
@ -315,7 +315,7 @@ entry:
; FUNC-LABEL: {{^}}i32_slt:
; R600: SETGT_INT
; GCN: v_cmp_lt_i32
; GCN: s_cmp_lt_i32
define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp slt i32 %a, %b
@ -326,7 +326,7 @@ entry:
; FUNC-LABEL: {{^}}i32_sle:
; R600: SETGE_INT
; GCN: v_cmp_le_i32
; GCN: s_cmp_le_i32
define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
%0 = icmp sle i32 %a, %b
@ -413,8 +413,8 @@ bb2:
}
; FUNC-LABEL: setcc_v2i32_expand
; GCN: v_cmp_gt_i32
; GCN: v_cmp_gt_i32
; GCN: s_cmp_gt_i32
; GCN: s_cmp_gt_i32
define amdgpu_kernel void @setcc_v2i32_expand(
<2 x i32> addrspace(1)* %a,
<2 x i32> addrspace(1)* %b,
@ -438,10 +438,10 @@ entry:
}
; FUNC-LABEL: setcc_v4i32_expand
; GCN: v_cmp_gt_i32
; GCN: v_cmp_gt_i32
; GCN: v_cmp_gt_i32
; GCN: v_cmp_gt_i32
; GCN: s_cmp_gt_i32
; GCN: s_cmp_gt_i32
; GCN: s_cmp_gt_i32
; GCN: s_cmp_gt_i32
define amdgpu_kernel void @setcc_v4i32_expand(
<4 x i32> addrspace(1)* %a,
<4 x i32> addrspace(1)* %b,

View File

@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; XXX: Merge this into setcc, once R600 supports 64-bit operations
@ -159,7 +159,8 @@ entry:
;;;==========================================================================;;;
; GCN-LABEL: {{^}}i64_eq:
; GCN: v_cmp_eq_u64
; SI: v_cmp_eq_u64
; VI: s_cmp_eq_u64
define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp eq i64 %a, %b
@ -169,7 +170,8 @@ entry:
}
; GCN-LABEL: {{^}}i64_ne:
; GCN: v_cmp_ne_u64
; SI: v_cmp_ne_u64
; VI: s_cmp_lg_u64
define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
entry:
%tmp0 = icmp ne i64 %a, %b

View File

@ -191,14 +191,16 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: s_sub_i32 s2, s8, 64
; GCN-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
; GCN-NEXT: s_lshr_b64 s[10:11], s[4:5], s9
; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11]
; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], s2
; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11]
; GCN-NEXT: s_cmp_lt_u32 s8, 64
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s8, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: v_mov_b32_e32 v1, s11
; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, 0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s10
@ -230,12 +232,14 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: s_lshl_b64 s[10:11], s[6:7], s9
; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11]
; GCN-NEXT: s_lshr_b64 s[2:3], s[6:7], s2
; GCN-NEXT: s_cmp_lt_u32 s8, 64
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s8, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: v_mov_b32_e32 v1, s11
; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, 0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v2, s10
@ -259,25 +263,27 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], s8
; GCN-NEXT: s_ashr_i32 s2, s7, 31
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_sub_i32 s0, s8, 64
; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], s8
; GCN-NEXT: s_cmp_lt_u32 s8, 64
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_sub_i32 s0, s8, 64
; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], s0
; GCN-NEXT: s_sub_i32 s0, 64, s8
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s8
; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64
; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GCN-NEXT: s_cmp_eq_u32 s8, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, 0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v4, s6

View File

@ -76,7 +76,8 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dword s0, s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; SI-NEXT: s_cmp_eq_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_and_b64 s[4:5], s[0:1], exec
; SI-NEXT: BB1_2: ; %endif
; SI-NEXT: s_or_b64 exec, exec, s[6:7]
@ -100,7 +101,8 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
; FLAT-NEXT: ; %bb.1: ; %else
; FLAT-NEXT: s_load_dword s0, s[0:1], 0x24
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
; FLAT-NEXT: s_cmp_eq_u32 s0, 0
; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0
; FLAT-NEXT: s_and_b64 s[4:5], s[0:1], exec
; FLAT-NEXT: BB1_2: ; %endif
; FLAT-NEXT: s_or_b64 exec, exec, s[6:7]
@ -169,11 +171,14 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; SI-NEXT: s_load_dword s14, s[0:1], 0xc
; SI-NEXT: s_brev_b32 s8, 44
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1
; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4
; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s3, 3
; SI-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5]
; SI-NEXT: s_cmp_lt_i32 s2, 1
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_cmp_lt_i32 s3, 4
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_cmp_gt_i32 s3, 3
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
; SI-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; SI-NEXT: s_and_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s8
@ -242,11 +247,14 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; FLAT-NEXT: s_load_dword s14, s[0:1], 0x30
; FLAT-NEXT: s_brev_b32 s8, 44
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1
; FLAT-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4
; FLAT-NEXT: v_cmp_gt_i32_e64 s[2:3], s3, 3
; FLAT-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5]
; FLAT-NEXT: s_cmp_lt_i32 s2, 1
; FLAT-NEXT: s_cselect_b64 s[4:5], -1, 0
; FLAT-NEXT: s_cmp_lt_i32 s3, 4
; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0
; FLAT-NEXT: s_cmp_gt_i32 s3, 3
; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0
; FLAT-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
; FLAT-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; FLAT-NEXT: s_and_b64 s[2:3], exec, s[2:3]
; FLAT-NEXT: s_waitcnt vmcnt(0)
; FLAT-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s8

View File

@ -6,8 +6,10 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e64 s[2:3], s0, 0
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_cmp_eq_u32 s1, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_cbranch_vccnz BB0_3

View File

@ -10,9 +10,9 @@ define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: s_cmp_eq_u32 s0, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@ -23,9 +23,9 @@ define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: s_cmp_eq_u32 s0, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%cmp = icmp eq i32 %a, %b
@ -82,9 +82,9 @@ define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: s_cmp_eq_u32 s0, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
@ -96,9 +96,9 @@ define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: s_cmp_eq_u32 s0, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
@ -220,9 +220,9 @@ define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: s_cmp_eq_u32 s0, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@ -233,9 +233,9 @@ define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: s_cmp_eq_u32 s0, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%cmp = icmp eq i32 %a, %b
@ -256,11 +256,11 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: s_cmp_eq_u32 s0, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_cmp_eq_u32 s2, s3
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@ -272,11 +272,11 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: s_cmp_eq_u32 s0, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: s_cmp_eq_u32 s2, s3
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@ -298,8 +298,8 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0
; SI-NEXT: s_cmp_eq_u32 s3, s0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@ -314,8 +314,8 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0
; VI-NEXT: s_cmp_eq_u32 s3, s0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0

View File

@ -22,8 +22,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32
; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; VI: s_endpgm
; SI-DAG: v_cmp_eq_u32_e64 vcc,
; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: s_cmp_eq_u32
; SI-DAG: s_cselect_b64 vcc, -1, 0
; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}, vcc
; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; SI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; SI: s_endpgm

View File

@ -77,7 +77,8 @@ define amdgpu_kernel void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4
}
; FUNC-LABEL: {{^}}s_sint_to_fp_i1_f32:
; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
; SI: s_cmp_eq_u32
; SI: s_cselect_b64 [[CMP:s\[[0-9]+:[0-9]\]]], -1, 0
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

View File

@ -493,7 +493,8 @@ exit:
define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
; SI-LABEL: test_kill_control_flow_return:
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s0, 1
; SI-NEXT: s_cmp_eq_u32 s0, 1
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_mov_b64 s[2:3], exec
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec
; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
@ -529,8 +530,9 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
;
; GFX10-WAVE64-LABEL: test_kill_control_flow_return:
; GFX10-WAVE64: ; %bb.0: ; %entry
; GFX10-WAVE64-NEXT: v_cmp_eq_u32_e64 s[4:5], s0, 1
; GFX10-WAVE64-NEXT: s_cmp_eq_u32 s0, 1
; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
; GFX10-WAVE64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], s[4:5], exec
; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB9_4
@ -565,8 +567,9 @@ define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
;
; GFX10-WAVE32-LABEL: test_kill_control_flow_return:
; GFX10-WAVE32: ; %bb.0: ; %entry
; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e64 s2, s0, 1
; GFX10-WAVE32-NEXT: s_cmp_eq_u32 s0, 1
; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-WAVE32-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-WAVE32-NEXT: s_xor_b32 s2, s2, exec_lo
; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10-WAVE32-NEXT: s_cbranch_scc0 BB9_4

View File

@ -375,8 +375,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
; CHECK: [[V_OR_B32_e32_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_ADD_I32_24]], [[V_OR_B32_e32_66]], implicit $exec
; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e32_67]], implicit $exec
; CHECK: undef %691.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
; CHECK: IMAGE_STORE_V4_V2_gfx10 %691, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "ImageResource")
; CHECK: undef %692.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
; CHECK: IMAGE_STORE_V4_V2_gfx10 %692, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "ImageResource")
; CHECK: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2

View File

@ -693,39 +693,40 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN-NEXT: s_load_dword s3, s[0:1], 0xe
; GCN-NEXT: s_mov_b32 s5, 0xff000000
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s4
; GCN-NEXT: s_mov_b32 s7, 0xff000000
; GCN-NEXT: s_mov_b32 s6, 0xffff
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s2, s2, s5
; GCN-NEXT: s_and_b32 s3, s3, s4
; GCN-NEXT: s_and_b32 s2, s2, s7
; GCN-NEXT: s_and_b32 s3, s3, s6
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0
; GCN-NEXT: s_load_dword s6, s[0:1], 0xb
; GCN-NEXT: s_load_dword s7, s[0:1], 0xc
; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s8, s[0:1], 0xb
; GCN-NEXT: s_load_dword s0, s[0:1], 0xc
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2
; GCN-NEXT: v_rcp_f32_e32 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s7, s7, s4
; GCN-NEXT: s_and_b32 s6, s6, s5
; GCN-NEXT: s_sub_u32 s8, 0, s2
; GCN-NEXT: s_and_b32 s6, s0, s6
; GCN-NEXT: s_and_b32 s8, s8, s7
; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24
; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: s_subb_u32 s9, 0, s3
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mul_lo_u32 v3, s8, v2
; GCN-NEXT: v_mul_hi_u32 v4, s8, v1
; GCN-NEXT: v_mul_lo_u32 v5, s9, v1
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_mul_lo_u32 v4, s8, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
; GCN-NEXT: s_sub_u32 s2, 0, s0
; GCN-NEXT: s_subb_u32 s3, 0, s1
; GCN-NEXT: v_mul_hi_u32 v3, s2, v1
; GCN-NEXT: v_mul_lo_u32 v4, s2, v2
; GCN-NEXT: v_mul_lo_u32 v5, s3, v1
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; GCN-NEXT: v_mul_lo_u32 v4, s2, v1
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GCN-NEXT: v_mul_lo_u32 v6, v1, v3
; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
@ -740,14 +741,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_add_i32_e64 v1, s[2:3], v1, v3
; GCN-NEXT: v_add_i32_e64 v1, s[0:1], v1, v3
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc
; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[2:3]
; GCN-NEXT: v_mul_lo_u32 v5, s8, v3
; GCN-NEXT: v_mul_hi_u32 v6, s8, v1
; GCN-NEXT: v_mul_lo_u32 v7, s9, v1
; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
; GCN-NEXT: v_mul_lo_u32 v5, s2, v3
; GCN-NEXT: v_mul_hi_u32 v6, s2, v1
; GCN-NEXT: v_mul_lo_u32 v7, s3, v1
; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GCN-NEXT: v_mul_lo_u32 v6, s8, v1
; GCN-NEXT: v_mul_lo_u32 v6, s2, v1
; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GCN-NEXT: v_mul_lo_u32 v11, v1, v5
; GCN-NEXT: v_mul_hi_u32 v13, v1, v5
@ -764,15 +765,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[2:3]
; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_alignbit_b32 v3, s7, v3, 24
; GCN-NEXT: v_mov_b32_e32 v3, s8
; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v4, v3, v2
; GCN-NEXT: v_mul_hi_u32 v1, v3, v1
; GCN-NEXT: v_mul_hi_u32 v2, v3, v2
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
@ -781,33 +781,32 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v8, vcc
; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v4, v0, v2
; GCN-NEXT: v_mul_hi_u32 v5, v0, v1
; GCN-NEXT: v_mul_lo_u32 v6, v0, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc
; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v0
; GCN-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v4, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v0
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v1
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v6, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v0, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v1
; GCN-NEXT: v_mul_lo_u32 v10, v0, v1
; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v1
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v10
; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v6, vcc
; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0
; GCN-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v6, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v0
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
; GCN-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v5, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
@ -1464,31 +1463,31 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v2, v1, 24
; GCN-NEXT: v_mul_hi_u32 v3, v0, 24
; GCN-NEXT: v_mul_lo_u32 v4, v0, 24
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4
; GCN-NEXT: v_mov_b32_e32 v3, s11
; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc
; GCN-NEXT: v_subrev_i32_e32 v3, vcc, 24, v4
; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v3
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0
; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0
; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v4
; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1]
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
; GCN-NEXT: v_mul_lo_u32 v4, v1, 24
; GCN-NEXT: v_mul_hi_u32 v5, v0, 24
; GCN-NEXT: v_add_i32_e32 v2, vcc, 2, v0
; GCN-NEXT: v_mul_lo_u32 v8, v0, 24
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v0
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_sub_i32_e32 v8, vcc, s10, v8
; GCN-NEXT: v_mov_b32_e32 v5, s11
; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc
; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 24, v8
; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v5
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v8
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GCN-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0

View File

@ -36,7 +36,7 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
;
; GFX6-LABEL: test_udivrem:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x26
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x1d
@ -44,27 +44,27 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT: s_sub_i32 s2, 0, s3
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT: s_sub_i32 s3, 0, s2
; GFX6-NEXT: s_mov_b32 s11, s7
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT: v_mul_lo_u32 v1, v0, s2
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v1
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1]
@ -73,11 +73,11 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
;
; GFX8-LABEL: test_udivrem:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s7, s[0:1], 0x98
; GFX8-NEXT: s_load_dword s6, s[0:1], 0x74
; GFX8-NEXT: s_load_dword s6, s[0:1], 0x98
; GFX8-NEXT: s_load_dword s7, s[0:1], 0x74
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX8-NEXT: s_sub_i32 s2, 0, s7
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT: s_sub_i32 s2, 0, s6
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
@ -86,22 +86,22 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v2, s6, v0
; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7
; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s6, v3
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v3
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s7, v3
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v3
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s7, v3
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@ -201,40 +201,40 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
; GFX8-LABEL: test_udivrem_v2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX8-NEXT: s_mov_b32 s3, 0x4f7ffffe
; GFX8-NEXT: s_mov_b32 s2, 0x4f7ffffe
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT: s_sub_i32 s2, 0, s6
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: v_mul_f32_e32 v0, s3, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX8-NEXT: v_mul_f32_e32 v1, s2, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, s3, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: s_sub_i32 s2, 0, s6
; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0
; GFX8-NEXT: s_sub_i32 s2, 0, s7
; GFX8-NEXT: v_mul_lo_u32 v3, s2, v1
; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_mul_lo_u32 v2, s2, v1
; GFX8-NEXT: v_mul_lo_u32 v0, v0, s6
; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s6, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX8-NEXT: v_mul_lo_u32 v0, v0, s6
; GFX8-NEXT: v_mul_lo_u32 v1, v1, s7
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v1
; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@ -402,66 +402,66 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX8-NEXT: s_sub_i32 s2, 0, s8
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s11
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_sub_i32 s2, 0, s8
; GFX8-NEXT: s_sub_i32 s3, 0, s9
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s10
; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s10
; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s11
; GFX8-NEXT: v_mul_f32_e32 v0, s12, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, s12, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_lo_u32 v3, s2, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0
; GFX8-NEXT: s_sub_i32 s2, 0, s10
; GFX8-NEXT: v_mul_f32_e32 v2, s12, v2
; GFX8-NEXT: v_mul_hi_u32 v3, v0, v3
; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v4
; GFX8-NEXT: v_mul_lo_u32 v4, s3, v1
; GFX8-NEXT: v_mul_lo_u32 v0, v0, s8
; GFX8-NEXT: v_mul_f32_e32 v3, s12, v3
; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1
; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX8-NEXT: v_mul_lo_u32 v4, s2, v2
; GFX8-NEXT: s_sub_i32 s2, 0, s11
; GFX8-NEXT: v_mul_f32_e32 v2, s12, v3
; GFX8-NEXT: v_mul_lo_u32 v0, v0, s8
; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX8-NEXT: v_mul_lo_u32 v1, v1, s9
; GFX8-NEXT: v_mul_hi_u32 v4, v2, v4
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_mul_lo_u32 v3, s2, v2
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s9, v1
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s9, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3
; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v5
; GFX8-NEXT: s_sub_i32 s2, 0, s11
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_mul_f32_e32 v3, s12, v4
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1
; GFX8-NEXT: v_mul_hi_u32 v2, s6, v2
; GFX8-NEXT: v_mul_lo_u32 v4, s2, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
; GFX8-NEXT: v_mul_lo_u32 v5, s2, v3
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, v2, s10
; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4
; GFX8-NEXT: v_mul_hi_u32 v4, v3, v5
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s6, v2
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_mul_hi_u32 v3, s7, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v2
; GFX8-NEXT: v_mul_lo_u32 v3, v3, s11
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s11, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3

View File

@ -78,8 +78,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)
; VI-DAG: s_cmp_eq_u32
; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0
; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
; SI-DAG: v_cmp_eq_u32_e64 vcc
; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: s_cmp_eq_u32
; SI-DAG: s_cselect_b64 vcc, -1, 0
; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, {{v[0-9]+}}, vcc
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
; GCN: s_endpgm

View File

@ -77,7 +77,8 @@ define amdgpu_kernel void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4
}
; FUNC-LABEL: {{^}}s_uint_to_fp_i1_to_f32:
; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
; SI: s_cmp_eq_u32
; SI: s_cselect_b64 [[CMP:s\[[0-9]+:[0-9]\]]], -1, 0
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm

View File

@ -51,10 +51,13 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: s_waitcnt expcnt(0)
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
; CHECK-NEXT: BB1_1: ; %bb9
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT: s_cbranch_vccnz BB1_1
; CHECK-NEXT: ; %bb.2: ; %bb11
; CHECK-NEXT: s_mov_b32 s3, 0xf000

View File

@ -251,7 +251,7 @@ ENDIF: ; preds = %IF, %main_body
; GCN: s_load_dwordx2 s{{\[}}[[COND0:[0-9]+]]:[[COND1:[0-9]+]]{{\]}}
; GCN: s_cmp_lt_i32 s[[COND0]], 1
; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, s[[COND1]], 0{{$}}
; GCN: s_cmp_gt_i32 s[[COND1]], 0{{$}}
; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]]
; GCN: {{^}}[[EXIT]]:
; GCN: s_endpgm

View File

@ -8,7 +8,8 @@ declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
; GCN: v_cmp_eq_u32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]}}, -1, v{{[0-9]+}}, [[COND]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
@ -30,9 +31,11 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, fl
; However on GFX10 constant bus is limited to 2 scalar operands, not one.
; GCN-LABEL: {{^}}v_cnd_nan:
; SIVI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0
; SIVI: s_cmp_eq_u32 s{{[0-9]+}}, 0
; SIVI: s_cselect_b64 vcc, -1, 0
; SIVI: v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc
; GFX10: v_cmp_eq_u32_e64 [[CC:s\[[0-9:]+\]]], s{{[0-9]+}}, 0
; GFX10: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GFX10: s_cselect_b64 [[CC:s\[[0-9:]+\]]],
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff

View File

@ -14,7 +14,8 @@ target datalayout = "A5"
; GCN-ALLOCA: buffer_load_dword
; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1
; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
@ -322,7 +323,8 @@ entry:
; GCN-ALLOCA: buffer_load_dword
; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1
; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3

View File

@ -35,7 +35,7 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1
; GCN-LABEL: extract_insert_different_dynelt_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
@ -43,26 +43,33 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1
; GCN-NEXT: s_mov_b64 s[4:5], s[10:11]
; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64
; GCN-NEXT: s_load_dword s0, s[0:1], 0xf
; GCN-NEXT: s_load_dword s14, s[0:1], 0xf
; GCN-NEXT: s_cmp_eq_u32 s13, 3
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s13, 2
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_cmp_eq_u32 s13, 1
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: s_cmp_eq_u32 s13, 0
; GCN-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s3, 3
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s14, 1
; GCN-NEXT: v_mov_b32_e32 v7, v5
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s3, 2
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
; GCN-NEXT: s_cmp_eq_u32 s14, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s14, 3
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GCN-NEXT: buffer_store_dword v0, v[6:7], s[8:11], 0 addr64
; GCN-NEXT: s_endpgm

View File

@ -12,10 +12,10 @@
; VI: s_cmp_gt_i32
; VI: s_cselect_b32
; SI: v_cmp_gt_i32_e32 vcc
; SI: v_cndmask_b32_e32
; SI: v_cmp_gt_i32_e32 vcc
; SI: v_cndmask_b32_e32
; SI-DAG: s_cmp_gt_i32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: s_cmp_gt_i32
; SI-DAG: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
entry:
@ -59,10 +59,10 @@ entry:
; VI: s_cselect_b32
; VI: s_cselect_b32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI: v_cndmask_b32_e32
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
entry:

View File

@ -347,9 +347,9 @@ bb:
; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
; GFX1064: v_sub_co_u32 v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}]
; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}]
; GFX1064: v_sub_co_u32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc
; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 {
bb:
%tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1

View File

@ -37,7 +37,7 @@ define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroe
; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0
; GCN-DAG: v_cmp_eq_u32
; GCN-DAG: s_cmp_eq_u32
; GCN: v_cndmask_b32
define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp eq i32 %a, %b
@ -54,10 +54,9 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a,
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]]
; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]]
; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
; GCN: v_cmp_eq_u32_e32 vcc, [[MASK_A]], [[V_B]]
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]]
; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN: buffer_store_short [[RESULT]]
define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
%tmp0 = icmp eq i16 %a, %b