AMDGPU/GlobalISel: Fold constant offset vector extract indexes

Handle dynamic vector extracts that use an index which is the add of
a constant offset by folding the offset into the base subregister
used by the indexing operation.

In regbankselect, force the add back into the waterfall loop so it
can be recognized when the extract is selected.
Matt Arsenault 2020-01-03 10:07:51 -05:00 committed by Matt Arsenault
parent 9dc9f7ca14
commit e3d352c541
5 changed files with 246 additions and 218 deletions
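The selection-side idea is small enough to sketch outside of LLVM: if the index operand decomposes as base + K with a constant K that is in bounds for the vector, index with the base register alone and statically pick the K-th subregister; otherwise keep the original index and sub0. The sketch below is only an illustration — it uses plain ints in place of Register and of the subregister table from getRegSplitParts, and the names are made up rather than the in-tree API:

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Illustrative stand-ins for the LLVM Register / subregister-index types.
struct DecomposedIndex {
  int OrigReg;     // the original index vreg
  int BaseReg;     // base of the add, if the index was (BaseReg + ConstOffset)
  int ConstOffset; // the folded-out constant, 0 if there was no add
};

// Pick the register to index with and the subregister to read, mirroring the
// new computeIndirectRegIndex(): an in-bounds constant offset is folded into
// the subregister choice; anything else keeps the original index and sub0.
static std::pair<int, int>
chooseIndexAndSubReg(const DecomposedIndex &Idx,
                     const std::vector<int> &SubRegs) {
  if (Idx.ConstOffset < 0 ||
      static_cast<std::size_t>(Idx.ConstOffset) >= SubRegs.size())
    return {Idx.OrigReg, SubRegs[0]};
  return {Idx.BaseReg, SubRegs[Idx.ConstOffset]};
}

int main() {
  std::vector<int> SubRegs{0, 1, 2, 3, 4, 5, 6, 7}; // sub0..sub7 of a 256-bit reg
  auto [Reg, Sub] = chooseIndexAndSubReg({/*Orig*/10, /*Base*/11, /*Offset*/3},
                                         SubRegs);
  std::cout << "index with %" << Reg << ", read sub" << Sub << "\n"; // %11, sub3
}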


@ -1792,6 +1792,30 @@ bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
return true;
}
/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
static std::pair<Register, unsigned>
computeIndirectRegIndex(MachineRegisterInfo &MRI,
const SIRegisterInfo &TRI,
const TargetRegisterClass *SuperRC,
Register IdxReg,
unsigned EltSize) {
Register IdxBaseReg;
int Offset;
MachineInstr *Unused;
std::tie(IdxBaseReg, Offset, Unused)
= AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
// Skip out of bounds offsets, or else we would end up using an undefined
// register.
if (static_cast<unsigned>(Offset) >= SubRegs.size())
return std::make_pair(IdxReg, SubRegs[0]);
return std::make_pair(IdxBaseReg, SubRegs[Offset]);
}
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
MachineInstr &MI) const {
Register DstReg = MI.getOperand(0).getReg();
@ -1823,7 +1847,9 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
const DebugLoc &DL = MI.getDebugLoc();
const bool Is64 = DstTy.getSizeInBits() == 64;
unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
unsigned SubReg;
std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
DstTy.getSizeInBits() / 8);
if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
if (DstTy.getSizeInBits() != 32 && !Is64)


@ -69,6 +69,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@ -76,8 +78,8 @@
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@ -1975,7 +1977,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
const LLT S32 = LLT::scalar(32);
LLT DstTy = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
MachineIRBuilder B(MI);
const ValueMapping &DstMapping
@ -1983,10 +1991,40 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
const RegisterBank *SrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
const RegisterBank *IdxBank =
OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
Register IdxReg = MI.getOperand(2).getReg();
Register BaseIdxReg;
unsigned ConstOffset;
MachineInstr *OffsetDef;
std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
// See if the index is an add of a constant which will be foldable by moving
// the base register of the index later if this is going to be executed in a
// waterfall loop. This is essentially to reassociate the add of a constant
// with the readfirstlane.
bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
ConstOffset > 0 &&
ConstOffset < SrcTy.getNumElements();
// Re-insert the constant offset add inside the waterfall loop.
auto ReinsertIndexAdd = [=, &B, &MRI](MachineInstr &IdxUseInstr,
unsigned OpIdx) {
Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
};
// Move the base register. We'll re-insert the add later.
if (ShouldMoveIndexIntoLoop)
MI.getOperand(2).setReg(BaseIdxReg);
// If this is a VGPR result only because the index was a VGPR result, the
// actual indexing will be done on the SGPR source vector, which will
@ -2010,13 +2048,14 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
buildVCopy(B, DstReg, TmpReg);
}
if (ShouldMoveIndexIntoLoop)
ReinsertIndexAdd(MI, 2);
return;
}
assert(DstTy.getSizeInBits() == 64);
LLT SrcTy = MRI.getType(SrcReg);
const LLT S32 = LLT::scalar(32);
LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
@ -2029,7 +2068,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
auto IdxLo = B.buildShl(S32, IdxReg, One);
auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
auto IdxHi = B.buildAdd(S32, IdxLo, One);
auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
@ -2070,6 +2109,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
buildVCopy(B, DstRegs[1], TmpReg1);
}
if (ShouldMoveIndexIntoLoop)
ReinsertIndexAdd(*IdxLo, 1);
return;
}
case AMDGPU::G_INSERT_VECTOR_ELT: {

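In the 64-bit element path above, the constant add used to be done per-lane in a VGPR before the waterfall loop and the summed index was readfirstlane'd; after this change the base is readfirstlane'd and the "+ K" is re-inserted as a scalar add inside the loop, before the lo/hi 32-bit indices are formed. Because the offset is uniform, the two placements produce the same indices. A throwaway C++ model of that equivalence (the lane values, the readfirstlane helper, and K are all invented for illustration):

#include <cassert>
#include <cstddef>
#include <vector>

// Model a wave where each lane holds a (possibly divergent) base index.
// readfirstlane() returns the value of the first still-active lane; the
// waterfall loop then services every lane that shares that value.
static int readfirstlane(const std::vector<int> &V,
                         const std::vector<bool> &Active) {
  for (std::size_t I = 0; I < V.size(); ++I)
    if (Active[I])
      return V[I];
  return 0;
}

int main() {
  const int K = 3;                       // constant offset from the G_ADD
  std::vector<int> Base{4, 0, 4, 2};     // divergent base index, one per lane
  std::vector<bool> Active{true, true, true, true};

  // Old placement: the "+ K" is a per-lane VALU add before the loop, and the
  // sum is what gets readfirstlane'd.
  std::vector<int> Sum;
  for (int B : Base)
    Sum.push_back(B + K);
  int OldLo = readfirstlane(Sum, Active) << 1; // s_lshl inside the loop
  int OldHi = OldLo + 1;

  // New placement: readfirstlane the base, re-insert the add as a scalar op
  // inside the waterfall loop, then form the lo/hi indices as before.
  int NewLo = (readfirstlane(Base, Active) + K) << 1;
  int NewHi = NewLo + 1;

  assert(OldLo == NewLo && OldHi == NewHi); // same 32-bit element indices
  return 0;
}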

@ -257,15 +257,10 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
; GPRIDX-NEXT: s_mov_b64 s[20:21], exec
; GPRIDX-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s22, v0
; GPRIDX-NEXT: s_lshl_b32 m0, s22, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s22, v0
; GPRIDX-NEXT: s_lshl_b32 s22, s22, 1
; GPRIDX-NEXT: s_add_u32 s23, s22, 1
; GPRIDX-NEXT: s_mov_b32 m0, s22
; GPRIDX-NEXT: s_nop 0
; GPRIDX-NEXT: s_movrels_b32 s22, s4
; GPRIDX-NEXT: s_mov_b32 m0, s23
; GPRIDX-NEXT: s_nop 0
; GPRIDX-NEXT: s_movrels_b32 s23, s4
; GPRIDX-NEXT: s_movrels_b32 s23, s5
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
; GPRIDX-NEXT: s_cbranch_execnz BB6_1
@ -289,13 +284,10 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
; MOVREL-NEXT: s_mov_b64 s[20:21], exec
; MOVREL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s22, v0
; MOVREL-NEXT: s_lshl_b32 m0, s22, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s22, v0
; MOVREL-NEXT: s_lshl_b32 s22, s22, 1
; MOVREL-NEXT: s_add_u32 s23, s22, 1
; MOVREL-NEXT: s_mov_b32 m0, s22
; MOVREL-NEXT: s_movrels_b32 s22, s4
; MOVREL-NEXT: s_mov_b32 m0, s23
; MOVREL-NEXT: s_movrels_b32 s23, s4
; MOVREL-NEXT: s_movrels_b32 s23, s5
; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
; MOVREL-NEXT: s_cbranch_execnz BB6_1
@ -371,15 +363,11 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
; GPRIDX-NEXT: s_mov_b64 s[16:17], exec
; GPRIDX-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s18, v0
; GPRIDX-NEXT: s_lshl_b32 m0, s18, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s18, v0
; GPRIDX-NEXT: s_lshl_b32 s18, s18, 1
; GPRIDX-NEXT: s_add_u32 s19, s18, 1
; GPRIDX-NEXT: s_mov_b32 m0, s18
; GPRIDX-NEXT: s_nop 0
; GPRIDX-NEXT: s_movrels_b32 s18, s0
; GPRIDX-NEXT: s_mov_b32 m0, s19
; GPRIDX-NEXT: s_movrels_b32 s19, s1
; GPRIDX-NEXT: v_mov_b32_e32 v1, s18
; GPRIDX-NEXT: s_movrels_b32 s19, s0
; GPRIDX-NEXT: v_mov_b32_e32 v2, s19
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
@ -410,13 +398,10 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
; MOVREL-NEXT: s_mov_b64 s[16:17], exec
; MOVREL-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s18, v0
; MOVREL-NEXT: s_lshl_b32 m0, s18, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s18, v0
; MOVREL-NEXT: s_lshl_b32 s18, s18, 1
; MOVREL-NEXT: s_add_u32 s19, s18, 1
; MOVREL-NEXT: s_mov_b32 m0, s18
; MOVREL-NEXT: s_movrels_b32 s18, s0
; MOVREL-NEXT: s_mov_b32 m0, s19
; MOVREL-NEXT: s_movrels_b32 s19, s0
; MOVREL-NEXT: s_movrels_b32 s19, s1
; MOVREL-NEXT: v_mov_b32_e32 v1, s18
; MOVREL-NEXT: v_mov_b32_e32 v2, s19
; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
@ -439,14 +424,13 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
; GPRIDX-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s6, v16
; GPRIDX-NEXT: s_lshl_b32 s7, s6, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; GPRIDX-NEXT: s_lshl_b32 s6, s6, 1
; GPRIDX-NEXT: s_add_u32 s7, s6, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0)
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v18, v0
; GPRIDX-NEXT: v_mov_b32_e32 v18, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
@ -463,13 +447,10 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
; MOVREL-NEXT: s_mov_b64 s[4:5], exec
; MOVREL-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s6, v16
; MOVREL-NEXT: s_lshl_b32 m0, s6, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; MOVREL-NEXT: s_lshl_b32 s6, s6, 1
; MOVREL-NEXT: s_mov_b32 m0, s6
; MOVREL-NEXT: s_add_u32 s7, s6, 1
; MOVREL-NEXT: v_movrels_b32_e32 v17, v0
; MOVREL-NEXT: s_mov_b32 m0, s7
; MOVREL-NEXT: v_movrels_b32_e32 v18, v0
; MOVREL-NEXT: v_movrels_b32_e32 v18, v1
; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
; MOVREL-NEXT: s_cbranch_execnz BB9_1
@ -487,24 +468,20 @@ define amdgpu_ps void @dyn_extract_v8i64_v_s(<8 x i64> %vec, i32 inreg %sel) {
; GPRIDX-LABEL: dyn_extract_v8i64_v_s:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1
; GPRIDX-NEXT: s_add_u32 s1, s0, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v16, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: global_store_dwordx2 v[0:1], v[16:17], off
; GPRIDX-NEXT: s_endpgm
;
; MOVREL-LABEL: dyn_extract_v8i64_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_lshl_b32 s0, s2, 1
; MOVREL-NEXT: s_mov_b32 m0, s0
; MOVREL-NEXT: s_add_u32 s0, s0, 1
; MOVREL-NEXT: s_lshl_b32 m0, s2, 1
; MOVREL-NEXT: v_movrels_b32_e32 v16, v0
; MOVREL-NEXT: s_mov_b32 m0, s0
; MOVREL-NEXT: v_movrels_b32_e32 v17, v0
; MOVREL-NEXT: v_movrels_b32_e32 v17, v1
; MOVREL-NEXT: flat_store_dwordx2 v[0:1], v[16:17]
; MOVREL-NEXT: s_endpgm
entry:
@ -573,30 +550,30 @@ define amdgpu_ps float @dyn_extract_v8f32_s_s_offset3(<8 x float> inreg %vec, i3
; GPRIDX-LABEL: dyn_extract_v8f32_s_s_offset3:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_add_u32 m0, s10, 3
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 m0, s10
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_movrels_b32 s0, s0
; GPRIDX-NEXT: s_movrels_b32 s0, s3
; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f32_s_s_offset3:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_add_u32 m0, s10, 3
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 m0, s10
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_movrels_b32 s0, s0
; MOVREL-NEXT: s_movrels_b32 s0, s3
; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: ; return to shader part epilog
entry:
@ -609,38 +586,36 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
; GPRIDX-LABEL: dyn_extract_v8f32_v_v_offset3:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GPRIDX-NEXT: v_add_u32_e32 v9, 3, v8
; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
; GPRIDX-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s6, v9
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v9
; GPRIDX-NEXT: v_readfirstlane_b32 s6, v8
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v8, v0
; GPRIDX-NEXT: v_mov_b32_e32 v9, v3
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
; GPRIDX-NEXT: s_cbranch_execnz BB13_1
; GPRIDX-NEXT: ; %bb.2:
; GPRIDX-NEXT: s_mov_b64 exec, s[4:5]
; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: dyn_extract_v8f32_v_v_offset3:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MOVREL-NEXT: v_add_u32_e32 v9, vcc, 3, v8
; MOVREL-NEXT: s_mov_b64 s[4:5], exec
; MOVREL-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s6, v9
; MOVREL-NEXT: v_readfirstlane_b32 s6, v8
; MOVREL-NEXT: s_mov_b32 m0, s6
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v9
; MOVREL-NEXT: v_movrels_b32_e32 v8, v0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
; MOVREL-NEXT: v_movrels_b32_e32 v9, v3
; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
; MOVREL-NEXT: s_cbranch_execnz BB13_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b64 exec, s[4:5]
; MOVREL-NEXT: v_mov_b32_e32 v0, v8
; MOVREL-NEXT: v_mov_b32_e32 v0, v9
; MOVREL-NEXT: s_setpc_b64 s[30:31]
entry:
%add = add i32 %sel, 3
@ -653,9 +628,9 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset1(<8 x double> inreg %vec,
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_add_u32 m0, s18, 1
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 m0, s18
; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
@ -668,16 +643,16 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset1(<8 x double> inreg %vec,
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1]
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[2:3]
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset1:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_add_u32 m0, s18, 1
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 m0, s18
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
@ -690,7 +665,7 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset1(<8 x double> inreg %vec,
; MOVREL-NEXT: s_mov_b32 s13, s15
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_mov_b32 s15, s17
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1]
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[2:3]
; MOVREL-NEXT: ; return to shader part epilog
entry:
%add = add i32 %sel, 1
@ -703,11 +678,11 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset2(<8 x double> inreg %vec,
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_add_u32 m0, s18, 2
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 m0, s18
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 s8, s10
@ -718,18 +693,18 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset2(<8 x double> inreg %vec,
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1]
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[4:5]
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset2:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_add_u32 m0, s18, 2
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 m0, s18
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_mov_b32 s8, s10
@ -740,7 +715,7 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset2(<8 x double> inreg %vec,
; MOVREL-NEXT: s_mov_b32 s13, s15
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_mov_b32 s15, s17
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1]
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[4:5]
; MOVREL-NEXT: ; return to shader part epilog
entry:
%add = add i32 %sel, 2
@ -753,13 +728,13 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec,
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_add_u32 m0, s18, 3
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 m0, s18
; GPRIDX-NEXT: s_mov_b32 s8, s10
; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 s10, s12
@ -768,20 +743,20 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec,
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1]
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[6:7]
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset3:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_add_u32 m0, s18, 3
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_mov_b32 m0, s18
; MOVREL-NEXT: s_mov_b32 s8, s10
; MOVREL-NEXT: s_mov_b32 s9, s11
; MOVREL-NEXT: s_mov_b32 s10, s12
@ -790,7 +765,7 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec,
; MOVREL-NEXT: s_mov_b32 s13, s15
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_mov_b32 s15, s17
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1]
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[6:7]
; MOVREL-NEXT: ; return to shader part epilog
entry:
%add = add i32 %sel, 3
@ -803,7 +778,6 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec,
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_add_u32 m0, s18, 4
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
@ -812,20 +786,20 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec,
; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_mov_b32 s8, s10
; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 m0, s18
; GPRIDX-NEXT: s_mov_b32 s10, s12
; GPRIDX-NEXT: s_mov_b32 s11, s13
; GPRIDX-NEXT: s_mov_b32 s12, s14
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1]
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[8:9]
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset4:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_add_u32 m0, s18, 4
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
@ -834,13 +808,14 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec,
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_mov_b32 s8, s10
; MOVREL-NEXT: s_mov_b32 s9, s11
; MOVREL-NEXT: s_mov_b32 m0, s18
; MOVREL-NEXT: s_mov_b32 s10, s12
; MOVREL-NEXT: s_mov_b32 s11, s13
; MOVREL-NEXT: s_mov_b32 s12, s14
; MOVREL-NEXT: s_mov_b32 s13, s15
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_mov_b32 s15, s17
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1]
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[8:9]
; MOVREL-NEXT: ; return to shader part epilog
entry:
%add = add i32 %sel, 4
@ -853,7 +828,6 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec,
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_add_u32 m0, s18, 5
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
@ -864,18 +838,18 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec,
; GPRIDX-NEXT: s_mov_b32 s9, s11
; GPRIDX-NEXT: s_mov_b32 s10, s12
; GPRIDX-NEXT: s_mov_b32 s11, s13
; GPRIDX-NEXT: s_mov_b32 m0, s18
; GPRIDX-NEXT: s_mov_b32 s12, s14
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1]
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[10:11]
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset5:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_add_u32 m0, s18, 5
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
@ -886,11 +860,12 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec,
; MOVREL-NEXT: s_mov_b32 s9, s11
; MOVREL-NEXT: s_mov_b32 s10, s12
; MOVREL-NEXT: s_mov_b32 s11, s13
; MOVREL-NEXT: s_mov_b32 m0, s18
; MOVREL-NEXT: s_mov_b32 s12, s14
; MOVREL-NEXT: s_mov_b32 s13, s15
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_mov_b32 s15, s17
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1]
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[10:11]
; MOVREL-NEXT: ; return to shader part epilog
entry:
%add = add i32 %sel, 5
@ -903,7 +878,6 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec,
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_add_u32 m0, s18, 6
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
@ -916,16 +890,16 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec,
; GPRIDX-NEXT: s_mov_b32 s11, s13
; GPRIDX-NEXT: s_mov_b32 s12, s14
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 m0, s18
; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1]
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[12:13]
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset6:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_add_u32 m0, s18, 6
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
@ -938,9 +912,10 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec,
; MOVREL-NEXT: s_mov_b32 s11, s13
; MOVREL-NEXT: s_mov_b32 s12, s14
; MOVREL-NEXT: s_mov_b32 s13, s15
; MOVREL-NEXT: s_mov_b32 m0, s18
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_mov_b32 s15, s17
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1]
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[12:13]
; MOVREL-NEXT: ; return to shader part epilog
entry:
%add = add i32 %sel, 6
@ -953,7 +928,6 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec,
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_mov_b32 s0, s2
; GPRIDX-NEXT: s_mov_b32 s1, s3
; GPRIDX-NEXT: s_add_u32 m0, s18, 7
; GPRIDX-NEXT: s_mov_b32 s2, s4
; GPRIDX-NEXT: s_mov_b32 s3, s5
; GPRIDX-NEXT: s_mov_b32 s4, s6
@ -968,14 +942,15 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec,
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1]
; GPRIDX-NEXT: s_mov_b32 m0, s18
; GPRIDX-NEXT: s_nop 0
; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[14:15]
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset7:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_add_u32 m0, s18, 7
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
@ -990,7 +965,8 @@ define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec,
; MOVREL-NEXT: s_mov_b32 s13, s15
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_mov_b32 s15, s17
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1]
; MOVREL-NEXT: s_mov_b32 m0, s18
; MOVREL-NEXT: s_movrels_b64 s[0:1], s[14:15]
; MOVREL-NEXT: ; return to shader part epilog
entry:
%add = add i32 %sel, 7
@ -1052,49 +1028,45 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
; GPRIDX-LABEL: dyn_extract_v8f64_v_v_offset3:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GPRIDX-NEXT: v_add_u32_e32 v18, 3, v16
; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
; GPRIDX-NEXT: BB22_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s6, v18
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v18
; GPRIDX-NEXT: s_lshl_b32 s6, s6, 1
; GPRIDX-NEXT: s_add_u32 s7, s6, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v16, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: v_readfirstlane_b32 s6, v16
; GPRIDX-NEXT: s_add_u32 s7, s6, 3
; GPRIDX-NEXT: s_lshl_b32 s7, s7, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v18, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
; GPRIDX-NEXT: s_cbranch_execnz BB22_1
; GPRIDX-NEXT: ; %bb.2:
; GPRIDX-NEXT: s_mov_b64 exec, s[4:5]
; GPRIDX-NEXT: v_mov_b32_e32 v0, v16
; GPRIDX-NEXT: v_mov_b32_e32 v1, v17
; GPRIDX-NEXT: v_mov_b32_e32 v0, v17
; GPRIDX-NEXT: v_mov_b32_e32 v1, v18
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: dyn_extract_v8f64_v_v_offset3:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MOVREL-NEXT: v_add_u32_e32 v18, vcc, 3, v16
; MOVREL-NEXT: s_mov_b64 s[4:5], exec
; MOVREL-NEXT: BB22_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s6, v18
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v18
; MOVREL-NEXT: s_lshl_b32 s6, s6, 1
; MOVREL-NEXT: s_mov_b32 m0, s6
; MOVREL-NEXT: s_add_u32 s7, s6, 1
; MOVREL-NEXT: v_movrels_b32_e32 v16, v0
; MOVREL-NEXT: s_mov_b32 m0, s7
; MOVREL-NEXT: v_readfirstlane_b32 s6, v16
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; MOVREL-NEXT: s_add_u32 s6, s6, 3
; MOVREL-NEXT: s_lshl_b32 m0, s6, 1
; MOVREL-NEXT: v_movrels_b32_e32 v17, v0
; MOVREL-NEXT: v_movrels_b32_e32 v18, v1
; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
; MOVREL-NEXT: s_cbranch_execnz BB22_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b64 exec, s[4:5]
; MOVREL-NEXT: v_mov_b32_e32 v0, v16
; MOVREL-NEXT: v_mov_b32_e32 v1, v17
; MOVREL-NEXT: v_mov_b32_e32 v0, v17
; MOVREL-NEXT: v_mov_b32_e32 v1, v18
; MOVREL-NEXT: s_setpc_b64 s[30:31]
entry:
%add = add i32 %sel, 3
@ -1188,14 +1160,13 @@ define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %
; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
; GPRIDX-NEXT: BB25_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s6, v16
; GPRIDX-NEXT: s_lshl_b32 s7, s6, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; GPRIDX-NEXT: s_lshl_b32 s6, s6, 1
; GPRIDX-NEXT: s_add_u32 s7, s6, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0)
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0)
; GPRIDX-NEXT: v_mov_b32_e32 v18, v0
; GPRIDX-NEXT: v_mov_b32_e32 v18, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
@ -1212,13 +1183,10 @@ define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %
; MOVREL-NEXT: s_mov_b64 s[4:5], exec
; MOVREL-NEXT: BB25_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s6, v16
; MOVREL-NEXT: s_lshl_b32 m0, s6, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
; MOVREL-NEXT: s_lshl_b32 s6, s6, 1
; MOVREL-NEXT: s_mov_b32 m0, s6
; MOVREL-NEXT: s_add_u32 s7, s6, 1
; MOVREL-NEXT: v_movrels_b32_e32 v17, v0
; MOVREL-NEXT: s_mov_b32 m0, s7
; MOVREL-NEXT: v_movrels_b32_e32 v18, v0
; MOVREL-NEXT: v_movrels_b32_e32 v18, v1
; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
; MOVREL-NEXT: s_cbranch_execnz BB25_1


@ -286,18 +286,14 @@ body: |
; MOVREL-LABEL: name: extract_vector_elt_s_s32_v8s32_idx_offset_1
; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; MOVREL: $m0 = COPY [[S_ADD_U32_]]
; MOVREL: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub0, implicit $m0, implicit [[COPY]]
; MOVREL: $m0 = COPY [[COPY1]]
; MOVREL: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub1, implicit $m0, implicit [[COPY]]
; MOVREL: S_ENDPGM 0, implicit [[S_MOVRELS_B32_]]
; GPRIDX-LABEL: name: extract_vector_elt_s_s32_v8s32_idx_offset_1
; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; GPRIDX: $m0 = COPY [[S_ADD_U32_]]
; GPRIDX: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub0, implicit $m0, implicit [[COPY]]
; GPRIDX: $m0 = COPY [[COPY1]]
; GPRIDX: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub1, implicit $m0, implicit [[COPY]]
; GPRIDX: S_ENDPGM 0, implicit [[S_MOVRELS_B32_]]
%0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
%1:sgpr(s32) = COPY $sgpr8
@ -352,18 +348,14 @@ body: |
; MOVREL-LABEL: name: extract_vector_elt_s_s32_v8s32_idx_offset_7
; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 7
; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; MOVREL: $m0 = COPY [[S_ADD_U32_]]
; MOVREL: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub0, implicit $m0, implicit [[COPY]]
; MOVREL: $m0 = COPY [[COPY1]]
; MOVREL: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub7, implicit $m0, implicit [[COPY]]
; MOVREL: S_ENDPGM 0, implicit [[S_MOVRELS_B32_]]
; GPRIDX-LABEL: name: extract_vector_elt_s_s32_v8s32_idx_offset_7
; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 7
; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; GPRIDX: $m0 = COPY [[S_ADD_U32_]]
; GPRIDX: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub0, implicit $m0, implicit [[COPY]]
; GPRIDX: $m0 = COPY [[COPY1]]
; GPRIDX: [[S_MOVRELS_B32_:%[0-9]+]]:sreg_32 = S_MOVRELS_B32 [[COPY]].sub7, implicit $m0, implicit [[COPY]]
; GPRIDX: S_ENDPGM 0, implicit [[S_MOVRELS_B32_]]
%0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
%1:sgpr(s32) = COPY $sgpr8
@ -418,18 +410,14 @@ body: |
; MOVREL-LABEL: name: extract_vector_elt_s_s64_v8s64_idx_offset_1
; MOVREL: [[COPY:%[0-9]+]]:sreg_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; MOVREL: $m0 = COPY [[S_ADD_U32_]]
; MOVREL: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub0_sub1, implicit $m0, implicit [[COPY]]
; MOVREL: $m0 = COPY [[COPY1]]
; MOVREL: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub2_sub3, implicit $m0, implicit [[COPY]]
; MOVREL: S_ENDPGM 0, implicit [[S_MOVRELS_B64_]]
; GPRIDX-LABEL: name: extract_vector_elt_s_s64_v8s64_idx_offset_1
; GPRIDX: [[COPY:%[0-9]+]]:sreg_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; GPRIDX: $m0 = COPY [[S_ADD_U32_]]
; GPRIDX: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub0_sub1, implicit $m0, implicit [[COPY]]
; GPRIDX: $m0 = COPY [[COPY1]]
; GPRIDX: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub2_sub3, implicit $m0, implicit [[COPY]]
; GPRIDX: S_ENDPGM 0, implicit [[S_MOVRELS_B64_]]
%0:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
%1:sgpr(s32) = COPY $sgpr8
@ -451,18 +439,14 @@ body: |
; MOVREL-LABEL: name: extract_vector_elt_s_s64_v8s64_idx_offset_2
; MOVREL: [[COPY:%[0-9]+]]:sreg_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2
; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; MOVREL: $m0 = COPY [[S_ADD_U32_]]
; MOVREL: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub0_sub1, implicit $m0, implicit [[COPY]]
; MOVREL: $m0 = COPY [[COPY1]]
; MOVREL: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub4_sub5, implicit $m0, implicit [[COPY]]
; MOVREL: S_ENDPGM 0, implicit [[S_MOVRELS_B64_]]
; GPRIDX-LABEL: name: extract_vector_elt_s_s64_v8s64_idx_offset_2
; GPRIDX: [[COPY:%[0-9]+]]:sreg_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2
; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; GPRIDX: $m0 = COPY [[S_ADD_U32_]]
; GPRIDX: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub0_sub1, implicit $m0, implicit [[COPY]]
; GPRIDX: $m0 = COPY [[COPY1]]
; GPRIDX: [[S_MOVRELS_B64_:%[0-9]+]]:sreg_64 = S_MOVRELS_B64 [[COPY]].sub4_sub5, implicit $m0, implicit [[COPY]]
; GPRIDX: S_ENDPGM 0, implicit [[S_MOVRELS_B64_]]
%0:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
%1:sgpr(s32) = COPY $sgpr8
@ -685,18 +669,14 @@ body: |
; MOVREL-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_1
; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; MOVREL: $m0 = COPY [[S_ADD_U32_]]
; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]]
; MOVREL: $m0 = COPY [[COPY1]]
; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub1, implicit $m0, implicit $exec, implicit [[COPY]]
; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]]
; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_1
; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_U32_]], 1, implicit-def $m0, implicit $m0
; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0
; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit $m0
; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub1, implicit $exec, implicit [[COPY]], implicit $m0
; GPRIDX: S_SET_GPR_IDX_OFF
; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]]
%0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
@ -753,18 +733,14 @@ body: |
; MOVREL-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_7
; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 7
; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; MOVREL: $m0 = COPY [[S_ADD_U32_]]
; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub0, implicit $m0, implicit $exec, implicit [[COPY]]
; MOVREL: $m0 = COPY [[COPY1]]
; MOVREL: [[V_MOVRELS_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOVRELS_B32_e32 undef [[COPY]].sub7, implicit $m0, implicit $exec, implicit [[COPY]]
; MOVREL: S_ENDPGM 0, implicit [[V_MOVRELS_B32_e32_]]
; GPRIDX-LABEL: name: extract_vector_elt_v_s32_v8s32_idx_offset_7
; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 7
; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_U32_]], 1, implicit-def $m0, implicit $m0
; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub0, implicit $exec, implicit [[COPY]], implicit $m0
; GPRIDX: S_SET_GPR_IDX_ON [[COPY1]], 1, implicit-def $m0, implicit $m0
; GPRIDX: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY]].sub7, implicit $exec, implicit [[COPY]], implicit $m0
; GPRIDX: S_SET_GPR_IDX_OFF
; GPRIDX: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]]
%0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7


@ -436,10 +436,10 @@ body: |
; WAVE64: successors: %bb.1(0x80000000)
; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; WAVE64: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@ -447,9 +447,11 @@ body: |
; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C1]]
; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[ADD1]](s32)
; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@ -462,10 +464,10 @@ body: |
; WAVE32: successors: %bb.1(0x80000000)
; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; WAVE32: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
@ -473,9 +475,11 @@ body: |
; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C1]]
; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[ADD1]](s32)
; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@ -643,10 +647,10 @@ body: |
; WAVE64: successors: %bb.1(0x80000000)
; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
; WAVE64: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; WAVE64: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE64: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
@ -662,12 +666,14 @@ body: |
; WAVE64: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %11(s32), %bb.1
; WAVE64: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %6(s32), %bb.1
; WAVE64: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %7(s32), %bb.1
; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C1]](s32)
; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C2]]
; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ADD1]], [[C1]](s32)
; WAVE64: [[ADD2:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
; WAVE64: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD1]](s32)
; WAVE64: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD2]](s32)
; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@ -681,10 +687,10 @@ body: |
; WAVE32: successors: %bb.1(0x80000000)
; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
; WAVE32: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; WAVE32: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE32: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
@ -700,12 +706,14 @@ body: |
; WAVE32: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %11(s32), %bb.1
; WAVE32: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %6(s32), %bb.1
; WAVE32: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %7(s32), %bb.1
; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C1]](s32)
; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C2]]
; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ADD1]], [[C1]](s32)
; WAVE32: [[ADD2:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
; WAVE32: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD1]](s32)
; WAVE32: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD2]](s32)
; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@ -736,10 +744,10 @@ body: |
; WAVE64: successors: %bb.1(0x80000000)
; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
; WAVE64: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; WAVE64: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@ -747,9 +755,11 @@ body: |
; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C1]]
; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[ADD1]](s32)
; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
@ -763,10 +773,10 @@ body: |
; WAVE32: successors: %bb.1(0x80000000)
; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
; WAVE32: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; WAVE32: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
@ -774,9 +784,11 @@ body: |
; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C1]]
; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[ADD1]](s32)
; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
@ -807,10 +819,10 @@ body: |
; WAVE64: successors: %bb.1(0x80000000)
; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
; WAVE64: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; WAVE64: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE64: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
@ -826,12 +838,14 @@ body: |
; WAVE64: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %11(s32), %bb.1
; WAVE64: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %6(s32), %bb.1
; WAVE64: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %7(s32), %bb.1
; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C1]](s32)
; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C2]]
; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ADD1]], [[C1]](s32)
; WAVE64: [[ADD2:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
; WAVE64: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD1]](s32)
; WAVE64: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD2]](s32)
; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
; WAVE64: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC1]](s32), implicit $exec
; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
@ -847,10 +861,10 @@ body: |
; WAVE32: successors: %bb.1(0x80000000)
; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
; WAVE32: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; WAVE32: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
; WAVE32: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
@ -866,12 +880,14 @@ body: |
; WAVE32: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %11(s32), %bb.1
; WAVE32: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %6(s32), %bb.1
; WAVE32: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %7(s32), %bb.1
; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C1]](s32)
; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C2]]
; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ADD1]], [[C1]](s32)
; WAVE32: [[ADD2:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
; WAVE32: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD1]](s32)
; WAVE32: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD2]](s32)
; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
; WAVE32: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC1]](s32), implicit $exec
; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec