AMDGPU/GlobalISel: Fold add of constant into G_INSERT_VECTOR_ELT

Move the subregister base like in the extract case.
This commit is contained in:
Matt Arsenault 2020-01-03 23:13:15 -05:00 committed by Matt Arsenault
parent 349f6bb873
commit 52ec7379ad
4 changed files with 260 additions and 267 deletions

View File

@ -1930,12 +1930,15 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
return false;
unsigned SubReg;
std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
ValSize / 8);
const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
STI.useVGPRIndexMode();
MachineBasicBlock *BB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
unsigned SubReg = ValSize == 64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
if (IndexMode) {
BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))

View File

@ -1511,6 +1511,25 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}
/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops.
///
/// Rewrites operand \p OpIdx of \p IdxUseInstr from its current index register
/// to (index + \p ConstOffset): the constant is materialized and the add is
/// emitted immediately before \p IdxUseInstr, so both land inside the
/// waterfall loop body rather than before it.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
MachineInstr &IdxUseInstr,
unsigned OpIdx,
unsigned ConstOffset) {
MachineRegisterInfo &MRI = *B.getMRI();
const LLT S32 = LLT::scalar(32);
Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
// Insert directly before the index use so the new instructions execute on
// every iteration of the surrounding waterfall loop.
B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
// These instructions are created after register bank selection has run, so
// assign their banks explicitly (SGPR, matching the readfirstlane'd index).
MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
}
void AMDGPURegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@ -2011,20 +2030,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
ConstOffset > 0 &&
ConstOffset < SrcTy.getNumElements();
// Re-insert the constant offset add inside the waterfall loop.
auto ReinsertIndexAdd = [=, &B, &MRI](MachineInstr &IdxUseInstr,
unsigned OpIdx) {
Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
};
// Move the base register. We'll re-insert the add later.
if (ShouldMoveIndexIntoLoop)
MI.getOperand(2).setReg(BaseIdxReg);
@ -2051,8 +2056,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
buildVCopy(B, DstReg, TmpReg);
}
// Re-insert the constant offset add inside the waterfall loop.
if (ShouldMoveIndexIntoLoop)
ReinsertIndexAdd(MI, 2);
reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
return;
}
@ -2113,7 +2119,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
if (ShouldMoveIndexIntoLoop)
ReinsertIndexAdd(*IdxLo, 1);
reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
return;
}
@ -2126,26 +2132,53 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(0).empty());
assert(OpdMapper.getVRegs(3).empty());
const RegisterBank *IdxBank =
OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
if (substituteSimpleCopyRegs(OpdMapper, 1))
MRI.setType(MI.getOperand(1).getReg(), VecTy);
Register SrcReg = MI.getOperand(1).getReg();
Register InsReg = MI.getOperand(2).getReg();
LLT InsTy = MRI.getType(InsReg);
(void)InsTy;
Register BaseIdxReg;
unsigned ConstOffset;
MachineInstr *OffsetDef;
std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
// See if the index is an add of a constant which will be foldable by moving
// the base register of the index later if this is going to be executed in a
// waterfall loop. This is essentially to reassociate the add of a constant
// with the readfirstlane.
bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
ConstOffset > 0 &&
ConstOffset < VecTy.getNumElements();
// Move the base register. We'll re-insert the add later.
if (ShouldMoveIndexIntoLoop)
MI.getOperand(3).setReg(BaseIdxReg);
if (InsRegs.empty()) {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, { 3 });
// Re-insert the constant offset add inside the waterfall loop.
if (ShouldMoveIndexIntoLoop) {
MachineIRBuilder B(MI);
reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
}
return;
}
Register SrcReg = MI.getOperand(1).getReg();
Register InsReg = MI.getOperand(2).getReg();
Register IdxReg = MI.getOperand(3).getReg();
LLT SrcTy = MRI.getType(SrcReg);
LLT InsTy = MRI.getType(InsReg);
(void)InsTy;
assert(InsTy.getSizeInBits() == 64);
const LLT S32 = LLT::scalar(32);
LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
MachineIRBuilder B(MI);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
@ -2158,7 +2191,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
auto IdxLo = B.buildShl(S32, IdxReg, One);
auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
auto IdxHi = B.buildAdd(S32, IdxLo, One);
auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
@ -2192,6 +2225,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
OpsToWaterfall, MRI);
// Re-insert the constant offset add inside the waterfall loop.
if (ShouldMoveIndexIntoLoop)
reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
return;
}
case AMDGPU::G_INTRINSIC: {

View File

@ -747,10 +747,9 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
; GPRIDX-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s6, v2
; GPRIDX-NEXT: s_lshl_b32 s7, s6, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2
; GPRIDX-NEXT: s_lshl_b32 s6, s6, 1
; GPRIDX-NEXT: s_add_u32 s7, s6, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(DST)
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v3, v19
; GPRIDX-NEXT: v_mov_b32_e32 v4, v20
; GPRIDX-NEXT: v_mov_b32_e32 v5, v21
@ -770,7 +769,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: v_mov_b32_e32 v3, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v3, v1
; GPRIDX-NEXT: v_mov_b32_e32 v4, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
@ -831,13 +830,11 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: v_mov_b32_e32 v4, v20
; MOVREL-NEXT: v_mov_b32_e32 v5, v21
; MOVREL-NEXT: v_mov_b32_e32 v6, v22
; MOVREL-NEXT: s_lshl_b32 s6, s5, 1
; MOVREL-NEXT: s_lshl_b32 m0, s5, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2
; MOVREL-NEXT: v_mov_b32_e32 v7, v23
; MOVREL-NEXT: v_mov_b32_e32 v8, v24
; MOVREL-NEXT: v_mov_b32_e32 v9, v25
; MOVREL-NEXT: s_add_u32 s5, s6, 1
; MOVREL-NEXT: s_mov_b32 m0, s6
; MOVREL-NEXT: v_mov_b32_e32 v10, v26
; MOVREL-NEXT: v_mov_b32_e32 v11, v27
; MOVREL-NEXT: v_mov_b32_e32 v12, v28
@ -848,8 +845,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: v_mov_b32_e32 v17, v33
; MOVREL-NEXT: v_mov_b32_e32 v18, v34
; MOVREL-NEXT: v_movreld_b32_e32 v3, v0
; MOVREL-NEXT: s_mov_b32 m0, s5
; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
; MOVREL-NEXT: v_movreld_b32_e32 v4, v1
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB13_1
@ -916,10 +912,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
; GPRIDX-NEXT: BB14_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v0
; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1
; GPRIDX-NEXT: s_add_u32 s3, s2, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v1, v17
; GPRIDX-NEXT: v_mov_b32_e32 v2, v18
; GPRIDX-NEXT: v_mov_b32_e32 v3, v19
@ -939,7 +934,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
; GPRIDX-NEXT: v_mov_b32_e32 v1, s18
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v1, s19
; GPRIDX-NEXT: v_mov_b32_e32 v2, s19
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
@ -994,13 +989,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
; MOVREL-NEXT: v_mov_b32_e32 v2, v18
; MOVREL-NEXT: v_mov_b32_e32 v3, v19
; MOVREL-NEXT: v_mov_b32_e32 v4, v20
; MOVREL-NEXT: s_lshl_b32 s2, s1, 1
; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0
; MOVREL-NEXT: v_mov_b32_e32 v5, v21
; MOVREL-NEXT: v_mov_b32_e32 v6, v22
; MOVREL-NEXT: v_mov_b32_e32 v7, v23
; MOVREL-NEXT: s_add_u32 s1, s2, 1
; MOVREL-NEXT: s_mov_b32 m0, s2
; MOVREL-NEXT: v_mov_b32_e32 v8, v24
; MOVREL-NEXT: v_mov_b32_e32 v9, v25
; MOVREL-NEXT: v_mov_b32_e32 v10, v26
@ -1011,8 +1004,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
; MOVREL-NEXT: v_mov_b32_e32 v15, v31
; MOVREL-NEXT: v_mov_b32_e32 v16, v32
; MOVREL-NEXT: v_movreld_b32_e32 v1, s18
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_movreld_b32_e32 v1, s19
; MOVREL-NEXT: v_movreld_b32_e32 v2, s19
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB14_1
@ -1072,12 +1064,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
; GPRIDX-NEXT: v_mov_b32_e32 v3, s1
; GPRIDX-NEXT: v_mov_b32_e32 v2, s0
; GPRIDX-NEXT: s_lshl_b32 s0, s18, 1
; GPRIDX-NEXT: s_add_u32 s1, s0, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v2, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v2, v1
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v3, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
@ -1103,12 +1094,12 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
; MOVREL-NEXT: s_mov_b32 s10, s12
; MOVREL-NEXT: s_mov_b32 s12, s14
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_lshl_b32 s16, s18, 1
; MOVREL-NEXT: v_mov_b32_e32 v17, s15
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: s_lshl_b32 m0, s18, 1
; MOVREL-NEXT: v_mov_b32_e32 v15, s13
; MOVREL-NEXT: v_mov_b32_e32 v14, s12
; MOVREL-NEXT: v_mov_b32_e32 v16, s14
; MOVREL-NEXT: s_mov_b32 m0, s16
; MOVREL-NEXT: v_mov_b32_e32 v14, s12
; MOVREL-NEXT: v_mov_b32_e32 v13, s11
; MOVREL-NEXT: v_mov_b32_e32 v12, s10
; MOVREL-NEXT: v_mov_b32_e32 v11, s9
@ -1120,15 +1111,12 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
; MOVREL-NEXT: v_mov_b32_e32 v5, s3
; MOVREL-NEXT: v_mov_b32_e32 v4, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: s_add_u32 s0, s16, 1
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
; MOVREL-NEXT: s_mov_b32 m0, s0
; MOVREL-NEXT: v_movreld_b32_e32 v2, v1
; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
; MOVREL-NEXT: s_endpgm
entry:
@ -1148,12 +1136,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i
; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_s:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_lshl_b32 s0, s4, 1
; GPRIDX-NEXT: s_add_u32 s1, s0, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v0, s2
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v0, s3
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v1, s3
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
@ -1163,13 +1150,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i
;
; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_lshl_b32 s0, s4, 1
; MOVREL-NEXT: s_lshl_b32 m0, s4, 1
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: s_mov_b32 m0, s0
; MOVREL-NEXT: s_add_u32 s0, s0, 1
; MOVREL-NEXT: v_movreld_b32_e32 v0, s2
; MOVREL-NEXT: s_mov_b32 m0, s0
; MOVREL-NEXT: v_movreld_b32_e32 v0, s3
; MOVREL-NEXT: v_movreld_b32_e32 v1, s3
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
@ -1226,10 +1210,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
; GPRIDX-NEXT: BB17_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2
; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1
; GPRIDX-NEXT: s_add_u32 s3, s2, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v3, v19
; GPRIDX-NEXT: v_mov_b32_e32 v4, v20
; GPRIDX-NEXT: v_mov_b32_e32 v5, v21
@ -1249,7 +1232,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
; GPRIDX-NEXT: v_mov_b32_e32 v3, v0
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v3, v1
; GPRIDX-NEXT: v_mov_b32_e32 v4, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
@ -1304,13 +1287,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
; MOVREL-NEXT: v_mov_b32_e32 v4, v20
; MOVREL-NEXT: v_mov_b32_e32 v5, v21
; MOVREL-NEXT: v_mov_b32_e32 v6, v22
; MOVREL-NEXT: s_lshl_b32 s2, s1, 1
; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v2
; MOVREL-NEXT: v_mov_b32_e32 v7, v23
; MOVREL-NEXT: v_mov_b32_e32 v8, v24
; MOVREL-NEXT: v_mov_b32_e32 v9, v25
; MOVREL-NEXT: s_add_u32 s1, s2, 1
; MOVREL-NEXT: s_mov_b32 m0, s2
; MOVREL-NEXT: v_mov_b32_e32 v10, v26
; MOVREL-NEXT: v_mov_b32_e32 v11, v27
; MOVREL-NEXT: v_mov_b32_e32 v12, v28
@ -1321,8 +1302,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
; MOVREL-NEXT: v_mov_b32_e32 v17, v33
; MOVREL-NEXT: v_mov_b32_e32 v18, v34
; MOVREL-NEXT: v_movreld_b32_e32 v3, v0
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
; MOVREL-NEXT: v_movreld_b32_e32 v4, v1
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB17_1
@ -1352,10 +1332,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
; GPRIDX-NEXT: BB18_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s4, v16
; GPRIDX-NEXT: s_lshl_b32 s5, s4, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16
; GPRIDX-NEXT: s_lshl_b32 s4, s4, 1
; GPRIDX-NEXT: s_add_u32 s5, s4, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v32, v15
; GPRIDX-NEXT: v_mov_b32_e32 v31, v14
; GPRIDX-NEXT: v_mov_b32_e32 v30, v13
@ -1375,7 +1354,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
; GPRIDX-NEXT: v_mov_b32_e32 v17, s2
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v17, s3
; GPRIDX-NEXT: v_mov_b32_e32 v18, s3
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
@ -1398,13 +1377,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
; MOVREL-NEXT: v_mov_b32_e32 v17, v0
; MOVREL-NEXT: v_mov_b32_e32 v31, v14
; MOVREL-NEXT: v_mov_b32_e32 v30, v13
; MOVREL-NEXT: s_lshl_b32 s4, s1, 1
; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v16
; MOVREL-NEXT: v_mov_b32_e32 v29, v12
; MOVREL-NEXT: v_mov_b32_e32 v28, v11
; MOVREL-NEXT: v_mov_b32_e32 v27, v10
; MOVREL-NEXT: s_add_u32 s1, s4, 1
; MOVREL-NEXT: s_mov_b32 m0, s4
; MOVREL-NEXT: v_mov_b32_e32 v26, v9
; MOVREL-NEXT: v_mov_b32_e32 v25, v8
; MOVREL-NEXT: v_mov_b32_e32 v24, v7
@ -1415,8 +1392,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
; MOVREL-NEXT: v_mov_b32_e32 v19, v2
; MOVREL-NEXT: v_mov_b32_e32 v18, v1
; MOVREL-NEXT: v_movreld_b32_e32 v17, s2
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_movreld_b32_e32 v17, s3
; MOVREL-NEXT: v_movreld_b32_e32 v18, s3
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB18_1
@ -1444,12 +1420,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %
; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_s:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1
; GPRIDX-NEXT: s_add_u32 s1, s0, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v0, v16
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v0, v17
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v1, v17
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
@ -1459,13 +1434,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %
;
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_lshl_b32 s0, s2, 1
; MOVREL-NEXT: s_lshl_b32 m0, s2, 1
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: s_mov_b32 m0, s0
; MOVREL-NEXT: s_add_u32 s0, s0, 1
; MOVREL-NEXT: v_movreld_b32_e32 v0, v16
; MOVREL-NEXT: s_mov_b32 m0, s0
; MOVREL-NEXT: v_movreld_b32_e32 v0, v17
; MOVREL-NEXT: v_movreld_b32_e32 v1, v17
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
@ -1490,10 +1462,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
; GPRIDX-NEXT: BB20_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18
; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18
; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1
; GPRIDX-NEXT: s_add_u32 s3, s2, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v34, v15
; GPRIDX-NEXT: v_mov_b32_e32 v33, v14
; GPRIDX-NEXT: v_mov_b32_e32 v32, v13
@ -1513,7 +1484,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
; GPRIDX-NEXT: v_mov_b32_e32 v19, v16
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v19, v17
; GPRIDX-NEXT: v_mov_b32_e32 v20, v17
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
@ -1536,13 +1507,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
; MOVREL-NEXT: v_mov_b32_e32 v19, v0
; MOVREL-NEXT: v_mov_b32_e32 v33, v14
; MOVREL-NEXT: v_mov_b32_e32 v32, v13
; MOVREL-NEXT: s_lshl_b32 s2, s1, 1
; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18
; MOVREL-NEXT: v_mov_b32_e32 v31, v12
; MOVREL-NEXT: v_mov_b32_e32 v30, v11
; MOVREL-NEXT: v_mov_b32_e32 v29, v10
; MOVREL-NEXT: s_add_u32 s1, s2, 1
; MOVREL-NEXT: s_mov_b32 m0, s2
; MOVREL-NEXT: v_mov_b32_e32 v28, v9
; MOVREL-NEXT: v_mov_b32_e32 v27, v8
; MOVREL-NEXT: v_mov_b32_e32 v26, v7
@ -1553,8 +1522,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
; MOVREL-NEXT: v_mov_b32_e32 v21, v2
; MOVREL-NEXT: v_mov_b32_e32 v20, v1
; MOVREL-NEXT: v_movreld_b32_e32 v19, v16
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_movreld_b32_e32 v19, v17
; MOVREL-NEXT: v_movreld_b32_e32 v20, v17
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB20_1
@ -1783,9 +1751,9 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> in
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_add_u32 m0, s11, 1
; GPRIDX-NEXT: s_mov_b32 m0, s11
; GPRIDX-NEXT: s_nop 0
; GPRIDX-NEXT: s_movreld_b32 s0, s10
; GPRIDX-NEXT: s_movreld_b32 s1, s10
; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
@ -1798,16 +1766,16 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> in
;
; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_1:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_add_u32 m0, s11, 1
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 m0, s11
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_movreld_b32 s0, s10
; MOVREL-NEXT: s_movreld_b32 s1, s10
; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: v_mov_b32_e32 v1, s1
; MOVREL-NEXT: v_mov_b32_e32 v2, s2
@ -1835,9 +1803,9 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_7(<8 x float> in
; GPRIDX-NEXT: s_mov_b32 s5, s7
; GPRIDX-NEXT: s_mov_b32 s6, s8
; GPRIDX-NEXT: s_mov_b32 s7, s9
; GPRIDX-NEXT: s_add_u32 m0, s11, 7
; GPRIDX-NEXT: s_mov_b32 m0, s11
; GPRIDX-NEXT: s_nop 0
; GPRIDX-NEXT: s_movreld_b32 s0, s10
; GPRIDX-NEXT: s_movreld_b32 s7, s10
; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
@ -1850,16 +1818,16 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_7(<8 x float> in
;
; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_7:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_add_u32 m0, s11, 7
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_movreld_b32 s0, s10
; MOVREL-NEXT: s_mov_b32 m0, s11
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: s_movreld_b32 s7, s10
; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: v_mov_b32_e32 v1, s1
; MOVREL-NEXT: v_mov_b32_e32 v2, s2
@ -1879,68 +1847,66 @@ entry:
define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_1(<8 x float> %vec, float %val, i32 %idx) {
; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_1:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: v_add_u32_e32 v17, 1, v9
; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
; GPRIDX-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v17
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v17
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9
; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v16, v7
; GPRIDX-NEXT: v_mov_b32_e32 v15, v6
; GPRIDX-NEXT: v_mov_b32_e32 v14, v5
; GPRIDX-NEXT: v_mov_b32_e32 v13, v4
; GPRIDX-NEXT: v_mov_b32_e32 v12, v3
; GPRIDX-NEXT: v_mov_b32_e32 v11, v2
; GPRIDX-NEXT: v_mov_b32_e32 v10, v1
; GPRIDX-NEXT: v_mov_b32_e32 v9, v0
; GPRIDX-NEXT: v_mov_b32_e32 v9, v8
; GPRIDX-NEXT: v_mov_b32_e32 v17, v7
; GPRIDX-NEXT: v_mov_b32_e32 v16, v6
; GPRIDX-NEXT: v_mov_b32_e32 v15, v5
; GPRIDX-NEXT: v_mov_b32_e32 v14, v4
; GPRIDX-NEXT: v_mov_b32_e32 v13, v3
; GPRIDX-NEXT: v_mov_b32_e32 v12, v2
; GPRIDX-NEXT: v_mov_b32_e32 v11, v1
; GPRIDX-NEXT: v_mov_b32_e32 v10, v0
; GPRIDX-NEXT: v_mov_b32_e32 v11, v8
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
; GPRIDX-NEXT: s_cbranch_execnz BB29_1
; GPRIDX-NEXT: ; %bb.2:
; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
; GPRIDX-NEXT: v_mov_b32_e32 v1, v10
; GPRIDX-NEXT: v_mov_b32_e32 v2, v11
; GPRIDX-NEXT: v_mov_b32_e32 v3, v12
; GPRIDX-NEXT: v_mov_b32_e32 v4, v13
; GPRIDX-NEXT: v_mov_b32_e32 v5, v14
; GPRIDX-NEXT: v_mov_b32_e32 v6, v15
; GPRIDX-NEXT: v_mov_b32_e32 v7, v16
; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
; GPRIDX-NEXT: v_mov_b32_e32 v1, v11
; GPRIDX-NEXT: v_mov_b32_e32 v2, v12
; GPRIDX-NEXT: v_mov_b32_e32 v3, v13
; GPRIDX-NEXT: v_mov_b32_e32 v4, v14
; GPRIDX-NEXT: v_mov_b32_e32 v5, v15
; GPRIDX-NEXT: v_mov_b32_e32 v6, v16
; GPRIDX-NEXT: v_mov_b32_e32 v7, v17
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_1:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_add_nc_u32_e32 v17, 1, v9
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s1, v17
; MOVREL-NEXT: v_mov_b32_e32 v16, v7
; MOVREL-NEXT: v_mov_b32_e32 v9, v0
; MOVREL-NEXT: v_mov_b32_e32 v15, v6
; MOVREL-NEXT: v_mov_b32_e32 v14, v5
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v17
; MOVREL-NEXT: v_readfirstlane_b32 s1, v9
; MOVREL-NEXT: v_mov_b32_e32 v17, v7
; MOVREL-NEXT: v_mov_b32_e32 v11, v1
; MOVREL-NEXT: v_mov_b32_e32 v16, v6
; MOVREL-NEXT: v_mov_b32_e32 v15, v5
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_mov_b32_e32 v13, v4
; MOVREL-NEXT: v_mov_b32_e32 v12, v3
; MOVREL-NEXT: v_mov_b32_e32 v11, v2
; MOVREL-NEXT: v_mov_b32_e32 v10, v1
; MOVREL-NEXT: v_movreld_b32_e32 v9, v8
; MOVREL-NEXT: v_mov_b32_e32 v14, v4
; MOVREL-NEXT: v_mov_b32_e32 v13, v3
; MOVREL-NEXT: v_mov_b32_e32 v12, v2
; MOVREL-NEXT: v_mov_b32_e32 v10, v0
; MOVREL-NEXT: v_movreld_b32_e32 v11, v8
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB29_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
; MOVREL-NEXT: v_mov_b32_e32 v0, v9
; MOVREL-NEXT: v_mov_b32_e32 v1, v10
; MOVREL-NEXT: v_mov_b32_e32 v2, v11
; MOVREL-NEXT: v_mov_b32_e32 v3, v12
; MOVREL-NEXT: v_mov_b32_e32 v4, v13
; MOVREL-NEXT: v_mov_b32_e32 v5, v14
; MOVREL-NEXT: v_mov_b32_e32 v6, v15
; MOVREL-NEXT: v_mov_b32_e32 v7, v16
; MOVREL-NEXT: v_mov_b32_e32 v0, v10
; MOVREL-NEXT: v_mov_b32_e32 v1, v11
; MOVREL-NEXT: v_mov_b32_e32 v2, v12
; MOVREL-NEXT: v_mov_b32_e32 v3, v13
; MOVREL-NEXT: v_mov_b32_e32 v4, v14
; MOVREL-NEXT: v_mov_b32_e32 v5, v15
; MOVREL-NEXT: v_mov_b32_e32 v6, v16
; MOVREL-NEXT: v_mov_b32_e32 v7, v17
; MOVREL-NEXT: ; return to shader part epilog
entry:
%idx.add = add i32 %idx, 1
@ -1951,68 +1917,66 @@ entry:
define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_7(<8 x float> %vec, float %val, i32 %idx) {
; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_7:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: v_add_u32_e32 v17, 7, v9
; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
; GPRIDX-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v17
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v17
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9
; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v16, v7
; GPRIDX-NEXT: v_mov_b32_e32 v15, v6
; GPRIDX-NEXT: v_mov_b32_e32 v14, v5
; GPRIDX-NEXT: v_mov_b32_e32 v13, v4
; GPRIDX-NEXT: v_mov_b32_e32 v12, v3
; GPRIDX-NEXT: v_mov_b32_e32 v11, v2
; GPRIDX-NEXT: v_mov_b32_e32 v10, v1
; GPRIDX-NEXT: v_mov_b32_e32 v9, v0
; GPRIDX-NEXT: v_mov_b32_e32 v9, v8
; GPRIDX-NEXT: v_mov_b32_e32 v17, v7
; GPRIDX-NEXT: v_mov_b32_e32 v16, v6
; GPRIDX-NEXT: v_mov_b32_e32 v15, v5
; GPRIDX-NEXT: v_mov_b32_e32 v14, v4
; GPRIDX-NEXT: v_mov_b32_e32 v13, v3
; GPRIDX-NEXT: v_mov_b32_e32 v12, v2
; GPRIDX-NEXT: v_mov_b32_e32 v11, v1
; GPRIDX-NEXT: v_mov_b32_e32 v10, v0
; GPRIDX-NEXT: v_mov_b32_e32 v17, v8
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
; GPRIDX-NEXT: s_cbranch_execnz BB30_1
; GPRIDX-NEXT: ; %bb.2:
; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
; GPRIDX-NEXT: v_mov_b32_e32 v1, v10
; GPRIDX-NEXT: v_mov_b32_e32 v2, v11
; GPRIDX-NEXT: v_mov_b32_e32 v3, v12
; GPRIDX-NEXT: v_mov_b32_e32 v4, v13
; GPRIDX-NEXT: v_mov_b32_e32 v5, v14
; GPRIDX-NEXT: v_mov_b32_e32 v6, v15
; GPRIDX-NEXT: v_mov_b32_e32 v7, v16
; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
; GPRIDX-NEXT: v_mov_b32_e32 v1, v11
; GPRIDX-NEXT: v_mov_b32_e32 v2, v12
; GPRIDX-NEXT: v_mov_b32_e32 v3, v13
; GPRIDX-NEXT: v_mov_b32_e32 v4, v14
; GPRIDX-NEXT: v_mov_b32_e32 v5, v15
; GPRIDX-NEXT: v_mov_b32_e32 v6, v16
; GPRIDX-NEXT: v_mov_b32_e32 v7, v17
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_7:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_add_nc_u32_e32 v17, 7, v9
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s1, v17
; MOVREL-NEXT: v_mov_b32_e32 v16, v7
; MOVREL-NEXT: v_mov_b32_e32 v9, v0
; MOVREL-NEXT: v_mov_b32_e32 v15, v6
; MOVREL-NEXT: v_mov_b32_e32 v14, v5
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v17
; MOVREL-NEXT: v_readfirstlane_b32 s1, v9
; MOVREL-NEXT: v_mov_b32_e32 v17, v7
; MOVREL-NEXT: v_mov_b32_e32 v16, v6
; MOVREL-NEXT: v_mov_b32_e32 v15, v5
; MOVREL-NEXT: v_mov_b32_e32 v14, v4
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_mov_b32_e32 v13, v4
; MOVREL-NEXT: v_mov_b32_e32 v12, v3
; MOVREL-NEXT: v_mov_b32_e32 v11, v2
; MOVREL-NEXT: v_mov_b32_e32 v10, v1
; MOVREL-NEXT: v_movreld_b32_e32 v9, v8
; MOVREL-NEXT: v_mov_b32_e32 v13, v3
; MOVREL-NEXT: v_mov_b32_e32 v12, v2
; MOVREL-NEXT: v_mov_b32_e32 v11, v1
; MOVREL-NEXT: v_mov_b32_e32 v10, v0
; MOVREL-NEXT: v_movreld_b32_e32 v17, v8
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB30_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
; MOVREL-NEXT: v_mov_b32_e32 v0, v9
; MOVREL-NEXT: v_mov_b32_e32 v1, v10
; MOVREL-NEXT: v_mov_b32_e32 v2, v11
; MOVREL-NEXT: v_mov_b32_e32 v3, v12
; MOVREL-NEXT: v_mov_b32_e32 v4, v13
; MOVREL-NEXT: v_mov_b32_e32 v5, v14
; MOVREL-NEXT: v_mov_b32_e32 v6, v15
; MOVREL-NEXT: v_mov_b32_e32 v7, v16
; MOVREL-NEXT: v_mov_b32_e32 v0, v10
; MOVREL-NEXT: v_mov_b32_e32 v1, v11
; MOVREL-NEXT: v_mov_b32_e32 v2, v12
; MOVREL-NEXT: v_mov_b32_e32 v3, v13
; MOVREL-NEXT: v_mov_b32_e32 v4, v14
; MOVREL-NEXT: v_mov_b32_e32 v5, v15
; MOVREL-NEXT: v_mov_b32_e32 v6, v16
; MOVREL-NEXT: v_mov_b32_e32 v7, v17
; MOVREL-NEXT: ; return to shader part epilog
entry:
%idx.add = add i32 %idx, 7
@ -2039,9 +2003,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: s_mov_b32 s14, s16
; GPRIDX-NEXT: s_mov_b32 s15, s17
; GPRIDX-NEXT: s_add_u32 m0, s20, 1
; GPRIDX-NEXT: s_mov_b32 m0, s20
; GPRIDX-NEXT: s_nop 0
; GPRIDX-NEXT: s_movreld_b64 s[0:1], s[18:19]
; GPRIDX-NEXT: s_movreld_b64 s[2:3], s[18:19]
; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
@ -2071,9 +2035,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: s_mov_b32 s0, s2
; MOVREL-NEXT: s_mov_b32 s1, s3
; MOVREL-NEXT: s_add_u32 m0, s20, 1
; MOVREL-NEXT: s_mov_b32 s2, s4
; MOVREL-NEXT: s_mov_b32 s3, s5
; MOVREL-NEXT: s_mov_b32 m0, s20
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s5, s7
; MOVREL-NEXT: s_mov_b32 s6, s8
@ -2086,7 +2050,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
; MOVREL-NEXT: s_mov_b32 s13, s15
; MOVREL-NEXT: s_mov_b32 s14, s16
; MOVREL-NEXT: s_mov_b32 s15, s17
; MOVREL-NEXT: s_movreld_b64 s[0:1], s[18:19]
; MOVREL-NEXT: s_movreld_b64 s[2:3], s[18:19]
; MOVREL-NEXT: v_mov_b32_e32 v0, s0
; MOVREL-NEXT: v_mov_b32_e32 v4, s4
; MOVREL-NEXT: v_mov_b32_e32 v8, s8
@ -2126,85 +2090,81 @@ entry:
define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, double %val, i32 %idx) {
; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v_add_1:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: v_add_u32_e32 v34, 1, v18
; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
; GPRIDX-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v34
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v34
; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18
; GPRIDX-NEXT: s_add_u32 s3, s2, 1
; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v33, v15
; GPRIDX-NEXT: v_mov_b32_e32 v32, v14
; GPRIDX-NEXT: v_mov_b32_e32 v31, v13
; GPRIDX-NEXT: v_mov_b32_e32 v30, v12
; GPRIDX-NEXT: v_mov_b32_e32 v29, v11
; GPRIDX-NEXT: v_mov_b32_e32 v28, v10
; GPRIDX-NEXT: v_mov_b32_e32 v27, v9
; GPRIDX-NEXT: v_mov_b32_e32 v26, v8
; GPRIDX-NEXT: v_mov_b32_e32 v25, v7
; GPRIDX-NEXT: v_mov_b32_e32 v24, v6
; GPRIDX-NEXT: v_mov_b32_e32 v23, v5
; GPRIDX-NEXT: v_mov_b32_e32 v22, v4
; GPRIDX-NEXT: v_mov_b32_e32 v21, v3
; GPRIDX-NEXT: v_mov_b32_e32 v20, v2
; GPRIDX-NEXT: v_mov_b32_e32 v19, v1
; GPRIDX-NEXT: v_mov_b32_e32 v18, v0
; GPRIDX-NEXT: v_mov_b32_e32 v18, v16
; GPRIDX-NEXT: s_lshl_b32 s3, s3, 1
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18
; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v34, v15
; GPRIDX-NEXT: v_mov_b32_e32 v33, v14
; GPRIDX-NEXT: v_mov_b32_e32 v32, v13
; GPRIDX-NEXT: v_mov_b32_e32 v31, v12
; GPRIDX-NEXT: v_mov_b32_e32 v30, v11
; GPRIDX-NEXT: v_mov_b32_e32 v29, v10
; GPRIDX-NEXT: v_mov_b32_e32 v28, v9
; GPRIDX-NEXT: v_mov_b32_e32 v27, v8
; GPRIDX-NEXT: v_mov_b32_e32 v26, v7
; GPRIDX-NEXT: v_mov_b32_e32 v25, v6
; GPRIDX-NEXT: v_mov_b32_e32 v24, v5
; GPRIDX-NEXT: v_mov_b32_e32 v23, v4
; GPRIDX-NEXT: v_mov_b32_e32 v22, v3
; GPRIDX-NEXT: v_mov_b32_e32 v21, v2
; GPRIDX-NEXT: v_mov_b32_e32 v20, v1
; GPRIDX-NEXT: v_mov_b32_e32 v19, v0
; GPRIDX-NEXT: v_mov_b32_e32 v19, v16
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
; GPRIDX-NEXT: v_mov_b32_e32 v18, v17
; GPRIDX-NEXT: v_mov_b32_e32 v20, v17
; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
; GPRIDX-NEXT: s_cbranch_execnz BB32_1
; GPRIDX-NEXT: ; %bb.2:
; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[22:25], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[30:33], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[31:34], off
; GPRIDX-NEXT: s_endpgm
;
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_add_nc_u32_e32 v34, 1, v18
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s1, v34
; MOVREL-NEXT: v_mov_b32_e32 v33, v15
; MOVREL-NEXT: v_mov_b32_e32 v18, v0
; MOVREL-NEXT: v_mov_b32_e32 v32, v14
; MOVREL-NEXT: v_mov_b32_e32 v31, v13
; MOVREL-NEXT: s_lshl_b32 s2, s1, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v34
; MOVREL-NEXT: v_mov_b32_e32 v30, v12
; MOVREL-NEXT: v_mov_b32_e32 v29, v11
; MOVREL-NEXT: v_mov_b32_e32 v28, v10
; MOVREL-NEXT: s_add_u32 s1, s2, 1
; MOVREL-NEXT: s_mov_b32 m0, s2
; MOVREL-NEXT: v_mov_b32_e32 v27, v9
; MOVREL-NEXT: v_mov_b32_e32 v26, v8
; MOVREL-NEXT: v_mov_b32_e32 v25, v7
; MOVREL-NEXT: v_mov_b32_e32 v24, v6
; MOVREL-NEXT: v_mov_b32_e32 v23, v5
; MOVREL-NEXT: v_mov_b32_e32 v22, v4
; MOVREL-NEXT: v_mov_b32_e32 v21, v3
; MOVREL-NEXT: v_mov_b32_e32 v20, v2
; MOVREL-NEXT: v_mov_b32_e32 v19, v1
; MOVREL-NEXT: v_movreld_b32_e32 v18, v16
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_movreld_b32_e32 v18, v17
; MOVREL-NEXT: v_readfirstlane_b32 s1, v18
; MOVREL-NEXT: v_mov_b32_e32 v34, v15
; MOVREL-NEXT: v_mov_b32_e32 v19, v0
; MOVREL-NEXT: v_mov_b32_e32 v33, v14
; MOVREL-NEXT: v_mov_b32_e32 v32, v13
; MOVREL-NEXT: s_add_u32 s2, s1, 1
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18
; MOVREL-NEXT: v_mov_b32_e32 v31, v12
; MOVREL-NEXT: v_mov_b32_e32 v30, v11
; MOVREL-NEXT: v_mov_b32_e32 v29, v10
; MOVREL-NEXT: s_lshl_b32 m0, s2, 1
; MOVREL-NEXT: v_mov_b32_e32 v28, v9
; MOVREL-NEXT: v_mov_b32_e32 v27, v8
; MOVREL-NEXT: v_mov_b32_e32 v26, v7
; MOVREL-NEXT: v_mov_b32_e32 v25, v6
; MOVREL-NEXT: v_mov_b32_e32 v24, v5
; MOVREL-NEXT: v_mov_b32_e32 v23, v4
; MOVREL-NEXT: v_mov_b32_e32 v22, v3
; MOVREL-NEXT: v_mov_b32_e32 v21, v2
; MOVREL-NEXT: v_mov_b32_e32 v20, v1
; MOVREL-NEXT: v_movreld_b32_e32 v19, v16
; MOVREL-NEXT: v_movreld_b32_e32 v20, v17
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB32_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off
; MOVREL-NEXT: s_endpgm
entry:
%idx.add = add i32 %idx, 1

View File

@ -500,19 +500,15 @@ body: |
; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
; MOVREL: $m0 = COPY [[S_ADD_U32_]]
; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
; MOVREL: $m0 = COPY [[COPY2]]
; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0, implicit $exec
; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]]
; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v8s32_add_1
; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_U32_]], 8, implicit-def $m0, implicit $m0
; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
; GPRIDX: S_SET_GPR_IDX_ON [[COPY2]], 8, implicit-def $m0, implicit $m0
; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0, implicit $exec
; GPRIDX: S_SET_GPR_IDX_OFF
; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]]
%0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
@ -574,19 +570,15 @@ body: |
; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
; MOVREL: $m0 = COPY [[S_ADD_U32_]]
; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0
; MOVREL: $m0 = COPY [[COPY2]]
; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0
; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]]
; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v8s32_add_1
; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
; GPRIDX: $m0 = COPY [[S_ADD_U32_]]
; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0
; GPRIDX: $m0 = COPY [[COPY2]]
; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0
; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]]
%0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
%1:sgpr(s32) = COPY $sgpr8