From 52ec7379adfa27b24f834551a2b3bf2b7249549c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 3 Jan 2020 23:13:15 -0500 Subject: [PATCH] AMDGPU/GlobalISel: Fold add of constant into G_INSERT_VECTOR_ELT Move the subregister base like in the extract case. --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 5 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 88 ++-- .../AMDGPU/GlobalISel/insertelement.ll | 410 ++++++++---------- .../inst-select-insert-vector-elt.mir | 24 +- 4 files changed, 260 insertions(+), 267 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index c867240289f8..d5f793bc1233 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1930,12 +1930,15 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) return false; + unsigned SubReg; + std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, + ValSize / 8); + const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && STI.useVGPRIndexMode(); MachineBasicBlock *BB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned SubReg = ValSize == 64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; if (IndexMode) { BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c4b20f221ec6..9dec099c227e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1511,6 +1511,25 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); } +/// Utility function for pushing dynamic vector indexes with a constant offset +/// into waterfall loops.
+static void reinsertVectorIndexAdd(MachineIRBuilder &B, + MachineInstr &IdxUseInstr, + unsigned OpIdx, + unsigned ConstOffset) { + MachineRegisterInfo &MRI = *B.getMRI(); + const LLT S32 = LLT::scalar(32); + Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); + B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); + + auto MaterializedOffset = B.buildConstant(S32, ConstOffset); + + auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); + MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); + IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); +} + void AMDGPURegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -2011,20 +2030,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( ConstOffset > 0 && ConstOffset < SrcTy.getNumElements(); - // Re-insert the constant offset add inside the waterfall loop. - auto ReinsertIndexAdd = [=, &B, &MRI](MachineInstr &IdxUseInstr, - unsigned OpIdx) { - Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); - B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); - - auto MaterializedOffset = B.buildConstant(S32, ConstOffset); - - auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); - MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); - MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); - IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); - }; - // Move the base register. We'll re-insert the add later. if (ShouldMoveIndexIntoLoop) MI.getOperand(2).setReg(BaseIdxReg); @@ -2051,8 +2056,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( buildVCopy(B, DstReg, TmpReg); } + // Re-insert the constant offset add inside the waterfall loop. 
if (ShouldMoveIndexIntoLoop) - ReinsertIndexAdd(MI, 2); + reinsertVectorIndexAdd(B, MI, 2, ConstOffset); return; } @@ -2113,7 +2119,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( } if (ShouldMoveIndexIntoLoop) - ReinsertIndexAdd(*IdxLo, 1); + reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); return; } @@ -2126,26 +2132,53 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(OpdMapper.getVRegs(0).empty()); assert(OpdMapper.getVRegs(3).empty()); + const RegisterBank *IdxBank = + OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; + if (substituteSimpleCopyRegs(OpdMapper, 1)) MRI.setType(MI.getOperand(1).getReg(), VecTy); + Register SrcReg = MI.getOperand(1).getReg(); + Register InsReg = MI.getOperand(2).getReg(); + LLT InsTy = MRI.getType(InsReg); + (void)InsTy; + + Register BaseIdxReg; + unsigned ConstOffset; + MachineInstr *OffsetDef; + std::tie(BaseIdxReg, ConstOffset, OffsetDef) = + AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); + + // See if the index is an add of a constant which will be foldable by moving + // the base register of the index later if this is going to be executed in a + // waterfall loop. This is essentially to reassociate the add of a constant + // with the readfirstlane. + bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && + ConstOffset > 0 && + ConstOffset < VecTy.getNumElements(); + + // Move the base register. We'll re-insert the add later. + if (ShouldMoveIndexIntoLoop) + MI.getOperand(3).setReg(BaseIdxReg); + + if (InsRegs.empty()) { - applyDefaultMapping(OpdMapper); executeInWaterfallLoop(MI, MRI, { 3 }); + + // Re-insert the constant offset add inside the waterfall loop. 
+ if (ShouldMoveIndexIntoLoop) { + MachineIRBuilder B(MI); + reinsertVectorIndexAdd(B, MI, 3, ConstOffset); + } + return; } - Register SrcReg = MI.getOperand(1).getReg(); - Register InsReg = MI.getOperand(2).getReg(); - Register IdxReg = MI.getOperand(3).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - LLT InsTy = MRI.getType(InsReg); - (void)InsTy; assert(InsTy.getSizeInBits() == 64); const LLT S32 = LLT::scalar(32); - LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); MachineIRBuilder B(MI); auto CastSrc = B.buildBitcast(Vec32, SrcReg); @@ -2158,7 +2191,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). - auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxLo = B.buildShl(S32, BaseIdxReg, One); auto IdxHi = B.buildAdd(S32, IdxLo, One); auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); @@ -2192,6 +2225,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl( executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), OpsToWaterfall, MRI); + + // Re-insert the constant offset add inside the waterfall loop. 
+ if (ShouldMoveIndexIntoLoop) + reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); + return; } case AMDGPU::G_INTRINSIC: { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 7ca65f9aec97..7b6d247d0023 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -747,10 +747,9 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: s_mov_b64 s[4:5], exec ; GPRIDX-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s6, v2 +; GPRIDX-NEXT: s_lshl_b32 s7, s6, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 -; GPRIDX-NEXT: s_lshl_b32 s6, s6, 1 -; GPRIDX-NEXT: s_add_u32 s7, s6, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v3, v19 ; GPRIDX-NEXT: v_mov_b32_e32 v4, v20 ; GPRIDX-NEXT: v_mov_b32_e32 v5, v21 @@ -770,7 +769,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: v_mov_b32_e32 v3, v0 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v3, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -831,13 +830,11 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; MOVREL-NEXT: v_mov_b32_e32 v4, v20 ; MOVREL-NEXT: v_mov_b32_e32 v5, v21 ; MOVREL-NEXT: v_mov_b32_e32 v6, v22 -; MOVREL-NEXT: s_lshl_b32 s6, s5, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s5, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 ; MOVREL-NEXT: v_mov_b32_e32 v7, v23 ; MOVREL-NEXT: v_mov_b32_e32 v8, v24 ; MOVREL-NEXT: v_mov_b32_e32 v9, v25 -; MOVREL-NEXT: s_add_u32 s5, s6, 1 -; MOVREL-NEXT: s_mov_b32 m0, s6 ; MOVREL-NEXT: v_mov_b32_e32 v10, v26 ; MOVREL-NEXT: v_mov_b32_e32 v11, v27 ; 
MOVREL-NEXT: v_mov_b32_e32 v12, v28 @@ -848,8 +845,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; MOVREL-NEXT: v_mov_b32_e32 v17, v33 ; MOVREL-NEXT: v_mov_b32_e32 v18, v34 ; MOVREL-NEXT: v_movreld_b32_e32 v3, v0 -; MOVREL-NEXT: s_mov_b32 m0, s5 -; MOVREL-NEXT: v_movreld_b32_e32 v3, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v4, v1 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB13_1 @@ -916,10 +912,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB14_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v0 +; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 -; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 -; GPRIDX-NEXT: s_add_u32 s3, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v1, v17 ; GPRIDX-NEXT: v_mov_b32_e32 v2, v18 ; GPRIDX-NEXT: v_mov_b32_e32 v3, v19 @@ -939,7 +934,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_mov_b32_e32 v1, s18 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v1, s19 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s19 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -994,13 +989,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; MOVREL-NEXT: v_mov_b32_e32 v2, v18 ; MOVREL-NEXT: v_mov_b32_e32 v3, v19 ; MOVREL-NEXT: v_mov_b32_e32 v4, v20 -; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0 ; MOVREL-NEXT: v_mov_b32_e32 v5, v21 ; MOVREL-NEXT: v_mov_b32_e32 v6, v22 ; MOVREL-NEXT: v_mov_b32_e32 v7, v23 -; MOVREL-NEXT: s_add_u32 s1, s2, 1 
-; MOVREL-NEXT: s_mov_b32 m0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v8, v24 ; MOVREL-NEXT: v_mov_b32_e32 v9, v25 ; MOVREL-NEXT: v_mov_b32_e32 v10, v26 @@ -1011,8 +1004,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; MOVREL-NEXT: v_mov_b32_e32 v15, v31 ; MOVREL-NEXT: v_mov_b32_e32 v16, v32 ; MOVREL-NEXT: v_movreld_b32_e32 v1, s18 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v1, s19 +; MOVREL-NEXT: v_movreld_b32_e32 v2, s19 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB14_1 @@ -1072,12 +1064,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 ; GPRIDX-NEXT: s_lshl_b32 s0, s18, 1 -; GPRIDX-NEXT: s_add_u32 s1, s0, 1 ; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v2, v0 ; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v2, v1 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v3, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off @@ -1103,12 +1094,12 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do ; MOVREL-NEXT: s_mov_b32 s10, s12 ; MOVREL-NEXT: s_mov_b32 s12, s14 ; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: s_lshl_b32 s16, s18, 1 ; MOVREL-NEXT: v_mov_b32_e32 v17, s15 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 +; MOVREL-NEXT: s_lshl_b32 m0, s18, 1 ; MOVREL-NEXT: v_mov_b32_e32 v15, s13 -; MOVREL-NEXT: v_mov_b32_e32 v14, s12 ; MOVREL-NEXT: v_mov_b32_e32 v16, s14 -; MOVREL-NEXT: s_mov_b32 m0, s16 +; MOVREL-NEXT: v_mov_b32_e32 v14, s12 ; MOVREL-NEXT: v_mov_b32_e32 v13, s11 ; MOVREL-NEXT: v_mov_b32_e32 v12, s10 ; MOVREL-NEXT: v_mov_b32_e32 v11, s9 @@ -1120,15 +1111,12 @@ define 
amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do ; MOVREL-NEXT: v_mov_b32_e32 v5, s3 ; MOVREL-NEXT: v_mov_b32_e32 v4, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 -; MOVREL-NEXT: v_mov_b32_e32 v2, s0 -; MOVREL-NEXT: s_add_u32 s0, s16, 1 -; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_movreld_b32_e32 v2, v0 -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: v_movreld_b32_e32 v2, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v3, v1 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off ; MOVREL-NEXT: s_endpgm entry: @@ -1148,12 +1136,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_s: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_lshl_b32 s0, s4, 1 -; GPRIDX-NEXT: s_add_u32 s1, s0, 1 ; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 ; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v0, s3 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off @@ -1163,13 +1150,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_lshl_b32 s0, s4, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s4, 1 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: s_add_u32 s0, s0, 1 ; MOVREL-NEXT: v_movreld_b32_e32 v0, s2 -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: v_movreld_b32_e32 v0, s3 +; MOVREL-NEXT: v_movreld_b32_e32 v1, s3 ; MOVREL-NEXT: global_store_dwordx4 
v[0:1], v[0:3], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off @@ -1226,10 +1210,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB17_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 +; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 -; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 -; GPRIDX-NEXT: s_add_u32 s3, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v3, v19 ; GPRIDX-NEXT: v_mov_b32_e32 v4, v20 ; GPRIDX-NEXT: v_mov_b32_e32 v5, v21 @@ -1249,7 +1232,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: v_mov_b32_e32 v3, v0 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v3, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -1304,13 +1287,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; MOVREL-NEXT: v_mov_b32_e32 v4, v20 ; MOVREL-NEXT: v_mov_b32_e32 v5, v21 ; MOVREL-NEXT: v_mov_b32_e32 v6, v22 -; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v2 ; MOVREL-NEXT: v_mov_b32_e32 v7, v23 ; MOVREL-NEXT: v_mov_b32_e32 v8, v24 ; MOVREL-NEXT: v_mov_b32_e32 v9, v25 -; MOVREL-NEXT: s_add_u32 s1, s2, 1 -; MOVREL-NEXT: s_mov_b32 m0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v10, v26 ; MOVREL-NEXT: v_mov_b32_e32 v11, v27 ; MOVREL-NEXT: v_mov_b32_e32 v12, v28 @@ -1321,8 +1302,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; MOVREL-NEXT: v_mov_b32_e32 v17, v33 ; MOVREL-NEXT: v_mov_b32_e32 v18, v34 ; MOVREL-NEXT: v_movreld_b32_e32 v3, 
v0 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v3, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v4, v1 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB17_1 @@ -1352,10 +1332,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB18_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s4, v16 +; GPRIDX-NEXT: s_lshl_b32 s5, s4, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 -; GPRIDX-NEXT: s_lshl_b32 s4, s4, 1 -; GPRIDX-NEXT: s_add_u32 s5, s4, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v32, v15 ; GPRIDX-NEXT: v_mov_b32_e32 v31, v14 ; GPRIDX-NEXT: v_mov_b32_e32 v30, v13 @@ -1375,7 +1354,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i ; GPRIDX-NEXT: v_mov_b32_e32 v17, s2 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v17, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s3 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -1398,13 +1377,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i ; MOVREL-NEXT: v_mov_b32_e32 v17, v0 ; MOVREL-NEXT: v_mov_b32_e32 v31, v14 ; MOVREL-NEXT: v_mov_b32_e32 v30, v13 -; MOVREL-NEXT: s_lshl_b32 s4, s1, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v16 ; MOVREL-NEXT: v_mov_b32_e32 v29, v12 ; MOVREL-NEXT: v_mov_b32_e32 v28, v11 ; MOVREL-NEXT: v_mov_b32_e32 v27, v10 -; MOVREL-NEXT: s_add_u32 s1, s4, 1 -; MOVREL-NEXT: s_mov_b32 m0, s4 ; MOVREL-NEXT: v_mov_b32_e32 v26, v9 ; MOVREL-NEXT: v_mov_b32_e32 v25, v8 ; MOVREL-NEXT: v_mov_b32_e32 v24, v7 @@ -1415,8 +1392,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x 
double> %vec, double i ; MOVREL-NEXT: v_mov_b32_e32 v19, v2 ; MOVREL-NEXT: v_mov_b32_e32 v18, v1 ; MOVREL-NEXT: v_movreld_b32_e32 v17, s2 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v17, s3 +; MOVREL-NEXT: v_movreld_b32_e32 v18, s3 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB18_1 @@ -1444,12 +1420,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double % ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_s: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1 -; GPRIDX-NEXT: s_add_u32 s1, s0, 1 ; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v0, v16 ; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v17 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v1, v17 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off @@ -1459,13 +1434,10 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double % ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_lshl_b32 s0, s2, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s2, 1 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: s_add_u32 s0, s0, 1 ; MOVREL-NEXT: v_movreld_b32_e32 v0, v16 -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: v_movreld_b32_e32 v0, v17 +; MOVREL-NEXT: v_movreld_b32_e32 v1, v17 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off @@ -1490,10 +1462,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB20_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: 
v_readfirstlane_b32 s2, v18 +; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18 -; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 -; GPRIDX-NEXT: s_add_u32 s3, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v34, v15 ; GPRIDX-NEXT: v_mov_b32_e32 v33, v14 ; GPRIDX-NEXT: v_mov_b32_e32 v32, v13 @@ -1513,7 +1484,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; GPRIDX-NEXT: v_mov_b32_e32 v19, v16 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v19, v17 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v17 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -1536,13 +1507,11 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 ; MOVREL-NEXT: v_mov_b32_e32 v33, v14 ; MOVREL-NEXT: v_mov_b32_e32 v32, v13 -; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18 ; MOVREL-NEXT: v_mov_b32_e32 v31, v12 ; MOVREL-NEXT: v_mov_b32_e32 v30, v11 ; MOVREL-NEXT: v_mov_b32_e32 v29, v10 -; MOVREL-NEXT: s_add_u32 s1, s2, 1 -; MOVREL-NEXT: s_mov_b32 m0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v28, v9 ; MOVREL-NEXT: v_mov_b32_e32 v27, v8 ; MOVREL-NEXT: v_mov_b32_e32 v26, v7 @@ -1553,8 +1522,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double % ; MOVREL-NEXT: v_mov_b32_e32 v21, v2 ; MOVREL-NEXT: v_mov_b32_e32 v20, v1 ; MOVREL-NEXT: v_movreld_b32_e32 v19, v16 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v17 +; MOVREL-NEXT: v_movreld_b32_e32 v20, v17 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB20_1 @@ -1783,9 +1751,9 @@ define amdgpu_ps <8 x float> 
@dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> in ; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_add_u32 m0, s11, 1 +; GPRIDX-NEXT: s_mov_b32 m0, s11 ; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s0, s10 +; GPRIDX-NEXT: s_movreld_b32 s1, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 @@ -1798,16 +1766,16 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> in ; ; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_1: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_add_u32 m0, s11, 1 ; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 m0, s11 +; MOVREL-NEXT: s_mov_b32 s0, s2 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_movreld_b32 s0, s10 +; MOVREL-NEXT: s_movreld_b32 s1, s10 ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 @@ -1835,9 +1803,9 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_7(<8 x float> in ; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_add_u32 m0, s11, 7 +; GPRIDX-NEXT: s_mov_b32 m0, s11 ; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s0, s10 +; GPRIDX-NEXT: s_movreld_b32 s7, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 @@ -1850,16 +1818,16 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_7(<8 x float> in ; ; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_7: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_add_u32 m0, s11, 7 ; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 -; 
MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_movreld_b32 s0, s10 +; MOVREL-NEXT: s_mov_b32 m0, s11 +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_movreld_b32 s7, s10 ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 @@ -1879,68 +1847,66 @@ entry: define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_1(<8 x float> %vec, float %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: v_add_u32_e32 v17, 1, v9 ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v17 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v17 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9 ; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v16, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v15, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v14, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v13, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v12, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v11, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v10, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v9, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v9, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v17, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v16, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v15, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v13, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v8 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc ; GPRIDX-NEXT: s_cbranch_execnz BB29_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v2, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v3, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v4, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v5, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v6, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v7, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v17 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: v_add_nc_u32_e32 v17, 1, v9 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v17 -; MOVREL-NEXT: v_mov_b32_e32 v16, v7 -; MOVREL-NEXT: v_mov_b32_e32 v9, v0 -; MOVREL-NEXT: v_mov_b32_e32 v15, v6 -; MOVREL-NEXT: v_mov_b32_e32 v14, v5 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v17 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 +; MOVREL-NEXT: v_mov_b32_e32 v17, v7 +; MOVREL-NEXT: v_mov_b32_e32 v11, v1 +; MOVREL-NEXT: v_mov_b32_e32 v16, v6 +; MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 ; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_mov_b32_e32 v13, v4 -; MOVREL-NEXT: v_mov_b32_e32 v12, v3 -; MOVREL-NEXT: v_mov_b32_e32 v11, v2 -; MOVREL-NEXT: v_mov_b32_e32 v10, v1 -; MOVREL-NEXT: v_movreld_b32_e32 v9, v8 +; MOVREL-NEXT: v_mov_b32_e32 v14, v4 +; MOVREL-NEXT: v_mov_b32_e32 v13, v3 +; MOVREL-NEXT: v_mov_b32_e32 v12, v2 +; MOVREL-NEXT: v_mov_b32_e32 v10, v0 +; MOVREL-NEXT: v_movreld_b32_e32 v11, v8 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB29_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: 
v_mov_b32_e32 v0, v9 -; MOVREL-NEXT: v_mov_b32_e32 v1, v10 -; MOVREL-NEXT: v_mov_b32_e32 v2, v11 -; MOVREL-NEXT: v_mov_b32_e32 v3, v12 -; MOVREL-NEXT: v_mov_b32_e32 v4, v13 -; MOVREL-NEXT: v_mov_b32_e32 v5, v14 -; MOVREL-NEXT: v_mov_b32_e32 v6, v15 -; MOVREL-NEXT: v_mov_b32_e32 v7, v16 +; MOVREL-NEXT: v_mov_b32_e32 v0, v10 +; MOVREL-NEXT: v_mov_b32_e32 v1, v11 +; MOVREL-NEXT: v_mov_b32_e32 v2, v12 +; MOVREL-NEXT: v_mov_b32_e32 v3, v13 +; MOVREL-NEXT: v_mov_b32_e32 v4, v14 +; MOVREL-NEXT: v_mov_b32_e32 v5, v15 +; MOVREL-NEXT: v_mov_b32_e32 v6, v16 +; MOVREL-NEXT: v_mov_b32_e32 v7, v17 ; MOVREL-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 1 @@ -1951,68 +1917,66 @@ entry: define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_7(<8 x float> %vec, float %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: v_add_u32_e32 v17, 7, v9 ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v17 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v17 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9 ; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v16, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v15, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v14, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v13, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v12, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v11, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v10, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v9, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v9, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v17, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v16, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v15, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v13, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v17, v8 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: 
s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc ; GPRIDX-NEXT: s_cbranch_execnz BB30_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 -; GPRIDX-NEXT: v_mov_b32_e32 v1, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v2, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v3, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v4, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v5, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v6, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v7, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v17 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: v_add_nc_u32_e32 v17, 7, v9 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v17 -; MOVREL-NEXT: v_mov_b32_e32 v16, v7 -; MOVREL-NEXT: v_mov_b32_e32 v9, v0 -; MOVREL-NEXT: v_mov_b32_e32 v15, v6 -; MOVREL-NEXT: v_mov_b32_e32 v14, v5 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v17 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 +; MOVREL-NEXT: v_mov_b32_e32 v17, v7 +; MOVREL-NEXT: v_mov_b32_e32 v16, v6 +; MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; MOVREL-NEXT: v_mov_b32_e32 v14, v4 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 ; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_mov_b32_e32 v13, v4 -; MOVREL-NEXT: v_mov_b32_e32 v12, v3 -; MOVREL-NEXT: v_mov_b32_e32 v11, v2 -; MOVREL-NEXT: v_mov_b32_e32 v10, v1 -; MOVREL-NEXT: v_movreld_b32_e32 v9, v8 +; MOVREL-NEXT: v_mov_b32_e32 v13, v3 +; MOVREL-NEXT: v_mov_b32_e32 v12, v2 +; MOVREL-NEXT: v_mov_b32_e32 v11, v1 +; MOVREL-NEXT: v_mov_b32_e32 v10, v0 +; MOVREL-NEXT: v_movreld_b32_e32 v17, v8 ; 
MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB30_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: v_mov_b32_e32 v0, v9 -; MOVREL-NEXT: v_mov_b32_e32 v1, v10 -; MOVREL-NEXT: v_mov_b32_e32 v2, v11 -; MOVREL-NEXT: v_mov_b32_e32 v3, v12 -; MOVREL-NEXT: v_mov_b32_e32 v4, v13 -; MOVREL-NEXT: v_mov_b32_e32 v5, v14 -; MOVREL-NEXT: v_mov_b32_e32 v6, v15 -; MOVREL-NEXT: v_mov_b32_e32 v7, v16 +; MOVREL-NEXT: v_mov_b32_e32 v0, v10 +; MOVREL-NEXT: v_mov_b32_e32 v1, v11 +; MOVREL-NEXT: v_mov_b32_e32 v2, v12 +; MOVREL-NEXT: v_mov_b32_e32 v3, v13 +; MOVREL-NEXT: v_mov_b32_e32 v4, v14 +; MOVREL-NEXT: v_mov_b32_e32 v5, v15 +; MOVREL-NEXT: v_mov_b32_e32 v6, v16 +; MOVREL-NEXT: v_mov_b32_e32 v7, v17 ; MOVREL-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 7 @@ -2039,9 +2003,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v ; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: s_mov_b32 s15, s17 -; GPRIDX-NEXT: s_add_u32 m0, s20, 1 +; GPRIDX-NEXT: s_mov_b32 m0, s20 ; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b64 s[0:1], s[18:19] +; GPRIDX-NEXT: s_movreld_b64 s[2:3], s[18:19] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 @@ -2071,9 +2035,9 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 ; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_add_u32 m0, s20, 1 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 m0, s20 ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 @@ -2086,7 +2050,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v ; MOVREL-NEXT: s_mov_b32 s13, s15 ; MOVREL-NEXT: s_mov_b32 s14, s16 ; MOVREL-NEXT: 
s_mov_b32 s15, s17 -; MOVREL-NEXT: s_movreld_b64 s[0:1], s[18:19] +; MOVREL-NEXT: s_movreld_b64 s[2:3], s[18:19] ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v4, s4 ; MOVREL-NEXT: v_mov_b32_e32 v8, s8 @@ -2126,85 +2090,81 @@ entry: define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: v_add_u32_e32 v34, 1, v18 ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v34 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v34 -; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18 ; GPRIDX-NEXT: s_add_u32 s3, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v33, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v32, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v31, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v30, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v29, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v28, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v27, v9 -; GPRIDX-NEXT: v_mov_b32_e32 v26, v8 -; GPRIDX-NEXT: v_mov_b32_e32 v25, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v24, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v23, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v22, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v21, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v20, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v19, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v18, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v18, v16 +; GPRIDX-NEXT: s_lshl_b32 s3, s3, 1 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18 +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v34, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v33, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v32, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v31, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v30, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v29, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v28, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v27, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v26, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v25, 
v6 +; GPRIDX-NEXT: v_mov_b32_e32 v24, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v23, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v22, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v21, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v16 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v18, v17 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v17 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc ; GPRIDX-NEXT: s_cbranch_execnz BB32_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[18:21], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[22:25], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[26:29], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[30:33], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[31:34], off ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: v_add_nc_u32_e32 v34, 1, v18 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v34 -; MOVREL-NEXT: v_mov_b32_e32 v33, v15 -; MOVREL-NEXT: v_mov_b32_e32 v18, v0 -; MOVREL-NEXT: v_mov_b32_e32 v32, v14 -; MOVREL-NEXT: v_mov_b32_e32 v31, v13 -; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v34 -; MOVREL-NEXT: v_mov_b32_e32 v30, v12 -; MOVREL-NEXT: v_mov_b32_e32 v29, v11 -; MOVREL-NEXT: v_mov_b32_e32 v28, v10 -; MOVREL-NEXT: s_add_u32 s1, s2, 1 -; MOVREL-NEXT: s_mov_b32 m0, s2 -; MOVREL-NEXT: v_mov_b32_e32 v27, v9 -; MOVREL-NEXT: v_mov_b32_e32 v26, v8 -; MOVREL-NEXT: v_mov_b32_e32 v25, v7 -; 
MOVREL-NEXT: v_mov_b32_e32 v24, v6 -; MOVREL-NEXT: v_mov_b32_e32 v23, v5 -; MOVREL-NEXT: v_mov_b32_e32 v22, v4 -; MOVREL-NEXT: v_mov_b32_e32 v21, v3 -; MOVREL-NEXT: v_mov_b32_e32 v20, v2 -; MOVREL-NEXT: v_mov_b32_e32 v19, v1 -; MOVREL-NEXT: v_movreld_b32_e32 v18, v16 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v18, v17 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v18 +; MOVREL-NEXT: v_mov_b32_e32 v34, v15 +; MOVREL-NEXT: v_mov_b32_e32 v19, v0 +; MOVREL-NEXT: v_mov_b32_e32 v33, v14 +; MOVREL-NEXT: v_mov_b32_e32 v32, v13 +; MOVREL-NEXT: s_add_u32 s2, s1, 1 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18 +; MOVREL-NEXT: v_mov_b32_e32 v31, v12 +; MOVREL-NEXT: v_mov_b32_e32 v30, v11 +; MOVREL-NEXT: v_mov_b32_e32 v29, v10 +; MOVREL-NEXT: s_lshl_b32 m0, s2, 1 +; MOVREL-NEXT: v_mov_b32_e32 v28, v9 +; MOVREL-NEXT: v_mov_b32_e32 v27, v8 +; MOVREL-NEXT: v_mov_b32_e32 v26, v7 +; MOVREL-NEXT: v_mov_b32_e32 v25, v6 +; MOVREL-NEXT: v_mov_b32_e32 v24, v5 +; MOVREL-NEXT: v_mov_b32_e32 v23, v4 +; MOVREL-NEXT: v_mov_b32_e32 v22, v3 +; MOVREL-NEXT: v_mov_b32_e32 v21, v2 +; MOVREL-NEXT: v_mov_b32_e32 v20, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v19, v16 +; MOVREL-NEXT: v_movreld_b32_e32 v20, v17 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB32_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[22:25], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[26:29], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[30:33], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off ; MOVREL-NEXT: s_endpgm entry: %idx.add = add i32 %idx, 1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir index 5708ea00f62a..04df5559e8d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir @@ -500,19 +500,15 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 - ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; MOVREL: $m0 = COPY [[S_ADD_U32_]] - ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0, implicit $exec ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]] ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v8s32_add_1 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_U32_]], 8, implicit-def $m0, implicit $m0 - ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec + ; GPRIDX: S_SET_GPR_IDX_ON [[COPY2]], 8, implicit-def $m0, implicit $m0 + ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0, implicit $exec ; 
GPRIDX: S_SET_GPR_IDX_OFF ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]] %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -574,19 +570,15 @@ body: | ; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 - ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; MOVREL: $m0 = COPY [[S_ADD_U32_]] - ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0 ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]] ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v8s32_add_1 ; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GPRIDX: $m0 = COPY [[S_ADD_U32_]] - ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0 + ; GPRIDX: $m0 = COPY [[COPY2]] + ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0 ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]] %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:sgpr(s32) = COPY $sgpr8