diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 9e4e26f1392e..1f9f8056fdae 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -281,6 +281,7 @@ namespace { Register traceCopies(Register VirtReg) const; Register traceCopyChain(Register Reg) const; + bool shouldAllocateRegister(const Register Reg) const; int getStackSpaceFor(Register VirtReg); void spill(MachineBasicBlock::iterator Before, Register VirtReg, MCPhysReg AssignedReg, bool Kill, bool LiveOut); @@ -300,6 +301,12 @@ char RegAllocFast::ID = 0; INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false, false) +bool RegAllocFast::shouldAllocateRegister(const Register Reg) const { + assert(Register::isVirtualRegister(Reg)); + const TargetRegisterClass &RC = *MRI->getRegClass(Reg); + return ShouldAllocateClass(*TRI, RC); +} + void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) { for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) RegUnitStates[*UI] = NewState; @@ -839,6 +846,8 @@ void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) { assert(MO.isUndef() && "expected undef use"); Register VirtReg = MO.getReg(); assert(Register::isVirtualRegister(VirtReg) && "Expected virtreg"); + if (!shouldAllocateRegister(VirtReg)) + return; LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); MCPhysReg PhysReg; @@ -864,6 +873,8 @@ void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) { /// (tied or earlyclobber) that may interfere with preassigned uses. void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { + if (!shouldAllocateRegister(VirtReg)) + return; LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg); if (LRI != LiveVirtRegs.end()) { MCPhysReg PrevReg = LRI->PhysReg; @@ -897,6 +908,8 @@ void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg, bool LookAtPhysRegUses) { assert(VirtReg.isVirtual() && "Not a virtual register"); + if (!shouldAllocateRegister(VirtReg)) + return; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -947,6 +960,8 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { assert(VirtReg.isVirtual() && "Not a virtual register"); + if (!shouldAllocateRegister(VirtReg)) + return; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -971,8 +986,13 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, Register Hint; if (MI.isCopy() && MI.getOperand(1).getSubReg() == 0) { Hint = MI.getOperand(0).getReg(); - assert(Hint.isPhysical() && - "Copy destination should already be assigned"); + if (Hint.isVirtual()) { + assert(!shouldAllocateRegister(Hint)); + Hint = Register(); + } else { + assert(Hint.isPhysical() && + "Copy destination should already be assigned"); + } } allocVirtReg(MI, *LRI, Hint, false); if (LRI->Error) { @@ -1080,6 +1100,8 @@ void RegAllocFast::addRegClassDefCounts(std::vector &RegClassDefCounts assert(RegClassDefCounts.size() == TRI->getNumRegClasses()); if (Reg.isVirtual()) { + if (!shouldAllocateRegister(Reg)) + return; const TargetRegisterClass *OpRC = MRI->getRegClass(Reg); for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses(); RCIdx != RCIdxEnd; ++RCIdx) { @@ -1139,6 +1161,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { if (MO.isReg()) { Register Reg = MO.getReg(); if (Reg.isVirtual()) { + if (!shouldAllocateRegister(Reg)) + continue; if (MO.isDef()) { HasDef = true; HasVRegDef = true; @@ -1202,7 +1226,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { } if (MO.isDef()) { - if (Reg.isVirtual()) + if (Reg.isVirtual() && shouldAllocateRegister(Reg)) DefOperandIndexes.push_back(I); addRegClassDefCounts(RegClassDefCounts, Reg); @@ -1292,6 +1316,10 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { Register Reg = MO.getReg(); if (!Reg) continue; + if (Reg.isVirtual()) { + assert(!shouldAllocateRegister(Reg)); + continue; + } assert(Reg.isPhysical()); if (MRI->isReserved(Reg)) continue; @@ -1338,7 +1366,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) continue; if (MO.isUndef()) { @@ -1365,7 +1393,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) continue; assert(MO.isUndef() && "Should only have undef virtreg uses left"); @@ -1388,6 +1416,10 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { Register Reg = MO.getReg(); if (!Reg) continue; + if (Reg.isVirtual()) { + assert(!shouldAllocateRegister(Reg)); + continue; + } assert(Reg.isPhysical() && "should have register assigned"); // We sometimes get odd situations like: @@ -1417,6 +1449,8 @@ void RegAllocFast::handleDebugValue(MachineInstr &MI) { for (Register Reg : MI.getUsedDebugRegs()) { if (!Register::isVirtualRegister(Reg)) continue; + if (!shouldAllocateRegister(Reg)) + continue; // Already spilled to a stackslot? int SS = StackSlotForVirtReg[Reg]; @@ -1457,7 +1491,7 @@ void RegAllocFast::handleBundle(MachineInstr &MI) { continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) continue; DenseMap::iterator DI; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll index a6be8956dbcd..a5bfd4a88afc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -8,51 +8,50 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: s_or_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: v_mov_b32_e32 v14, v1 -; CHECK-NEXT: v_mov_b32_e32 v13, v2 -; CHECK-NEXT: v_mov_b32_e32 v12, v3 -; CHECK-NEXT: v_mov_b32_e32 v11, v4 -; CHECK-NEXT: v_mov_b32_e32 v10, v5 -; CHECK-NEXT: v_mov_b32_e32 v9, v6 -; CHECK-NEXT: v_mov_b32_e32 v8, v7 +; CHECK-NEXT: v_mov_b32_e32 v15, v1 +; CHECK-NEXT: v_mov_b32_e32 v14, v2 +; CHECK-NEXT: v_mov_b32_e32 v13, v3 +; CHECK-NEXT: v_mov_b32_e32 v12, v4 +; CHECK-NEXT: v_mov_b32_e32 v11, v5 +; CHECK-NEXT: v_mov_b32_e32 v10, v6 +; CHECK-NEXT: v_mov_b32_e32 v9, v7 ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v1, v14 -; CHECK-NEXT: v_mov_b32_e32 v2, v13 -; CHECK-NEXT: v_mov_b32_e32 v3, v12 -; CHECK-NEXT: v_mov_b32_e32 v4, v11 -; CHECK-NEXT: v_mov_b32_e32 v5, v10 -; CHECK-NEXT: v_mov_b32_e32 v6, v9 -; CHECK-NEXT: v_mov_b32_e32 v7, v8 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v2, v14 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v4, v12 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v6, v10 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_writelane_b32 v16, s4, 0 -; CHECK-NEXT: v_writelane_b32 v16, s5, 1 -; CHECK-NEXT: v_writelane_b32 v16, s6, 2 -; CHECK-NEXT: v_writelane_b32 v16, s7, 3 +; CHECK-NEXT: v_writelane_b32 v8, s4, 0 +; CHECK-NEXT: v_writelane_b32 v8, s5, 1 +; CHECK-NEXT: v_writelane_b32 v8, s6, 2 +; CHECK-NEXT: v_writelane_b32 v8, s7, 3 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v16, s4, 4 +; CHECK-NEXT: v_writelane_b32 v8, s4, 4 ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -60,15 +59,16 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v7, v8 -; CHECK-NEXT: v_mov_b32_e32 v6, v9 -; CHECK-NEXT: v_mov_b32_e32 v5, v10 -; CHECK-NEXT: v_mov_b32_e32 v4, v11 -; CHECK-NEXT: v_mov_b32_e32 v3, v12 -; CHECK-NEXT: v_mov_b32_e32 v2, v13 -; CHECK-NEXT: v_mov_b32_e32 v1, v14 -; CHECK-NEXT: v_mov_b32_e32 v0, v15 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: v_mov_b32_e32 v6, v10 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v4, v12 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v2, v14 +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v0, v16 ; CHECK-NEXT: v_readfirstlane_b32 s12, v7 ; CHECK-NEXT: v_readfirstlane_b32 s10, v6 ; CHECK-NEXT: v_readfirstlane_b32 s9, v5 @@ -85,22 +85,22 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v16, s12, 5 -; CHECK-NEXT: v_writelane_b32 v16, s13, 6 -; CHECK-NEXT: v_writelane_b32 v16, s14, 7 -; CHECK-NEXT: v_writelane_b32 v16, s15, 8 -; CHECK-NEXT: v_writelane_b32 v16, s16, 9 -; CHECK-NEXT: v_writelane_b32 v16, s17, 10 -; CHECK-NEXT: v_writelane_b32 v16, s18, 11 -; CHECK-NEXT: v_writelane_b32 v16, s19, 12 -; CHECK-NEXT: v_mov_b32_e32 v6, v8 -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: v_mov_b32_e32 v4, v10 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v2, v12 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v0, v14 -; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_writelane_b32 v8, s12, 5 +; CHECK-NEXT: v_writelane_b32 v8, s13, 6 +; CHECK-NEXT: v_writelane_b32 v8, s14, 7 +; CHECK-NEXT: v_writelane_b32 v8, s15, 8 +; CHECK-NEXT: v_writelane_b32 v8, s16, 9 +; CHECK-NEXT: v_writelane_b32 v8, s17, 10 +; CHECK-NEXT: v_writelane_b32 v8, s18, 11 +; CHECK-NEXT: v_writelane_b32 v8, s19, 12 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v10 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v12 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v3, v14 +; CHECK-NEXT: v_mov_b32_e32 v0, v15 +; CHECK-NEXT: v_mov_b32_e32 v1, v16 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] ; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] @@ -113,40 +113,40 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v16, s4, 13 +; CHECK-NEXT: v_writelane_b32 v8, s4, 13 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: v_readlane_b32 s4, v16, 13 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s8, v16, 5 -; CHECK-NEXT: v_readlane_b32 s9, v16, 6 -; CHECK-NEXT: v_readlane_b32 s10, v16, 7 -; CHECK-NEXT: v_readlane_b32 s11, v16, 8 -; CHECK-NEXT: v_readlane_b32 s12, v16, 9 -; CHECK-NEXT: v_readlane_b32 s13, v16, 10 -; CHECK-NEXT: v_readlane_b32 s14, v16, 11 -; CHECK-NEXT: v_readlane_b32 s15, v16, 12 -; CHECK-NEXT: v_readlane_b32 s16, v16, 0 -; CHECK-NEXT: v_readlane_b32 s17, v16, 1 -; CHECK-NEXT: v_readlane_b32 s18, v16, 2 -; CHECK-NEXT: v_readlane_b32 s19, v16, 3 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s4, v8, 13 +; CHECK-NEXT: v_readlane_b32 s8, v8, 5 +; CHECK-NEXT: v_readlane_b32 s9, v8, 6 +; CHECK-NEXT: v_readlane_b32 s10, v8, 7 +; CHECK-NEXT: v_readlane_b32 s11, v8, 8 +; CHECK-NEXT: v_readlane_b32 s12, v8, 9 +; CHECK-NEXT: v_readlane_b32 s13, v8, 10 +; CHECK-NEXT: v_readlane_b32 s14, v8, 11 +; CHECK-NEXT: v_readlane_b32 s15, v8, 12 +; CHECK-NEXT: v_readlane_b32 s16, v8, 0 +; CHECK-NEXT: v_readlane_b32 s17, v8, 1 +; CHECK-NEXT: v_readlane_b32 s18, v8, 2 +; CHECK-NEXT: v_readlane_b32 s19, v8, 3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_readlane_b32 s4, v16, 4 +; CHECK-NEXT: v_readlane_b32 s4, v8, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: s_or_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 5077ddf894c3..5d0931d85f92 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -32,39 +32,39 @@ define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v2, s2, 0 +; GCN_DBG-NEXT: v_writelane_b32 v0, s2, 0 ; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s2, -1 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_read_b32 v1, v1 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_write_b32 v1, v2 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock ; GCN_DBG-NEXT: s_endpgm @@ -107,35 +107,35 @@ define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nou ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_read_b32 v1, v1 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_write_b32 v1, v2 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 entry: @@ -172,35 +172,35 @@ define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) no ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_read_b32 v1, v1 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_write_b32 v1, v2 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 entry: @@ -238,33 +238,33 @@ define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) no ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_read_b32 v1, v1 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_write_b32 v1, v2 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 entry: @@ -316,48 +316,48 @@ define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: ds_read_u8 v0, v0 +; GCN_DBG-NEXT: ds_read_u8 v1, v1 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0 +; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v1 ; GCN_DBG-NEXT: s_and_b32 s0, 1, s0 ; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1 ; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: v_writelane_b32 v2, s1, 2 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 3 -; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1 -; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2 -; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 +; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 +; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 +; GCN_DBG-NEXT: v_readlane_b32 s4, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 s4, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_read_b32 v0, v0 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_read_b32 v1, v1 ; GCN_DBG-NEXT: s_mov_b32 s4, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s4 +; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 -; GCN_DBG-NEXT: ds_write_b32 v0, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 +; GCN_DBG-NEXT: ds_write_b32 v1, v2 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 +; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 ; GCN_DBG-NEXT: s_branch .LBB4_2 entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index f81c46ee2439..f5bf963cd2bc 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -420,11 +420,11 @@ bb.end: ; preds = %bb.then, %bb ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]: +; GCN-O0: buffer_load_dword ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0: buffer_load_dword ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 7891cded195d..3548e301aee0 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -48,6 +48,9 @@ ; VMEM: [[ENDIF]]: +; Restore val +; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload + ; Reload and restore exec mask ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -59,9 +62,6 @@ ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] -; Restore val -; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload - ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 { entry: @@ -121,6 +121,7 @@ endif: ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: [[END]]: +; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -130,7 +131,6 @@ endif: ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] -; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 { @@ -187,6 +187,7 @@ end: ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow +; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -198,7 +199,6 @@ end: ; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]] ; Regular spill value restored after exec modification -; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; Followed by spill ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill @@ -230,6 +230,7 @@ end: ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: +; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] @@ -241,7 +242,6 @@ end: ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] -; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 1944f813f74e..bee7e80a5a7b 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset.cast, align 4, addrspace 4) ; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1 ; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1 @@ -23,7 +23,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN-NEXT: renamable $sgpr1 = COPY killed renamable $sgpr6 ; GCN-NEXT: renamable $sgpr2 = COPY killed renamable $sgpr5 ; GCN-NEXT: renamable $sgpr3 = COPY killed renamable $sgpr4 - ; GCN-NEXT: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5) + ; GCN-NEXT: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 16 ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 15 ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 14 @@ -40,59 +40,55 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN-NEXT: renamable $sgpr13 = S_MOV_B32 2 ; GCN-NEXT: renamable $sgpr14 = S_MOV_B32 1 ; GCN-NEXT: renamable $sgpr15 = S_MOV_B32 0 - ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr15 - ; GCN-NEXT: renamable $vgpr30 = COPY killed renamable $sgpr14 - ; GCN-NEXT: renamable $vgpr29 = COPY killed renamable $sgpr13 - ; GCN-NEXT: renamable $vgpr28 = COPY killed renamable $sgpr12 - ; GCN-NEXT: renamable $vgpr27 = COPY killed renamable $sgpr11 - ; GCN-NEXT: renamable $vgpr26 = COPY killed renamable $sgpr10 - ; GCN-NEXT: renamable $vgpr25 = COPY killed renamable $sgpr9 - ; GCN-NEXT: renamable $vgpr24 = COPY killed renamable $sgpr8 - ; GCN-NEXT: renamable $vgpr23 = COPY killed renamable $sgpr7 - ; GCN-NEXT: renamable $vgpr22 = COPY killed renamable $sgpr6 - ; GCN-NEXT: renamable $vgpr21 = COPY killed renamable $sgpr5 - ; GCN-NEXT: renamable $vgpr20 = COPY killed renamable $sgpr4 - ; GCN-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr3 - ; GCN-NEXT: renamable $vgpr18 = COPY killed renamable $sgpr2 - ; GCN-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr1 - ; GCN-NEXT: renamable $vgpr16 = COPY killed renamable $sgpr0 - ; GCN-NEXT: undef renamable $vgpr0 = COPY killed renamable $vgpr0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN-NEXT: renamable $vgpr1 = COPY killed renamable $vgpr30 - ; GCN-NEXT: renamable $vgpr2 = COPY killed renamable $vgpr29 - ; GCN-NEXT: renamable $vgpr3 = COPY killed renamable $vgpr28 - ; GCN-NEXT: renamable $vgpr4 = COPY killed renamable $vgpr27 - ; GCN-NEXT: renamable $vgpr5 = COPY killed renamable $vgpr26 - ; GCN-NEXT: renamable $vgpr6 = COPY killed renamable $vgpr25 - ; GCN-NEXT: renamable $vgpr7 = COPY killed renamable $vgpr24 - ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr23 - ; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr22 - ; GCN-NEXT: renamable $vgpr10 = COPY killed renamable $vgpr21 - ; GCN-NEXT: renamable $vgpr11 = COPY killed renamable $vgpr20 - ; GCN-NEXT: renamable $vgpr12 = COPY killed renamable $vgpr19 - ; GCN-NEXT: renamable $vgpr13 = COPY killed renamable $vgpr18 - ; GCN-NEXT: renamable $vgpr14 = COPY killed renamable $vgpr17 - ; GCN-NEXT: renamable $vgpr15 = COPY killed renamable $vgpr16 - ; GCN-NEXT: SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5) + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr15 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr14 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr13 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr12 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr11 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr10 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr9 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr8 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr7 + ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr6 + ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr5 + ; GCN-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr4 + ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr3 + ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr2 + ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr1 + ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr0 + ; GCN-NEXT: undef %28.sub0:vreg_512 = COPY [[COPY1]] + ; GCN-NEXT: %28.sub1:vreg_512 = COPY [[COPY2]] + ; GCN-NEXT: %28.sub2:vreg_512 = COPY [[COPY3]] + ; GCN-NEXT: %28.sub3:vreg_512 = COPY [[COPY4]] + ; GCN-NEXT: %28.sub4:vreg_512 = COPY [[COPY5]] + ; GCN-NEXT: %28.sub5:vreg_512 = COPY [[COPY6]] + ; GCN-NEXT: %28.sub6:vreg_512 = COPY [[COPY7]] + ; GCN-NEXT: %28.sub7:vreg_512 = COPY [[COPY8]] + ; GCN-NEXT: %28.sub8:vreg_512 = COPY [[COPY9]] + ; GCN-NEXT: %28.sub9:vreg_512 = COPY [[COPY10]] + ; GCN-NEXT: %28.sub10:vreg_512 = COPY [[COPY11]] + ; GCN-NEXT: %28.sub11:vreg_512 = COPY [[COPY12]] + ; GCN-NEXT: %28.sub12:vreg_512 = COPY [[COPY13]] + ; GCN-NEXT: %28.sub13:vreg_512 = COPY [[COPY14]] + ; GCN-NEXT: %28.sub14:vreg_512 = COPY [[COPY15]] + ; GCN-NEXT: %28.sub15:vreg_512 = COPY [[COPY16]] ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) - ; GCN-NEXT: $vgpr17 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) - ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5) - ; GCN-NEXT: $vgpr16 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr16, implicit $exec - ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, $vgpr16, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: dead %45:vgpr_32 = COPY [[DEF]] + ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, [[COPY]](s32), implicit $exec ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: renamable $vgpr0 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec - ; GCN-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) - ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; GCN-NEXT: [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 %28, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec + ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]] ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY renamable $sgpr0_sgpr1 - ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) + ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) ; GCN-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; GCN-NEXT: {{ $}} @@ -103,9 +99,8 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN-NEXT: $exec = S_MOV_B64 renamable $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1) + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]], killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index b8bb3a5a242a..3d597b998a65 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -227,14 +227,14 @@ entry: ; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s{{[0-9]+}} ; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], s{{[0-9]+}} +; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec -; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill ; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -251,7 +251,7 @@ entry: ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte Folded Reload ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill @@ -270,10 +270,10 @@ entry: ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] ; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -297,10 +297,10 @@ entry: ; W64-O0: s_xor_b64 exec, exec, [[SAVE]] ; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] +; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload ; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] ; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] ; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill ; W64-O0: [[TERMBB]]: diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 1e424ecde23d..e00825b6f4f0 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -19,10 +19,10 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: v_writelane_b32 v40, s33, 2 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: .Ltmp0: -; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _ZL13sleep_foreverv@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _ZL13sleep_foreverv@gotpcrel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index b7d45756f9c3..515253e6a43f 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -191,23 +191,23 @@ define void @spill_sgpr_with_tail_call() #0 { ; we have no VGPR to allocate for SGPR spills. We are forced to spill to memory. ; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr: -; GCN: v_writelane_b32 [[A:v[0-9]+]], s34, 0 -; GCN: buffer_store_dword [[A]], off, s[0:3], s32 -; GCN: v_writelane_b32 [[B:v[0-9]+]], s35, 0 -; GCN: buffer_store_dword [[B]], off, s[0:3], s32 -; GCN: v_writelane_b32 [[C:v[0-9]+]], s36, 0 -; GCN: buffer_store_dword [[C]], off, s[0:3], s32 -; GCN: v_writelane_b32 [[D:v[0-9]+]], s37, 0 -; GCN: buffer_store_dword [[D]], off, s[0:3], s32 +; GCN: v_writelane_b32 v{{[0-9]+}}, s34, 0 +; GCN: v_writelane_b32 v{{[0-9]+}}, s35, 1 +; GCN: v_writelane_b32 v{{[0-9]+}}, s36, 2 +; GCN: v_writelane_b32 v{{[0-9]+}}, s37, 3 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN: #ASMEND -; GCN: buffer_load_dword [[E:v[0-9]+]] -; GCN: v_readlane_b32 s37, [[E]], 0 -; GCN: buffer_load_dword [[F:v[0-9]+]] -; GCN: v_readlane_b32 s36, [[F]], 0 -; GCN: buffer_load_dword [[G:v[0-9]+]] -; GCN: v_readlane_b32 s35, [[G]], 0 -; GCN: buffer_load_dword [[H:v[0-9]+]] -; GCN: v_readlane_b32 s34, [[H]], 0 +; GCN: buffer_load_dword v{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}} +; GCN: v_readlane_b32 s37, v{{[0-9]+}}, 3 +; GCN: v_readlane_b32 s36, v{{[0-9]+}}, 2 +; GCN: v_readlane_b32 s35, v{{[0-9]+}}, 1 +; GCN: v_readlane_b32 s34, v{{[0-9]+}}, 0 define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %a = load <4 x i32>, <4 x i32> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 29f8c60ad281..5d3db4ea38c6 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -133,7 +133,7 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -144,18 +144,18 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s39, s7 ; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] ; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] -; GFX9-O0-NEXT: v_writelane_b32 v5, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v5, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v5, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v5, s43, 3 +; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -165,23 +165,23 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 5 +; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5 ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -194,19 +194,20 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge -; GFX9-O0-NEXT: v_readlane_b32 s34, v5, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v5, 5 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s36, v5, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v5, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v5, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v5, 3 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 3 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v4 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s34, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 @@ -215,7 +216,7 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]