diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 4ad7041d329f..91fbf6ad4645 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -160,6 +160,15 @@ namespace {
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+
+    /// \brief If copy instruction \p MI is a virtual register copy, track it in
+    /// the set \p CopySrcRegs and the map \p CopyMIs. If its source register
+    /// was already seen as a copy source, replace the uses of this copy with
+    /// the previously seen copy's destination register.
+    bool foldRedundantCopy(MachineInstr *MI,
+                           SmallSet<unsigned, 4> &CopySrcRegs,
+                           DenseMap<unsigned, MachineInstr *> &CopyMIs);
+
     bool isLoadFoldable(MachineInstr *MI,
                         SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
 
@@ -1335,6 +1344,65 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
   return false;
 }
 
+// FIXME: This is very simple and misses some cases which should be handled when
+// motivating examples are found.
+//
+// The copy rewriting logic should look at uses as well as defs and be able to
+// eliminate copies across blocks.
+//
+// Later copies that are subregister extracts will also not be eliminated since
+// only the first copy is considered.
+//
+// e.g.
+// %vreg1 = COPY %vreg0
+// %vreg2 = COPY %vreg0:sub1
+//
+// Should replace %vreg2 uses with %vreg1:sub1
+bool PeepholeOptimizer::foldRedundantCopy(
+  MachineInstr *MI,
+  SmallSet<unsigned, 4> &CopySrcRegs,
+  DenseMap<unsigned, MachineInstr *> &CopyMIs) {
+  assert(MI->isCopy() && "foldRedundantCopy requires a COPY instruction");
+
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+    return false;
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+    return false;
+
+  if (CopySrcRegs.insert(SrcReg).second) {
+    // First copy of this reg seen.
+    CopyMIs.insert(std::make_pair(SrcReg, MI));
+    return false;
+  }
+  // SrcReg has been copied before; try to reuse that earlier copy's result.
+  MachineInstr *PrevCopy = CopyMIs.find(SrcReg)->second;
+
+  unsigned SrcSubReg = MI->getOperand(1).getSubReg();
+  unsigned PrevSrcSubReg = PrevCopy->getOperand(1).getSubReg();
+
+  // Can't replace different subregister extracts.
+  if (SrcSubReg != PrevSrcSubReg)
+    return false;
+
+  unsigned PrevDstReg = PrevCopy->getOperand(0).getReg();
+
+  // Only replace if the copy register class is the same.
+  //
+  // TODO: If we have multiple copies to different register classes, we may want
+  // to track multiple copies of the same source register.
+  if (MRI->getRegClass(DstReg) != MRI->getRegClass(PrevDstReg))
+    return false;
+  // Make every reference to DstReg refer to the earlier copy's result.
+  MRI->replaceRegWith(DstReg, PrevDstReg);
+
+  // Lifetime of the previous copy has been extended.
+  MRI->clearKillFlags(PrevDstReg);
+  return true;
+}
+
 bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipOptnoneFunction(*MF.getFunction()))
     return false;
@@ -1368,6 +1436,10 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
     DenseMap<unsigned, MachineInstr*> ImmDefMIs;
     SmallSet<unsigned, 16> FoldAsLoadDefCandidates;
 
+    // Set of virtual registers that are copied from.
+ SmallSet CopySrcRegs; + DenseMap CopySrcMIs; + for (MachineBasicBlock::iterator MII = I->begin(), MIE = I->end(); MII != MIE; ) { MachineInstr *MI = &*MII; @@ -1410,6 +1482,13 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { continue; } + if (MI->isCopy() && foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs)) { + LocalMIs.erase(MI); + MI->eraseFromParent(); + Changed = true; + continue; + } + if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) { SeenMoveImm = true; } else { diff --git a/llvm/test/CodeGen/AMDGPU/address-space.ll b/llvm/test/CodeGen/AMDGPU/address-space.ll index 4be8c5847529..3aa2f653bf9c 100644 --- a/llvm/test/CodeGen/AMDGPU/address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/address-space.ll @@ -5,15 +5,11 @@ %struct.foo = type { [3 x float], [3 x float] } -; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is -; already in a VGPR after the first read. - ; CHECK-LABEL: {{^}}do_as_ptr_calcs: ; CHECK: s_load_dword [[SREG1:s[0-9]+]], -; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]] ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] ; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20 +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { entry: %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index 7debc130a64c..a3cb158be7b5 100644 --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -41,6 +41,32 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa ret void } +; GCN-LABEL: {{^}}test_use_s_v_s: +; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dword [[SB:s[0-9]+]], 
s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} + +; GCN: buffer_load_dword [[VA0:v[0-9]+]] +; GCN-NOT: v_mov_b32 +; GCN: buffer_load_dword [[VA1:v[0-9]+]] + +; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN-NOT: v_mov_b32 + +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]] +; GCN: buffer_store_dword [[RESULT0]] +; GCN: buffer_store_dword [[RESULT1]] +define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { + %va0 = load volatile float, float addrspace(1)* %in + %va1 = load volatile float, float addrspace(1)* %in + %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1 + %fma1 = call float @llvm.fma.f32(float %a, float %va1, float %b) #1 + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc @@ -199,14 +225,11 @@ define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out ; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 +; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]] -; FIXME: Why do we end up with 2 copies of the same SGPR? 
These should be CSE'd -; GCN: v_mov_b32_e32 [[VS1_1:v[0-9]+]], [[SGPR1]] -; GCN: v_mov_b32_e32 [[VS1_0:v[0-9]+]], [[SGPR1]] - -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1_0]], [[VK0]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]] ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000 -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1_1]], [[VK1]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] @@ -224,24 +247,16 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { ; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} ; GCN-DAG: s_mov_b32 s[[SK0_SUB1:[0-9]+]], 0x40900000 ; GCN-DAG: s_mov_b32 s[[SZERO:[0-9]+]], 0{{$}} -; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB0:[0-9]+]], s[[SZERO]] -; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], s[[SK0_SUB1]] - -; GCN-DAG: s_mov_b32 s[[SK1_SUB0:[0-9]+]], 0x40b00000{{$}} - -; FIXME: Redundant copies -; GCN: v_mov_b32_e32 v[[VS1_1_SUB0:[0-9]+]], s[[SGPR1_SUB0]] -; GCN: v_mov_b32_e32 v[[VS1_1_SUB1:[0-9]+]], s[[SGPR1_SUB1]] -; GCN: v_mov_b32_e32 v[[VS1_0_SUB0:[0-9]+]], s[[SGPR1_SUB0]] -; GCN: v_mov_b32_e32 v[[VS1_0_SUB1:[0-9]+]], s[[SGPR1_SUB1]] - - -; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_0_SUB0]]:[[VS1_0_SUB1]]{{\]}}, v{{\[}}[[VK0_SUB0]]:[[VK0_SUB1]]{{\]}} +; GCN: v_mov_b32_e32 v[[VK0_SUB0:[0-9]+]], s[[SZERO]] +; GCN: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], s[[SK0_SUB1]] +; GCN-DAG: s_mov_b32 s[[SK1_SUB1:[0-9]+]], 0x40b00000{{$}} +; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]] +; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]] ; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB0:[0-9]+]], s[[SZERO]] -; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], s[[SK1_SUB0]] - -; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], 
v{{\[}}[[VS1_1_SUB0]]:[[VS1_1_SUB1]]{{\]}}, v{{\[}}[[VK1_SUB0]]:[[VK1_SUB1]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], s[[SK1_SUB1]] +; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK0_SUB0]]:[[VK0_SUB1]]{{\]}} +; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK1_SUB0]]:[[VK1_SUB1]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT0]] ; GCN: buffer_store_dwordx2 [[RESULT1]]