From 4d000d24889670bb433eb3379e8936c6fb1ab615 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 19 Jun 2019 20:44:15 +0000 Subject: [PATCH] AMDGPU: Fix folding immediate into readfirstlane through reg_sequence The def instruction for the vreg may not match, because it may be folding through a reg_sequence. The assert was overly conservative and not necessary. It's not actually important if DefMI really defined the register, because the fold that will be done cares about the def of the value that will be folded. For some reason copies aren't making it through the reg_sequence, although they should. llvm-svn: 363876 --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 7 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 - .../AMDGPU/constant-address-space-32bit.ll | 12 ++ llvm/test/CodeGen/AMDGPU/fold-readlane.mir | 123 ++++++++++++++++++ 4 files changed, 141 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 4a1fc1332c36..3f566884f6b0 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -436,9 +436,11 @@ void SIFoldOperands::foldOperand( unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + MachineRegisterInfo::use_iterator Next; for (MachineRegisterInfo::use_iterator RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); - RSUse != RSE; ++RSUse) { + RSUse != RSE; RSUse = Next) { + Next = std::next(RSUse); MachineInstr *RSUseMI = RSUse->getParent(); if (RSUse->getSubReg() != RegSeqDstSubReg) @@ -523,6 +525,9 @@ void SIFoldOperands::foldOperand( return; UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + + // FIXME: ChangeToImmediate should clear subreg + UseMI->getOperand(1).setSubReg(0); UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) return; diff --git 
a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 56935b35734a..5831abb8071c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6079,7 +6079,6 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, const MachineInstr &DefMI, const MachineInstr *UseMI) { assert(MRI.isSSA() && "Must be run on SSA"); - assert(DefMI.definesRegister(VReg) && "wrong def instruction"); auto *TRI = MRI.getTargetRegisterInfo(); auto *DefBB = DefMI.getParent(); diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 040bcbc01827..e90c85545b00 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -279,12 +279,24 @@ define amdgpu_vs float @load_addr_no_fold(i32 addrspace(6)* inreg noalias %p0) # ret float %r2 } +; CHECK-LABEL: {{^}}vgpr_arg_src: +; CHECK: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0 +; CHECK: s_mov_b32 s[[ZERO:[0-9]+]] +; CHECK: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[READLANE]]:[[ZERO]]{{\]}} +define amdgpu_vs float @vgpr_arg_src(<4 x i32> addrspace(6)* %arg) { +main_body: + %tmp9 = load <4 x i32>, <4 x i32> addrspace(6)* %arg + %tmp10 = call nsz float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp9, i32 undef, i32 0, i32 0, i32 0) #1 + ret float %tmp10 +} + ; Function Attrs: nounwind readnone speculatable declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #7 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #7 !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir index 55b7a612d777..3c68686aa4a6 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir 
+++ b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir @@ -248,3 +248,126 @@ body: | %1:sreg_32_xm0 = S_MOV_B32 12 %2:sreg_32_xm0 = V_READLANE_B32 %0, %1, implicit $exec ... + +# Constant for subreg1 (subreg0 is a VGPR copy) +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence0{{$}} + +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +--- +name: fold-imm-readfirstlane-regsequence0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# Constant for subreg0 (subreg1 is a VGPR copy) +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence1{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, killed %0, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec + +--- +name: fold-imm-readfirstlane-regsequence1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, killed %0:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +...
+ +# Different constant regs for each subreg +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence2{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 1 +--- +name: fold-imm-readfirstlane-regsequence2 +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# Same constant reg for each subreg, so there are multiple constant uses +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence3{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +--- +name: fold-imm-readfirstlane-regsequence3 +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... 
+ +# FIXME: This should fold +# GCN-LABEL: name: fold-copy-readfirstlane-regsequence0{{$}} +# GCN: %0:vgpr_32 = COPY $sgpr10 +# GCN-NEXT: %1:vgpr_32 = COPY $sgpr11 +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec +--- +name: fold-copy-readfirstlane-regsequence0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10, $sgpr11 + %0:vgpr_32 = COPY $sgpr10 + %1:vgpr_32 = COPY $sgpr11 + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# GCN-LABEL: name: fold-copy-readfirstlane-regsequence1{{$}} +# GCN: %0:sreg_32_xm0 = COPY $sgpr10 +# GCN-NEXT: %1:sreg_32_xm0 = COPY $sgpr11 +# GCN-NEXT: %2:vgpr_32 = COPY %0 +# GCN-NEXT: %3:vgpr_32 = COPY %1 +# GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, killed %3, %subreg.sub1 +# GCN-NEXT: %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec +# GCN-NEXT: %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec +--- +name: fold-copy-readfirstlane-regsequence1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10, $sgpr11 + %0:sreg_32_xm0 = COPY $sgpr10 + %1:sreg_32_xm0 = COPY $sgpr11 + %2:vgpr_32 = COPY %0 + %3:vgpr_32 = COPY %1 + %4:vreg_64 = REG_SEQUENCE %2:vgpr_32, %subreg.sub0, killed %3:vgpr_32, %subreg.sub1 + %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0:vreg_64, implicit $exec + %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1:vreg_64, implicit $exec +...