From ed2213e6efc1cc71a323f49585e859d6e41853ed Mon Sep 17 00:00:00 2001 From: Marek Olsak Date: Mon, 14 Mar 2016 15:57:14 +0000 Subject: [PATCH] AMDGPU/SI: Incomplete shader binaries need to finish execution at the end Reviewers: tstellarAMD, arsenm Subscribers: arsenm Differential Revision: http://reviews.llvm.org/D18058 llvm-svn: 263441 --- llvm/lib/Target/AMDGPU/SIInsertWaits.cpp | 8 --- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 24 ++++++++ llvm/test/CodeGen/AMDGPU/ret_jump.ll | 57 +++++++++++++++++++ 3 files changed, 81 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/ret_jump.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index 58bfbd989282..85fa0a835e79 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -558,14 +558,6 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); - - // Functions returning something shouldn't contain S_ENDPGM, because other - // bytecode will be appended after it. 
- if (!ReturnsVoid) { - MachineBasicBlock::iterator I = MBB.getFirstTerminator(); - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) - I->eraseFromParent(); - } } return Changes; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index edcfb0889bb6..7dd0d7bc8f78 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -486,6 +486,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { + MachineBasicBlock *EmptyMBBAtEnd = NULL; MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; for (I = MBB.begin(); I != MBB.end(); I = Next) { @@ -562,6 +563,29 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_INDIRECT_DST_V16: IndirectDst(MI); break; + + case AMDGPU::S_ENDPGM: { + if (MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()) + break; + + // Graphics shaders returning non-void shouldn't contain S_ENDPGM, + // because external bytecode will be appended at the end. + if (BI != --MF.end() || I != MBB.getFirstTerminator()) { + // S_ENDPGM is not the last instruction. Add an empty block at + // the end and jump there. 
+ if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB.addSuccessor(EmptyMBBAtEnd); + BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + } + + I->eraseFromParent(); + break; + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll new file mode 100644 index 000000000000..0bbbc7a9d67a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +target triple = "amdgcn--" + +; GCN-LABEL: {{^}}main: +; GCN: BB0_3: +; GCN-NEXT: s_branch [[LASTBB:BB[0-9]*_[0-9]*]] +; GCN-NEXT: BB0_ +; GCN: [[LASTBB]] +; GCN-NEXT: .Lfunc_end0: +; ModuleID = 'bugpoint-reduced-simplified.bc' +target triple = "amdgcn--" + +define <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 { +main_body: + %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) + %p87 = fmul float undef, %p83 + %p88 = fadd float %p87, undef + %p93 = fadd float %p88, undef + %p97 = fmul float %p93, undef + %p102 = fsub float %p97, undef + %p104 = fmul float %p102, undef + %p106 = fadd float 0.000000e+00, %p104 + %p108 = fadd float undef, %p106 + br i1 undef, label %ENDIF69, label %ELSE + +ELSE: ; preds = %main_body + %p124 = fmul float %p108, %p108 + %p125 = fsub float %p124, undef + %p126 = fcmp olt float %p125, 0.000000e+00 + br i1 %p126, label 
%ENDIF69, label %ELSE41 + +ELSE41: ; preds = %ELSE + unreachable + +ENDIF69: ; preds = %ELSE, %main_body + ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.fabs.f32(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.sqrt.f32(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.floor.f32(float) #1 + +attributes #0 = { "InitialPSInputAddr"="36983" "ShaderType"="0" } +attributes #1 = { nounwind readnone }