From 70ab76a81b982354fea7f0cee85f7001ad30b2e6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 20 Apr 2021 14:44:41 -0400
Subject: [PATCH] AMDGPU: Fix indirect tail calls

Fix a selection error on uniform callees, and use a regular call if
divergent.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  5 ++++
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  5 ++++
 llvm/test/CodeGen/AMDGPU/sibling-call.ll  | 30 +++++++++++++++++++++++
 3 files changed, 40 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 785f2191dc9e..ae68c96d5e92 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2851,6 +2851,11 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
   if (!mayTailCallThisCC(CalleeCC))
     return false;
 
+  // For a divergent call target, we need to do a waterfall loop over the
+  // possible callees which precludes us from using a simple jump.
+  if (Callee->isDivergent())
+    return false;
+
   MachineFunction &MF = DAG.getMachineFunction();
   const Function &CallerF = MF.getFunction();
   CallingConv::ID CallerCC = CallerF.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 381f262abfea..6a4f984fe556 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -492,6 +492,11 @@ def SI_TCRETURN : SPseudoInstSI <(outs),
   let isConvergent = 1;
 }
 
+// Handle selecting indirect tail calls
+def : GCNPat<
+  (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
+  (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff)
+>;
 
 def ADJCALLSTACKUP : SPseudoInstSI<
   (outs), (ins i32imm:$amt0, i32imm:$amt1),
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index c8aea669161e..a6a1061a295b 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -272,5 +272,35 @@ entry:
   ret i32 %ret
 }
 
+@func_ptr_gv = external unnamed_addr addrspace(4) constant i32(i32, i32)*, align 4
+
+; Do support tail calls with a uniform, but unknown, callee.
+; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32:
+; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]]
+; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]]
+; GCN: s_setpc_b64 [[FUNC_PTR]]
+define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %func.ptr.load = load i32(i32, i32)*, i32(i32, i32)* addrspace(4)* @func_ptr_gv
+  %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
+  ret i32 %ret
+}
+
+; We can't support a tail call to a divergent target. Use a waterfall
+; loop around a regular call
+; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32:
+; GCN: v_readfirstlane_b32
+; GCN: v_readfirstlane_b32
+; GCN: s_and_saveexec_b64
+; GCN: s_swappc_b64
+; GCN: s_cbranch_execnz
+; GCN: s_setpc_b64
+define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(i32(i32, i32)* %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
+entry:
+  %add = add i32 %b, %c
+  %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
+  ret i32 %ret
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind noinline }