AMDGPU: Allow tail calls for amdgpu_gfx functions

This commit is contained in:
Matt Arsenault 2021-03-14 16:14:03 -04:00
parent 10d54e2f8d
commit 1dd23c6d53
3 changed files with 33 additions and 40 deletions

View File

@ -2829,6 +2829,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) {
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
case CallingConv::AMDGPU_Gfx:
return true;
default:
return canGuaranteeTCO(CC);

View File

@ -3294,66 +3294,30 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-LABEL: tail_call_byval_align16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:8
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:12
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: tail_call_byval_align16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x400
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: s_sub_u32 s32, s32, 0x400
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[4:5]
entry:
%alloca = alloca double, align 8, addrspace(5)

View File

@ -0,0 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s
; Callee with SGPR and VGPR arguments
define hidden amdgpu_gfx float @callee(float %v.arg0, float inreg %s.arg1) {
; GCN-LABEL: callee:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v0, s4, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%add = fadd float %v.arg0, %s.arg1
ret float %add
}
define amdgpu_gfx float @caller(float %arg0) {
; GCN-LABEL: caller:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 2.0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, callee@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, callee@rel32@hi+12
; GCN-NEXT: s_setpc_b64 s[6:7]
%add = fadd float %arg0, 1.0
%call = tail call amdgpu_gfx float @callee(float %add, float 2.0)
ret float %call
}