forked from OSchip/llvm-project
AMDGPU: Allow tail calls for amdgpu_gfx functions
This commit is contained in:
parent
10d54e2f8d
commit
1dd23c6d53
|
@ -2829,6 +2829,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) {
|
|||
static bool mayTailCallThisCC(CallingConv::ID CC) {
|
||||
switch (CC) {
|
||||
case CallingConv::C:
|
||||
case CallingConv::AMDGPU_Gfx:
|
||||
return true;
|
||||
default:
|
||||
return canGuaranteeTCO(CC);
|
||||
|
|
|
@ -3294,66 +3294,30 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
|
|||
; GFX9-LABEL: tail_call_byval_align16:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:8
|
||||
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
|
||||
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
|
||||
; GFX9-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
|
||||
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[4:5]
|
||||
;
|
||||
; GFX10-LABEL: tail_call_byval_align16:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x400
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12
|
||||
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8
|
||||
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
|
||||
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
|
||||
; GFX10-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
|
||||
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
|
||||
; GFX10-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
entry:
|
||||
%alloca = alloca double, align 8, addrspace(5)
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s
|
||||
|
||||
; Callee taking one VGPR argument (%v.arg0) and one SGPR argument (%s.arg1,
; marked inreg); it returns the floating-point sum of the two.
|
||||
define hidden amdgpu_gfx float @callee(float %v.arg0, float inreg %s.arg1) {
|
||||
; GCN-LABEL: callee:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
%add = fadd float %v.arg0, %s.arg1
|
||||
ret float %add
|
||||
}
|
||||
|
||||
; Caller that adds 1.0 to its argument and tail-calls @callee with the sum and
; the constant 2.0, returning the callee's result unchanged. The second call
; operand is marked inreg so the call site agrees with @callee's inreg
; parameter; the checks below accordingly expect 2.0 to be placed in s4 and the
; call to be emitted as an s_setpc_b64 to @callee's address.
define amdgpu_gfx float @caller(float %arg0) {
|
||||
; GCN-LABEL: caller:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
|
||||
; GCN-NEXT: s_mov_b32 s4, 2.0
|
||||
; GCN-NEXT: s_getpc_b64 s[6:7]
|
||||
; GCN-NEXT: s_add_u32 s6, s6, callee@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s7, s7, callee@rel32@hi+12
|
||||
; GCN-NEXT: s_setpc_b64 s[6:7]
|
||||
%add = fadd float %arg0, 1.0
|
||||
%call = tail call amdgpu_gfx float @callee(float %add, float inreg 2.0)
|
||||
ret float %call
|
||||
}
|
Loading…
Reference in New Issue