2019-02-06 03:50:32 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
|
|
|
; RUN: llc -O0 -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -stop-after=regallocfast < %s | FileCheck -check-prefixes=GCN %s
|
|
|
|
|
|
|
|
; Verify that we consider the xor at the end of the waterfall loop emitted for
|
|
|
|
; divergent indirect addressing as a terminator.
|
|
|
|
|
|
|
|
; Intrinsic declaration: the kernel below calls this to obtain the work-item id
; from which it derives the extractelement index.
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
|
|
|
|
|
|
|
; There should be no spill code inserted between the xor and the real terminator
|
|
|
|
define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
|
|
|
|
; GCN-LABEL: name: extract_w_offset_vgpr
|
|
|
|
; GCN: bb.0.entry:
|
|
|
|
; GCN: successors: %bb.1(0x80000000)
|
|
|
|
; GCN: liveins: $vgpr0, $sgpr0_sgpr1
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5)
|
2019-05-01 06:08:23 +08:00
|
|
|
; GCN: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 8 from %ir.out.kernarg.offset.cast, align 4, addrspace 4)
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: renamable $sgpr6 = COPY renamable $sgpr1
|
2019-05-16 20:50:39 +08:00
|
|
|
; GCN: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: renamable $sgpr4 = S_MOV_B32 61440
|
|
|
|
; GCN: renamable $sgpr5 = S_MOV_B32 -1
|
|
|
|
; GCN: undef renamable $sgpr0 = COPY killed renamable $sgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
|
|
|
|
; GCN: renamable $sgpr1 = COPY killed renamable $sgpr6
|
|
|
|
; GCN: renamable $sgpr2 = COPY killed renamable $sgpr5
|
|
|
|
; GCN: renamable $sgpr3 = COPY killed renamable $sgpr4
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.2, align 4, addrspace 5)
|
2019-05-16 20:50:39 +08:00
|
|
|
; GCN: renamable $sgpr0 = S_MOV_B32 16
|
|
|
|
; GCN: renamable $sgpr1 = S_MOV_B32 15
|
|
|
|
; GCN: renamable $sgpr2 = S_MOV_B32 14
|
2020-01-22 06:27:57 +08:00
|
|
|
; GCN: renamable $sgpr3 = S_MOV_B32 13
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: renamable $sgpr4 = S_MOV_B32 12
|
|
|
|
; GCN: renamable $sgpr5 = S_MOV_B32 11
|
|
|
|
; GCN: renamable $sgpr6 = S_MOV_B32 10
|
|
|
|
; GCN: renamable $sgpr7 = S_MOV_B32 9
|
|
|
|
; GCN: renamable $sgpr8 = S_MOV_B32 8
|
|
|
|
; GCN: renamable $sgpr9 = S_MOV_B32 7
|
|
|
|
; GCN: renamable $sgpr10 = S_MOV_B32 6
|
|
|
|
; GCN: renamable $sgpr11 = S_MOV_B32 5
|
|
|
|
; GCN: renamable $sgpr12 = S_MOV_B32 3
|
|
|
|
; GCN: renamable $sgpr13 = S_MOV_B32 2
|
|
|
|
; GCN: renamable $sgpr14 = S_MOV_B32 1
|
|
|
|
; GCN: renamable $sgpr15 = S_MOV_B32 0
|
|
|
|
; GCN: renamable $vgpr0 = COPY killed renamable $sgpr15
|
|
|
|
; GCN: renamable $vgpr30 = COPY killed renamable $sgpr14
|
|
|
|
; GCN: renamable $vgpr29 = COPY killed renamable $sgpr13
|
|
|
|
; GCN: renamable $vgpr28 = COPY killed renamable $sgpr12
|
|
|
|
; GCN: renamable $vgpr27 = COPY killed renamable $sgpr11
|
|
|
|
; GCN: renamable $vgpr26 = COPY killed renamable $sgpr10
|
|
|
|
; GCN: renamable $vgpr25 = COPY killed renamable $sgpr9
|
|
|
|
; GCN: renamable $vgpr24 = COPY killed renamable $sgpr8
|
|
|
|
; GCN: renamable $vgpr23 = COPY killed renamable $sgpr7
|
|
|
|
; GCN: renamable $vgpr22 = COPY killed renamable $sgpr6
|
|
|
|
; GCN: renamable $vgpr21 = COPY killed renamable $sgpr5
|
|
|
|
; GCN: renamable $vgpr20 = COPY killed renamable $sgpr4
|
|
|
|
; GCN: renamable $vgpr19 = COPY killed renamable $sgpr3
|
|
|
|
; GCN: renamable $vgpr18 = COPY killed renamable $sgpr2
|
|
|
|
; GCN: renamable $vgpr17 = COPY killed renamable $sgpr1
|
2019-05-16 20:50:39 +08:00
|
|
|
; GCN: renamable $vgpr16 = COPY killed renamable $sgpr0
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: undef renamable $vgpr0 = COPY killed renamable $vgpr0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
|
|
|
|
; GCN: renamable $vgpr1 = COPY killed renamable $vgpr30
|
|
|
|
; GCN: renamable $vgpr2 = COPY killed renamable $vgpr29
|
|
|
|
; GCN: renamable $vgpr3 = COPY killed renamable $vgpr28
|
|
|
|
; GCN: renamable $vgpr4 = COPY killed renamable $vgpr27
|
|
|
|
; GCN: renamable $vgpr5 = COPY killed renamable $vgpr26
|
|
|
|
; GCN: renamable $vgpr6 = COPY killed renamable $vgpr25
|
|
|
|
; GCN: renamable $vgpr7 = COPY killed renamable $vgpr24
|
|
|
|
; GCN: renamable $vgpr8 = COPY killed renamable $vgpr23
|
|
|
|
; GCN: renamable $vgpr9 = COPY killed renamable $vgpr22
|
|
|
|
; GCN: renamable $vgpr10 = COPY killed renamable $vgpr21
|
|
|
|
; GCN: renamable $vgpr11 = COPY killed renamable $vgpr20
|
|
|
|
; GCN: renamable $vgpr12 = COPY killed renamable $vgpr19
|
|
|
|
; GCN: renamable $vgpr13 = COPY killed renamable $vgpr18
|
|
|
|
; GCN: renamable $vgpr14 = COPY killed renamable $vgpr17
|
|
|
|
; GCN: renamable $vgpr15 = COPY killed renamable $vgpr16
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.1, $sgpr32, 0, implicit $exec :: (store 64 into %stack.1, align 4, addrspace 5)
|
2020-09-15 21:16:14 +08:00
|
|
|
; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store 8 into %stack.0, align 4, addrspace 5)
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: renamable $vgpr0 = IMPLICIT_DEF
|
|
|
|
; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
|
2019-02-06 03:50:32 +08:00
|
|
|
; GCN: bb.1:
|
[AMDGPU] SI_INDIRECT_DST_V* pseudos expansion should place EXEC restore to separate basic block
Summary:
When SI_INDIRECT_DST_V* pseudos have their index in a VGPR, they are expanded into a self-looping basic block that modifies EXEC on every iteration.
To keep EXEC consistent, it is saved before the expanded code and restored after it.
%95:vreg_512 = SI_INDIRECT_DST_V16 %93:vreg_512(tied-def 0), %94:sreg_32, 0, killed %1500:vgpr_32
results in
s_mov_b64 s[6:7], exec
BB0_16:
v_readfirstlane_b32 s8, v28
v_cmp_eq_u32_e32 vcc, s8, v28
s_and_saveexec_b64 vcc, vcc
s_set_gpr_idx_on s8, gpr_idx(DST)
v_mov_b32_e32 v6, v25
s_set_gpr_idx_off
s_xor_b64 exec, exec, vcc
s_cbranch_execnz BB0_16
; %bb.17:
s_mov_b64 exec, s[6:7]
The bug appeared in case this expansion occurs in the ELSE block of the CF.
Originally
%110:vreg_512 = SI_INDIRECT_DST_V16 %103:vreg_512(tied-def 0), %85:vgpr_32, 0, %107:vgpr_32,
%112:sreg_64 = SI_ELSE %108:sreg_64, %bb.19, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
expanded to
****************** <== here exec has "THEN" context
s_mov_b64 s[6:7], exec
BB0_16:
v_readfirstlane_b32 s8, v28
v_cmp_eq_u32_e32 vcc, s8, v28
s_and_saveexec_b64 vcc, vcc
s_set_gpr_idx_on s8, gpr_idx(DST)
v_mov_b32_e32 v6, v25
s_set_gpr_idx_off
s_xor_b64 exec, exec, vcc
s_cbranch_execnz BB0_16
; %bb.17:
s_or_saveexec_b64 s[4:5], s[4:5] <-- exec mask is restored for "ELSE" but immediately overwritten.
s_mov_b64 exec, s[6:7]
The rest of the "ELSE" block is then executed not by the work-items that constitute the "else" mask but by those that constitute the "then" mask.
SILowerControlFlow::emitElse always uses the basic block's begin() as the insertion point for s_or_saveexec.
Proposed fix: The SI_INDIRECT_DST_V* expansion should split the remainder block to create a landing pad for the EXEC restoration.
Reviewers: rampitec, vpykhtin, nhaehnle
Reviewed By: vpykhtin
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D75472
2020-03-10 18:59:11 +08:00
|
|
|
; GCN: successors: %bb.1(0x40000000), %bb.3(0x40000000)
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.4, align 4, addrspace 5)
|
|
|
|
; GCN: $vgpr17 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load 4 from %stack.5, addrspace 5)
|
|
|
|
; GCN: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 64 from %stack.1, align 4, addrspace 5)
|
|
|
|
; GCN: $vgpr16 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5)
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr16, implicit $exec
|
|
|
|
; GCN: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, $vgpr16, implicit $exec
|
|
|
|
; GCN: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec
|
2020-06-02 21:22:40 +08:00
|
|
|
; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit-def undef $mode, implicit $m0, implicit $mode
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: renamable $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: SI_SPILL_V32_SAVE $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store 4 into %stack.6, addrspace 5)
|
2020-06-02 21:22:40 +08:00
|
|
|
; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.5, $sgpr32, 0, implicit $exec :: (store 4 into %stack.5, addrspace 5)
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: renamable $sgpr2_sgpr3 = COPY renamable $sgpr0_sgpr1
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store 8 into %stack.4, align 4, addrspace 5)
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
|
2019-02-06 03:50:32 +08:00
|
|
|
; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
|
[AMDGPU] SI_INDIRECT_DST_V* pseudos expansion should place EXEC restore to separate basic block
Summary:
When SI_INDIRECT_DST_V* pseudos have their index in a VGPR, they are expanded into a self-looping basic block that modifies EXEC on every iteration.
To keep EXEC consistent, it is saved before the expanded code and restored after it.
%95:vreg_512 = SI_INDIRECT_DST_V16 %93:vreg_512(tied-def 0), %94:sreg_32, 0, killed %1500:vgpr_32
results in
s_mov_b64 s[6:7], exec
BB0_16:
v_readfirstlane_b32 s8, v28
v_cmp_eq_u32_e32 vcc, s8, v28
s_and_saveexec_b64 vcc, vcc
s_set_gpr_idx_on s8, gpr_idx(DST)
v_mov_b32_e32 v6, v25
s_set_gpr_idx_off
s_xor_b64 exec, exec, vcc
s_cbranch_execnz BB0_16
; %bb.17:
s_mov_b64 exec, s[6:7]
The bug appeared in case this expansion occurs in the ELSE block of the CF.
Originally
%110:vreg_512 = SI_INDIRECT_DST_V16 %103:vreg_512(tied-def 0), %85:vgpr_32, 0, %107:vgpr_32,
%112:sreg_64 = SI_ELSE %108:sreg_64, %bb.19, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
expanded to
****************** <== here exec has "THEN" context
s_mov_b64 s[6:7], exec
BB0_16:
v_readfirstlane_b32 s8, v28
v_cmp_eq_u32_e32 vcc, s8, v28
s_and_saveexec_b64 vcc, vcc
s_set_gpr_idx_on s8, gpr_idx(DST)
v_mov_b32_e32 v6, v25
s_set_gpr_idx_off
s_xor_b64 exec, exec, vcc
s_cbranch_execnz BB0_16
; %bb.17:
s_or_saveexec_b64 s[4:5], s[4:5] <-- exec mask is restored for "ELSE" but immediately overwritten.
s_mov_b64 exec, s[6:7]
The rest of the "ELSE" block is then executed not by the work-items that constitute the "else" mask but by those that constitute the "then" mask.
SILowerControlFlow::emitElse always uses the basic block's begin() as the insertion point for s_or_saveexec.
Proposed fix: The SI_INDIRECT_DST_V* expansion should split the remainder block to create a landing pad for the EXEC restoration.
Reviewers: rampitec, vpykhtin, nhaehnle
Reviewed By: vpykhtin
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D75472
2020-03-10 18:59:11 +08:00
|
|
|
; GCN: bb.3:
|
|
|
|
; GCN: successors: %bb.2(0x80000000)
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 8 from %stack.0, align 4, addrspace 5)
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: $exec = S_MOV_B64 renamable $sgpr0_sgpr1
|
[AMDGPU] SI_INDIRECT_DST_V* pseudos expansion should place EXEC restore to separate basic block
Summary:
When SI_INDIRECT_DST_V* pseudos have their index in a VGPR, they are expanded into a self-looping basic block that modifies EXEC on every iteration.
To keep EXEC consistent, it is saved before the expanded code and restored after it.
%95:vreg_512 = SI_INDIRECT_DST_V16 %93:vreg_512(tied-def 0), %94:sreg_32, 0, killed %1500:vgpr_32
results in
s_mov_b64 s[6:7], exec
BB0_16:
v_readfirstlane_b32 s8, v28
v_cmp_eq_u32_e32 vcc, s8, v28
s_and_saveexec_b64 vcc, vcc
s_set_gpr_idx_on s8, gpr_idx(DST)
v_mov_b32_e32 v6, v25
s_set_gpr_idx_off
s_xor_b64 exec, exec, vcc
s_cbranch_execnz BB0_16
; %bb.17:
s_mov_b64 exec, s[6:7]
The bug appeared in case this expansion occurs in the ELSE block of the CF.
Originally
%110:vreg_512 = SI_INDIRECT_DST_V16 %103:vreg_512(tied-def 0), %85:vgpr_32, 0, %107:vgpr_32,
%112:sreg_64 = SI_ELSE %108:sreg_64, %bb.19, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
expanded to
****************** <== here exec has "THEN" context
s_mov_b64 s[6:7], exec
BB0_16:
v_readfirstlane_b32 s8, v28
v_cmp_eq_u32_e32 vcc, s8, v28
s_and_saveexec_b64 vcc, vcc
s_set_gpr_idx_on s8, gpr_idx(DST)
v_mov_b32_e32 v6, v25
s_set_gpr_idx_off
s_xor_b64 exec, exec, vcc
s_cbranch_execnz BB0_16
; %bb.17:
s_or_saveexec_b64 s[4:5], s[4:5] <-- exec mask is restored for "ELSE" but immediately overwritten.
s_mov_b64 exec, s[6:7]
The rest of the "ELSE" block is then executed not by the work-items that constitute the "else" mask but by those that constitute the "then" mask.
SILowerControlFlow::emitElse always uses the basic block's begin() as the insertion point for s_or_saveexec.
Proposed fix: The SI_INDIRECT_DST_V* expansion should split the remainder block to create a landing pad for the EXEC restoration.
Reviewers: rampitec, vpykhtin, nhaehnle
Reviewed By: vpykhtin
Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D75472
2020-03-10 18:59:11 +08:00
|
|
|
; GCN: bb.2:
|
2020-11-10 08:40:35 +08:00
|
|
|
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load 4 from %stack.6, addrspace 5)
|
|
|
|
; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.2, align 4, addrspace 5)
|
2020-09-22 20:55:54 +08:00
|
|
|
; GCN: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
|
2019-05-04 03:06:57 +08:00
|
|
|
; GCN: S_ENDPGM 0
|
2019-02-06 03:50:32 +08:00
|
|
|
entry:
|
|
|
|
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
|
|
|
|
%index = add i32 %id, 1
|
|
|
|
%value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
|
|
|
|
store i32 %value, i32 addrspace(1)* %out
|
|
|
|
ret void
|
|
|
|
}
|