; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
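
; These tests cover how llvm.amdgcn.kill is lowered for amdgpu_ps shaders and
; when skip branches are inserted around code that only surviving lanes should run.

; A kill with a constant true operand should be a no-op: exec is left alone
; and the block simply falls through to s_endpgm.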
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
call void @llvm.amdgcn.kill(i1 true)
ret void
}
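
; A kill with a constant false operand kills everything: exec is cleared with
; a single s_mov_b64.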
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_mov_b64 exec, 0
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
call void @llvm.amdgcn.kill(i1 false)
ret void
}

; FIXME: Ideally only one would be emitted
; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: s_mov_b64 exec, 0
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_mov_b64 exec, 0
; CHECK-NEXT: ; %bb.2:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
call void @llvm.amdgcn.kill(i1 false)
call void @llvm.amdgcn.kill(i1 false)
ret void
}
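
; A non-constant kill condition is expected to lower to a single v_cmpx, which
; updates exec directly; with nothing after the kill there is no need for a
; skip branch.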
; CHECK-LABEL: {{^}}test_kill_depth_var:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
%cmp = fcmp olt float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp)
ret void
}

; FIXME: Ideally only one would be emitted
; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
; CHECK-NEXT: ; %bb.2:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
%cmp = fcmp olt float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp)
call void @llvm.amdgcn.kill(i1 %cmp)
ret void
}
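
; Kills of two independent conditions are each expected to produce their own
; v_cmpx, one per operand register.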
; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
; CHECK-NEXT: ; %bb.2:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
%cmp.x = fcmp olt float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.x)
%cmp.y = fcmp olt float %y, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.y)
ret void
}
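
; Once real work follows a kill, a skip branch (s_cbranch_execnz) over an
; early-exit export/endpgm block is expected after each v_cmpx.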
; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
; CHECK-NEXT: s_cbranch_execnz BB6_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: exp
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: BB6_2:
; CHECK: v_mov_b32_e64 v7, -1
; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
; CHECK-NEXT: s_cbranch_execnz BB6_4
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: exp
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: BB6_4:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
%cmp.x = fcmp olt float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.x)
%y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={v7}"()
%cmp.y = fcmp olt float %y, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.y)
ret void
}
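
; Kill in a block reached through a uniform branch: if all lanes die, the
; program exports null and ends; otherwise execution continues in the split
; block.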
; FIXME: why does the skip depend on the asm length in the same block?

; CHECK-LABEL: {{^}}test_kill_control_flow:
; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]

; CHECK-NEXT: ; %bb.1:
; CHECK: v_mov_b32_e64 v7, -1
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64

; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: ; %bb.2:
; CHECK-NEXT: exp null off, off, off, off done vm
; CHECK-NEXT: s_endpgm

; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
entry:
%cmp = icmp eq i32 %arg, 0
br i1 %cmp, label %bb, label %exit

bb:
%var = call float asm sideeffect "
v_mov_b32_e64 v7, -1
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64", "={v7}"()
%cmp.var = fcmp olt float %var, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.var)
br label %exit

exit:
ret void
}
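
; Same shape as above, but values defined around the kill (v8, v9) are used
; afterwards and must remain live across it.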
; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
; CHECK-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]

; CHECK-NEXT: ; %bb.1: ; %bb
; CHECK: v_mov_b32_e64 v7, -1
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: v_nop_e64
; CHECK: ;;#ASMEND
; CHECK: v_mov_b32_e64 v8, -1
; CHECK: ;;#ASMEND
; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]

; CHECK-NEXT: ; %bb.2:
; CHECK-NEXT: exp null off, off, off, off done vm
; CHECK-NEXT: s_endpgm

; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
; CHECK: buffer_store_dword v8
; CHECK: v_mov_b32_e64 v9, -2

; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}:
; CHECK: buffer_store_dword v9
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
entry:
%cmp = icmp eq i32 %arg, 0
br i1 %cmp, label %bb, label %exit

bb:
%var = call float asm sideeffect "
v_mov_b32_e64 v7, -1
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64", "={v7}"()
%live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={v8}"()
%cmp.var = fcmp olt float %var, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.var)
store volatile float %live.across, float addrspace(1)* undef
%live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={v9}"()
br label %exit

exit:
%phi = phi float [ 0.0, %entry ], [ %live.out, %bb ]
store float %phi, float addrspace(1)* undef
ret void
}
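
; Kill inside a divergent loop body: exec is saved and restored around the
; divergent region, and the kill's v_cmpx is re-executed on every iteration.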
; CHECK-LABEL: {{^}}test_kill_divergent_loop:
; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: s_cbranch_execz [[EXIT]]

; CHECK: {{BB[0-9]+_[0-9]+}}: ; %bb.preheader
; CHECK: s_mov_b32

; CHECK: [[LOOP_BB:BB[0-9]+_[0-9]+]]:

; CHECK: v_mov_b32_e64 v7, -1
; CHECK: v_nop_e64
; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7

; CHECK-NEXT: ; %bb.3:
; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
; CHECK: v_cmp_eq_u32_e32 vcc, 0, [[LOAD]]
; CHECK-NEXT: s_and_b64 vcc, exec, vcc
; CHECK-NEXT: s_cbranch_vccnz [[LOOP_BB]]

; CHECK-NEXT: {{^}}[[EXIT]]:
; CHECK: s_or_b64 exec, exec, [[SAVEEXEC]]
; CHECK: buffer_store_dword
; CHECK: s_endpgm
define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
entry:
%cmp = icmp eq i32 %arg, 0
br i1 %cmp, label %bb, label %exit

bb:
%var = call float asm sideeffect "
v_mov_b32_e64 v7, -1
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64
v_nop_e64", "={v7}"()
%cmp.var = fcmp olt float %var, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.var)
%vgpr = load volatile i32, i32 addrspace(1)* undef
%loop.cond = icmp eq i32 %vgpr, 0
br i1 %loop.cond, label %bb, label %exit

exit:
store volatile i32 8, i32 addrspace(1)* undef
ret void
}
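
; The select result that feeds the kill is also consumed by a phi in a later
; block, so its register must survive the kill.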
; bug 28550
; CHECK-LABEL: {{^}}phi_use_def_before_kill:
; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
; CHECK: v_cmpx_lt_f32_e32 vcc, 0,
; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]]

; CHECK: exp
; CHECK-NEXT: s_endpgm

; CHECK: [[KILLBB:BB[0-9]+_[0-9]+]]:
; CHECK-NEXT: s_cbranch_scc0 [[PHIBB:BB[0-9]+_[0-9]+]]

; CHECK: [[PHIBB]]:
; CHECK: v_cmp_eq_f32_e32 vcc, 0, [[PHIREG]]
; CHECK: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]

; CHECK: ; %bb10
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9
; CHECK: buffer_store_dword

; CHECK: [[ENDBB]]:
; CHECK-NEXT: s_endpgm
define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
bb:
%tmp = fadd float %x, 1.000000e+00
%tmp1 = fcmp olt float 0.000000e+00, %tmp
%tmp2 = select i1 %tmp1, float -1.000000e+00, float 0.000000e+00
%cmp.tmp2 = fcmp olt float %tmp2, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.tmp2)
br i1 undef, label %phibb, label %bb8

phibb:
%tmp5 = phi float [ %tmp2, %bb ], [ 4.0, %bb8 ]
%tmp6 = fcmp oeq float %tmp5, 0.000000e+00
br i1 %tmp6, label %bb10, label %end

bb8:
store volatile i32 8, i32 addrspace(1)* undef
br label %phibb

bb10:
store volatile i32 9, i32 addrspace(1)* undef
br label %end

end:
ret void
}
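
; The killing block ends in unreachable, so there is no exit block to skip
; to; exec is simply cleared without a branch around it.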
; CHECK-LABEL: {{^}}no_skip_no_successors:
; CHECK: v_cmp_nge_f32
; CHECK: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]]

; CHECK: ; %bb6
; CHECK: s_mov_b64 exec, 0

; CHECK: [[SKIPKILL]]:
; CHECK: v_cmp_nge_f32_e32 vcc
; CHECK: %bb.3: ; %bb5
; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
bb:
%tmp = fcmp ult float %arg1, 0.000000e+00
%tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
br i1 %tmp, label %bb6, label %bb3

bb3: ; preds = %bb
br i1 %tmp2, label %bb5, label %bb4

bb4: ; preds = %bb3
br i1 true, label %bb5, label %bb7

bb5: ; preds = %bb4, %bb3
unreachable

bb6: ; preds = %bb
call void @llvm.amdgcn.kill(i1 false)
unreachable

bb7: ; preds = %bb4
ret void
}
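
; A kill in one arm of an if must not break the divergent if that follows:
; exec is restored (s_or_b64) before the image sample, and the second if
; still gets its own saveexec and mask branch.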
; CHECK-LABEL: {{^}}if_after_kill_block:
; CHECK: ; %bb.0:
; CHECK: s_and_saveexec_b64
; CHECK: s_xor_b64
; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]

; CHECK: v_cmpx_gt_f32_e32 vcc, 0,
; CHECK: [[BB4]]:
; CHECK: s_or_b64 exec, exec
; CHECK: image_sample_c

; CHECK: v_cmp_neq_f32_e32 vcc, 0,
; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: s_cbranch_execz [[END]]
; CHECK-NOT: branch

; CHECK: BB{{[0-9]+_[0-9]+}}: ; %bb8
; CHECK: buffer_store_dword

; CHECK: [[END]]:
; CHECK: s_endpgm
define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
bb:
%tmp = fcmp ult float %arg1, 0.000000e+00
br i1 %tmp, label %bb3, label %bb4

bb3: ; preds = %bb
%cmp.arg = fcmp olt float %arg, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.arg)
br label %bb4

bb4: ; preds = %bb3, %bb
%tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 16, float %arg2, float %arg3, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)
%tmp6 = extractelement <4 x float> %tmp5, i32 0
%tmp7 = fcmp une float %tmp6, 0.000000e+00
br i1 %tmp7, label %bb8, label %bb9

bb8: ; preds = %bb9, %bb4
store volatile i32 9, i32 addrspace(1)* undef
ret void

bb9: ; preds = %bb4
ret void
}

declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare void @llvm.amdgcn.kill(i1) #0

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }