AMDGPU: Don't required structured CFG

The structured CFG is just an aid to inserting exec
mask modification instructions, once that is done
we don't really need it anymore. We also
do not analyze blocks with terminators that
modify exec, so this should only be impacting
true branches.

llvm-svn: 288744
This commit is contained in:
Matt Arsenault 2016-12-06 01:02:51 +00:00
parent 9642b36e91
commit ad55ee5869
10 changed files with 107 additions and 47 deletions

View File

@ -162,7 +162,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
TLOF(createTLOF(getTargetTriple())),
IntrinsicInfo() {
setRequiresStructuredCFG(true);
initAsmInfo();
}
@ -191,7 +190,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
TargetOptions Options,
Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
setRequiresStructuredCFG(true);
}
const R600Subtarget *R600TargetMachine::getSubtargetImpl(
const Function &F) const {

View File

@ -8,13 +8,15 @@
; GCNNOOPT: v_writelane_b32
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
; GCN: ; BB#1
; GCNNOOPT: v_readlane_b32
; GCNNOOPT: v_readlane_b32
; GCN: buffer_store_dword
; GCN: s_endpgm
; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; TODO: This waitcnt can be eliminated
; GCN: {{^}}[[END]]
; GCN: {{^}}[[END]]:
; GCN: s_endpgm
define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
%cmp = icmp ne i32 %val, 0
@ -35,9 +37,10 @@ end:
; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
; GCN: buffer_store_dword
; GCN: s_endpgm
; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; TODO: This waitcnt can be eliminated
; GCN: {{^}}[[END]]
; GCN: {{^}}[[END]]:
; GCN: s_endpgm
define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
%cmp0 = icmp ne i1 %val, 0

View File

@ -12,9 +12,10 @@
; GCN: s_cbranch_vccnz
; GCN: one{{$}}
; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[A_F16]]
; GCN: s_endpgm
; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
; SI: s_branch
; VI: buffer_store_short
; VI: s_endpgm
; GCN: two{{$}}
; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
@ -47,17 +48,19 @@ two:
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: s_cbranch_vccnz
; SI: s_cbranch_vccz
; VI: s_cbranch_vccnz
; GCN: one{{$}}
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
; GCN: buffer_store_short v[[A_F16]]
; GCN: s_endpgm
; VI: one{{$}}
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}}
; GCN: two{{$}}
; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
; GCN: buffer_store_short v[[B_F16]]
; GCN: s_endpgm
; SI: one{{$}}
; SI: buffer_store_short v[[A_F16]]
; SI: s_endpgm
define void @br_cc_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
@ -87,8 +90,6 @@ two:
; GCN: one{{$}}
; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[A_F16]]
; GCN: s_endpgm
; GCN: two{{$}}
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}

View File

@ -475,14 +475,13 @@ ret:
; GCN-LABEL: {{^}}long_branch_hang:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
; GCN-NEXT: s_cbranch_scc1 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_branch [[SHORTB:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
; GCN-NEXT: [[LONG_BR_0]]:
; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
; GCN: s_setpc_b64
; GCN: [[SHORTB]]:
; GCN-NEXT: [[LONG_BR_0]]:
; GCN-DAG: v_cmp_lt_i32
; GCN-DAG: v_cmp_gt_i32
; GCN: s_cbranch_vccnz
@ -492,7 +491,6 @@ ret:
; GCN: [[LONG_BR_DEST0]]
; GCN: v_cmp_ne_u32_e32
; GCN-NEXT: ; implicit-def
; GCN-NEXT: s_cbranch_vccz
; GCN: s_setpc_b64

View File

@ -506,11 +506,13 @@ bb:
bb1:
%tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
%tmp3 = extractelement <4 x float> %tmp2, i32 undef
call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out
br label %bb7
bb4:
%tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
%tmp6 = extractelement <4 x float> %tmp5, i32 undef
call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out
br label %bb7
bb7:
@ -554,11 +556,13 @@ bb:
bb1: ; preds = %bb
%tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
%tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
br label %bb7
bb4: ; preds = %bb
%tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
%tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
br label %bb7
bb7: ; preds = %bb4, %bb1
@ -745,6 +749,8 @@ bb8: ; preds = %bb2
}
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare void @llvm.amdgcn.s.barrier() #2
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind convergent }

View File

@ -1,8 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; FIXME: Enabling critical edge splitting will fix this.
; XFAIL: *
; Make sure that m0 is not reinitialized in the loop.
; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
@ -12,7 +9,9 @@
; GCN: s_mov_b32 m0, -1
; GCN: BB0_2:
; GCN-NOT: m0
; GCN: ds_read_b32
; GCN-NOT: m0
; GCN: buffer_store_dword
; GCN: s_cbranch_scc0 BB0_2

View File

@ -1,4 +1,4 @@
; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
;
;
; Most SALU instructions ignore control flow, so we need to make sure
@ -9,13 +9,54 @@
; about instructions in different blocks overwriting each other.
; SI-LABEL: {{^}}sgpr_if_else_salu_br:
; SI: s_add
; SI: s_add
; SI: s_branch
; SI: s_sub
define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
%0 = icmp eq i32 %a, 0
br i1 %0, label %if, label %else
if:
%1 = sub i32 %b, %c
br label %endif
else:
%2 = add i32 %d, %e
br label %endif
endif:
%3 = phi i32 [%1, %if], [%2, %else]
%4 = add i32 %3, %a
store i32 %4, i32 addrspace(1)* %out
ret void
}
; SI-LABEL: {{^}}sgpr_if_else_salu_br_opt:
; SI: s_cmp_lg_u32
; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]]
; SI: ; BB#1: ; %else
; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xe
; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xf
; SI-NOT: add
; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
; SI: [[IF]]: ; %if
; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-NOT: add
; SI: [[ENDIF]]: ; %endif
; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]]
; SI: buffer_store_dword
; SI-NEXT: s_endpgm
define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
%0 = icmp eq i32 %a, 0
br i1 %0, label %if, label %else
if:
%1 = add i32 %b, %c
br label %endif
@ -67,7 +108,7 @@ endif:
; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
; SI: BB2_2:
; SI: BB{{[0-9]+}}_2:
; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]

View File

@ -1,5 +1,5 @@
; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; This test checks that no VGPR to SGPR copies are created by the register
; allocator.
@ -223,8 +223,15 @@ declare i32 @llvm.SI.packf16(float, float) #1
; an assertion failure.
; CHECK-LABEL: {{^}}sample_v3:
; CHECK: image_sample
; CHECK: image_sample
; CHECK: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 11
; CHECK: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 13
; CHECK: s_branch
; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 5
; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 7
; CHECK: BB{{[0-9]+_[0-9]+}}:
; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}}
; CHECK: exp
; CHECK: s_endpgm
define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
@ -241,14 +248,14 @@ entry:
br i1 %tmp27, label %if, label %else
if: ; preds = %entry
%val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> zeroinitializer, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 11, i32 13>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%val.if.0 = extractelement <4 x float> %val.if, i32 0
%val.if.1 = extractelement <4 x float> %val.if, i32 1
%val.if.2 = extractelement <4 x float> %val.if, i32 2
br label %endif
else: ; preds = %entry
%val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1, i32 0>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 5, i32 7>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%val.else.0 = extractelement <4 x float> %val.else, i32 0
%val.else.1 = extractelement <4 x float> %val.else, i32 1
%val.else.2 = extractelement <4 x float> %val.else, i32 2
@ -317,9 +324,15 @@ ENDIF69: ; preds = %LOOP68
; This test checks that image_sample resource descriptors aren't loaded into
; vgprs. The verifier will fail if this happens.
; CHECK-LABEL:{{^}}sample_rsrc:
; CHECK: image_sample
; CHECK: image_sample
; CHECK-LABEL:{{^}}sample_rsrc
; CHECK: s_cmp_eq_u32
; CHECK: s_cbranch_scc0 [[END:BB[0-9]+_[0-9]+]]
; CHECK: v_add_i32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}}
; [[END]]:
; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}}
; CHECK: s_endpgm
define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
bb:

View File

@ -308,10 +308,8 @@ end:
; CHECK: s_mov_b64 exec, 0
; CHECK: [[SKIPKILL]]:
; CHECK: v_cmp_nge_f32
; CHECK-NEXT: s_cbranch_vccz [[UNREACHABLE:BB[0-9]+_[0-9]+]]
; CHECK: [[UNREACHABLE]]:
; CHECK: v_cmp_nge_f32_e32 vcc
; CHECK-NEXT: BB#3: ; %bb5
; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
bb:

View File

@ -197,15 +197,15 @@ if.end: ; preds = %if.else, %if.then
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN: buffer_store_dword [[TWO]]
; GCN: v_mov_b32_e32 [[IMM_REG:v[0-9]+]], 2
; GCN: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
; GCN: [[IF_LABEL]]:
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: buffer_store_dword [[ONE]]
; GCN-NEXT: v_mov_b32_e32 [[IMM_REG]], 1
; GCN-NEXT: [[ENDIF_LABEL]]:
; GCN: buffer_store_dword [[IMM_REG]]
; GCN: [[ENDIF_LABEL]]:
; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
; GCN: buffer_store_dword [[THREE]]
; GCN: s_endpgm