AMDGPU: Don't required structured CFG

The structured CFG is just an aid to inserting exec mask modification instructions, once that is done we don't really need it anymore. We also do not analyze blocks with terminators that modify exec, so this should only be impacting true branches. llvm-svn: 288744
2016-12-06 01:02:51 +00:00 · 2016-12-06 01:02:51 +00:00 · ad55ee5869
parent 9642b36e91
commit ad55ee5869
10 changed files with 107 additions and 47 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@ -162,7 +162,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())),
    IntrinsicInfo() {
-  setRequiresStructuredCFG(true);
  initAsmInfo();
 }

@ -191,7 +190,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
-  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
+  setRequiresStructuredCFG(true);
+}

 const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@ -8,13 +8,15 @@
 ; GCNNOOPT: v_writelane_b32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]

+
 ; GCN: ; BB#1
 ; GCNNOOPT: v_readlane_b32
 ; GCNNOOPT: v_readlane_b32
 ; GCN: buffer_store_dword
-; GCN: s_endpgm
+; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; TODO: This waitcnt can be eliminated

-; GCN: {{^}}[[END]]
+; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
 define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
  %cmp = icmp ne i32 %val, 0
@ -35,9 +37,10 @@ end:
 ; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]

 ; GCN: buffer_store_dword
-; GCN: s_endpgm
+; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; TODO: This waitcnt can be eliminated

-; GCN: {{^}}[[END]]
+; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
 define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
  %cmp0 = icmp ne i1 %val, 0
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@ -12,9 +12,10 @@
 ; GCN: s_cbranch_vccnz

 ; GCN: one{{$}}
-; SI:  v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
-; GCN: buffer_store_short v[[A_F16]]
-; GCN: s_endpgm
+; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
+; SI: s_branch
+; VI: buffer_store_short
+; VI: s_endpgm

 ; GCN: two{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
@ -47,17 +48,19 @@ two:
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI:  v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
 ; VI:  v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
-; GCN: s_cbranch_vccnz
+; SI: s_cbranch_vccz
+; VI: s_cbranch_vccnz

-; GCN: one{{$}}
-; VI:  v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
-; GCN: buffer_store_short v[[A_F16]]
-; GCN: s_endpgm
+; VI: one{{$}}
+; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}}

 ; GCN: two{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
-; GCN: buffer_store_short v[[B_F16]]
-; GCN: s_endpgm
+
+; SI: one{{$}}
+; SI: buffer_store_short v[[A_F16]]
+; SI: s_endpgm
+
 define void @br_cc_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b) {
@ -87,8 +90,6 @@ two:

 ; GCN: one{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
-; GCN: buffer_store_short v[[A_F16]]
-; GCN: s_endpgm

 ; GCN: two{{$}}
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@ -475,14 +475,13 @@ ret:

 ; GCN-LABEL: {{^}}long_branch_hang:
 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
-; GCN-NEXT: s_cbranch_scc1 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: s_branch  [[SHORTB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:

-; GCN-NEXT: [[LONG_BR_0]]:
 ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
 ; GCN: s_setpc_b64

-; GCN: [[SHORTB]]:
+; GCN-NEXT: [[LONG_BR_0]]:
 ; GCN-DAG: v_cmp_lt_i32
 ; GCN-DAG: v_cmp_gt_i32
 ; GCN: s_cbranch_vccnz
@ -492,7 +491,6 @@ ret:

 ; GCN: [[LONG_BR_DEST0]]
 ; GCN: v_cmp_ne_u32_e32
-; GCN-NEXT: ; implicit-def
 ; GCN-NEXT: s_cbranch_vccz
 ; GCN: s_setpc_b64

--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@ -506,11 +506,13 @@ bb:
 bb1:
  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out
  br label %bb7

 bb4:
  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out
  br label %bb7

 bb7:
@ -554,11 +556,13 @@ bb:
 bb1:                                              ; preds = %bb
  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
+  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
  br label %bb7

 bb4:                                              ; preds = %bb
  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
  %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
+  call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
  br label %bb7

 bb7:                                              ; preds = %bb4, %bb1
@ -745,6 +749,8 @@ bb8:                                              ; preds = %bb2
 }

 declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #2

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind convergent }
--- a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@ -1,8 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

-; FIXME: Enabling critical edge splitting will fix this.
-; XFAIL: *
-
 ; Make sure that m0 is not reinitialized in the loop.

 ; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
@ -12,7 +9,9 @@
 ; GCN: s_mov_b32 m0, -1

 ; GCN: BB0_2:
+; GCN-NOT: m0
 ; GCN: ds_read_b32
+; GCN-NOT: m0
 ; GCN: buffer_store_dword

 ; GCN: s_cbranch_scc0 BB0_2
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ;
 ;
 ; Most SALU instructions ignore control flow, so we need to make sure
@ -9,13 +9,54 @@
 ; about instructions in different blocks overwriting each other.
 ; SI-LABEL: {{^}}sgpr_if_else_salu_br:
 ; SI: s_add
-; SI: s_add
+; SI: s_branch
+
+; SI: s_sub

 define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
  %0 = icmp eq i32 %a, 0
  br i1 %0, label %if, label %else

+if:
+  %1 = sub i32 %b, %c
+  br label %endif
+
+else:
+  %2 = add i32 %d, %e
+  br label %endif
+
+endif:
+  %3 = phi i32 [%1, %if], [%2, %else]
+  %4 = add i32 %3, %a
+  store i32 %4, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}sgpr_if_else_salu_br_opt:
+; SI: s_cmp_lg_u32
+; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]]
+
+; SI: ; BB#1: ; %else
+; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xe
+; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xf
+; SI-NOT: add
+; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; SI: [[IF]]: ; %if
+; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-NOT: add
+
+; SI: [[ENDIF]]: ; %endif
+; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]]
+; SI: buffer_store_dword
+; SI-NEXT: s_endpgm
+define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+  %0 = icmp eq i32 %a, 0
+  br i1 %0, label %if, label %else
+
 if:
  %1 = add i32 %b, %c
  br label %endif
@ -67,7 +108,7 @@ endif:
 ; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]

-; SI: BB2_2:
+; SI: BB{{[0-9]+}}_2:
 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
 ; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s

 ; This test checks that no VGPR to SGPR copies are created by the register
 ; allocator.
@ -223,8 +223,15 @@ declare i32 @llvm.SI.packf16(float, float) #1
 ; an assertion failure.

 ; CHECK-LABEL: {{^}}sample_v3:
-; CHECK: image_sample
-; CHECK: image_sample
+; CHECK: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 11
+; CHECK: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 13
+; CHECK: s_branch
+
+; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 5
+; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 7
+
+; CHECK: BB{{[0-9]+_[0-9]+}}:
+; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}}
 ; CHECK: exp
 ; CHECK: s_endpgm
 define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
@ -241,14 +248,14 @@ entry:
  br i1 %tmp27, label %if, label %else

 if:                                               ; preds = %entry
-  %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> zeroinitializer, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 11, i32 13>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %val.if.0 = extractelement <4 x float> %val.if, i32 0
  %val.if.1 = extractelement <4 x float> %val.if, i32 1
  %val.if.2 = extractelement <4 x float> %val.if, i32 2
  br label %endif

 else:                                             ; preds = %entry
-  %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1, i32 0>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 5, i32 7>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %val.else.0 = extractelement <4 x float> %val.else, i32 0
  %val.else.1 = extractelement <4 x float> %val.else, i32 1
  %val.else.2 = extractelement <4 x float> %val.else, i32 2
@ -317,9 +324,15 @@ ENDIF69:                                          ; preds = %LOOP68

 ; This test checks that image_sample resource descriptors aren't loaded into
 ; vgprs.  The verifier will fail if this happens.
-; CHECK-LABEL:{{^}}sample_rsrc:
-; CHECK: image_sample
-; CHECK: image_sample
+; CHECK-LABEL:{{^}}sample_rsrc
+
+; CHECK: s_cmp_eq_u32
+; CHECK: s_cbranch_scc0 [[END:BB[0-9]+_[0-9]+]]
+
+; CHECK: v_add_i32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}}
+
+; [[END]]:
+; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}}
 ; CHECK: s_endpgm
 define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@ -308,10 +308,8 @@ end:
 ; CHECK: s_mov_b64 exec, 0

 ; CHECK: [[SKIPKILL]]:
-; CHECK: v_cmp_nge_f32
-; CHECK-NEXT: s_cbranch_vccz [[UNREACHABLE:BB[0-9]+_[0-9]+]]
-
-; CHECK: [[UNREACHABLE]]:
+; CHECK: v_cmp_nge_f32_e32 vcc
+; CHECK-NEXT: BB#3: ; %bb5
 ; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
 define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
 bb:
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@ -197,15 +197,15 @@ if.end:                                           ; preds = %if.else, %if.then
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 ; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]

-; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN: buffer_store_dword [[TWO]]
+; GCN: v_mov_b32_e32 [[IMM_REG:v[0-9]+]], 2
 ; GCN: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]

 ; GCN: [[IF_LABEL]]:
-; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: buffer_store_dword [[ONE]]
+; GCN-NEXT: v_mov_b32_e32 [[IMM_REG]], 1
+
+; GCN-NEXT: [[ENDIF_LABEL]]:
+; GCN: buffer_store_dword [[IMM_REG]]

-; GCN: [[ENDIF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
 ; GCN: buffer_store_dword [[THREE]]
 ; GCN: s_endpgm