diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 7287b56aa6da..e1fd95d09174 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -162,7 +162,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), TLOF(createTLOF(getTargetTriple())), IntrinsicInfo() { - setRequiresStructuredCFG(true); initAsmInfo(); } @@ -191,7 +190,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, TargetOptions Options, Optional RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { + setRequiresStructuredCFG(true); +} const R600Subtarget *R600TargetMachine::getSubtargetImpl( const Function &F) const { diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll index 83313ed5327c..24874ee7fa98 100644 --- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -8,13 +8,15 @@ ; GCNNOOPT: v_writelane_b32 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] + ; GCN: ; BB#1 ; GCNNOOPT: v_readlane_b32 ; GCNNOOPT: v_readlane_b32 ; GCN: buffer_store_dword -; GCN: s_endpgm +; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; TODO: This waitcnt can be eliminated -; GCN: {{^}}[[END]] +; GCN: {{^}}[[END]]: ; GCN: s_endpgm define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 { %cmp = icmp ne i32 %val, 0 @@ -35,9 +37,10 @@ end: ; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]] ; GCN: buffer_store_dword -; GCN: s_endpgm +; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; TODO: This waitcnt can be eliminated -; GCN: {{^}}[[END]] +; GCN: {{^}}[[END]]: ; GCN: s_endpgm define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { %cmp0 = icmp ne i1 %val, 0 diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index 6cf3fdad3e3f..970260412c44 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -12,9 +12,10 @@ ; GCN: s_cbranch_vccnz ; GCN: one{{$}} -; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]] -; GCN: buffer_store_short v[[A_F16]] -; GCN: s_endpgm +; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]] +; SI: s_branch +; VI: buffer_store_short +; VI: s_endpgm ; GCN: two{{$}} ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] @@ -47,17 +48,19 @@ two: ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]] ; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]] -; GCN: s_cbranch_vccnz +; SI: s_cbranch_vccz +; VI: s_cbranch_vccnz -; GCN: one{{$}} -; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}} -; GCN: buffer_store_short v[[A_F16]] -; GCN: s_endpgm +; VI: one{{$}} +; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}} ; GCN: two{{$}} ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] -; GCN: buffer_store_short v[[B_F16]] -; GCN: s_endpgm + +; SI: one{{$}} +; SI: buffer_store_short v[[A_F16]] +; SI: s_endpgm + define void @br_cc_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { @@ -87,8 +90,6 @@ two: ; GCN: one{{$}} ; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]] -; GCN: buffer_store_short v[[A_F16]] -; GCN: s_endpgm ; GCN: two{{$}} ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 92debd8b927d..39505404a868 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -475,14 +475,13 @@ ret: ; GCN-LABEL: {{^}}long_branch_hang: ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6 -; GCN-NEXT: s_cbranch_scc1 [[LONG_BR_0:BB[0-9]+_[0-9]+]] -; GCN-NEXT: s_branch [[SHORTB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]] +; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: -; GCN-NEXT: [[LONG_BR_0]]: ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-( ; GCN: s_setpc_b64 -; GCN: [[SHORTB]]: +; GCN-NEXT: [[LONG_BR_0]]: ; GCN-DAG: v_cmp_lt_i32 ; GCN-DAG: v_cmp_gt_i32 ; GCN: s_cbranch_vccnz @@ -492,7 +491,6 @@ ret: ; GCN: [[LONG_BR_DEST0]] ; GCN: v_cmp_ne_u32_e32 -; GCN-NEXT: ; implicit-def ; GCN-NEXT: s_cbranch_vccz ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 3a933306c64e..528e12b76ce0 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -506,11 +506,13 @@ bb: bb1: %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef %tmp3 = extractelement <4 x float> %tmp2, i32 undef + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out br label %bb7 bb4: %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef %tmp6 = extractelement <4 x float> %tmp5, i32 undef + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out br label %bb7 bb7: @@ -554,11 +556,13 @@ bb: bb1: ; preds = %bb %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out br label %bb7 bb4: ; preds = %bb %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef + call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out br label %bb7 bb7: ; preds = %bb4, %bb1 @@ -745,6 +749,8 @@ bb8: ; preds = %bb2 } declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare void @llvm.amdgcn.s.barrier() #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind convergent } diff --git a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll index 8dbec18dbf2b..078d6330ce04 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll @@ -1,8 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; FIXME: Enabling critical edge splitting will fix this. -; XFAIL: * - ; Make sure that m0 is not reinitialized in the loop. ; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init: @@ -12,7 +9,9 @@ ; GCN: s_mov_b32 m0, -1 ; GCN: BB0_2: +; GCN-NOT: m0 ; GCN: ds_read_b32 +; GCN-NOT: m0 ; GCN: buffer_store_dword ; GCN: s_cbranch_scc0 BB0_2 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index bb3f94914359..d5d2f6b717f9 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; ; ; Most SALU instructions ignore control flow, so we need to make sure @@ -9,13 +9,54 @@ ; about instructions in different blocks overwriting each other. ; SI-LABEL: {{^}}sgpr_if_else_salu_br: ; SI: s_add -; SI: s_add +; SI: s_branch + +; SI: s_sub define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else +if: + %1 = sub i32 %b, %c + br label %endif + +else: + %2 = add i32 %d, %e + br label %endif + +endif: + %3 = phi i32 [%1, %if], [%2, %else] + %4 = add i32 %3, %a + store i32 %4, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}sgpr_if_else_salu_br_opt: +; SI: s_cmp_lg_u32 +; SI: s_cbranch_scc0 [[IF:BB[0-9]+_[0-9]+]] + +; SI: ; BB#1: ; %else +; SI: s_load_dword [[LOAD0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xe +; SI: s_load_dword [[LOAD1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xf +; SI-NOT: add +; SI: s_branch [[ENDIF:BB[0-9]+_[0-9]+]] + +; SI: [[IF]]: ; %if +; SI: s_load_dword [[LOAD0]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_load_dword [[LOAD1]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-NOT: add + +; SI: [[ENDIF]]: ; %endif +; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]] +; SI: buffer_store_dword +; SI-NEXT: s_endpgm +define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +entry: + %0 = icmp eq i32 %a, 0 + br i1 %0, label %if, label %else + if: %1 = add i32 %b, %c br label %endif @@ -67,7 +108,7 @@ endif: ; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]] ; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]] -; SI: BB2_2: +; SI: BB{{[0-9]+}}_2: ; SI: buffer_load_dword [[AVAL:v[0-9]+]] ; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] ; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll index da270c533ece..e65f1e2da570 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; This test checks that no VGPR to SGPR copies are created by the register ; allocator. @@ -223,8 +223,15 @@ declare i32 @llvm.SI.packf16(float, float) #1 ; an assertion failure. ; CHECK-LABEL: {{^}}sample_v3: -; CHECK: image_sample -; CHECK: image_sample +; CHECK: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 11 +; CHECK: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 13 +; CHECK: s_branch + +; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 5 +; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 7 + +; CHECK: BB{{[0-9]+_[0-9]+}}: +; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}} ; CHECK: exp ; CHECK: s_endpgm define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { @@ -241,14 +248,14 @@ entry: br i1 %tmp27, label %if, label %else if: ; preds = %entry - %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> zeroinitializer, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %val.if.0 = extractelement <4 x float> %val.if, i32 0 %val.if.1 = extractelement <4 x float> %val.if, i32 1 %val.if.2 = extractelement <4 x float> %val.if, i32 2 br label %endif else: ; preds = %entry - %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %val.else.0 = extractelement <4 x float> %val.else, i32 0 %val.else.1 = extractelement <4 x float> %val.else, i32 1 %val.else.2 = extractelement <4 x float> %val.else, i32 2 @@ -317,9 +324,15 @@ ENDIF69: ; preds = %LOOP68 ; This test checks that image_sample resource descriptors aren't loaded into ; vgprs. The verifier will fail if this happens. -; CHECK-LABEL:{{^}}sample_rsrc: -; CHECK: image_sample -; CHECK: image_sample +; CHECK-LABEL:{{^}}sample_rsrc + +; CHECK: s_cmp_eq_u32 +; CHECK: s_cbranch_scc0 [[END:BB[0-9]+_[0-9]+]] + +; CHECK: v_add_i32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}} + +; [[END]]: +; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}} ; CHECK: s_endpgm define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index d458faa818b8..33f5e98fcc7e 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -308,10 +308,8 @@ end: ; CHECK: s_mov_b64 exec, 0 ; CHECK: [[SKIPKILL]]: -; CHECK: v_cmp_nge_f32 -; CHECK-NEXT: s_cbranch_vccz [[UNREACHABLE:BB[0-9]+_[0-9]+]] - -; CHECK: [[UNREACHABLE]]: +; CHECK: v_cmp_nge_f32_e32 vcc +; CHECK-NEXT: BB#3: ; %bb5 ; CHECK-NEXT: .Lfunc_end{{[0-9]+}} define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index a5d1cd2281c5..a0060bd368be 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -197,15 +197,15 @@ if.end: ; preds = %if.else, %if.then ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 ; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]] -; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN: buffer_store_dword [[TWO]] +; GCN: v_mov_b32_e32 [[IMM_REG:v[0-9]+]], 2 ; GCN: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]] ; GCN: [[IF_LABEL]]: -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: buffer_store_dword [[ONE]] +; GCN-NEXT: v_mov_b32_e32 [[IMM_REG]], 1 + +; GCN-NEXT: [[ENDIF_LABEL]]: +; GCN: buffer_store_dword [[IMM_REG]] -; GCN: [[ENDIF_LABEL]]: ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 ; GCN: buffer_store_dword [[THREE]] ; GCN: s_endpgm