From d8f651d3e8e2a49730a18926eb2325b7793638f8 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Tue, 9 Jun 2020 13:14:15 +0530 Subject: [PATCH] [AMDGPU] Enable structurizer workarounds by default Reviewed By: nhaehnle Differential Revision: https://reviews.llvm.org/D81211 --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 26 +-- llvm/test/CodeGen/AMDGPU/multilevel-break.ll | 57 +++--- .../CodeGen/AMDGPU/nested-loop-conditions.ll | 36 ++-- llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll | 166 ++++++++++++------ 5 files changed, 167 insertions(+), 120 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4ec903e37653..36c0096a47fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -195,7 +195,7 @@ static cl::opt EnableScalarIRPasses( static cl::opt EnableStructurizerWorkarounds( "amdgpu-enable-structurizer-workarounds", - cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(false), + cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 6a759e7c1122..b2acc37493e4 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -21,7 +21,6 @@ define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) { ; IR: loop: ; IR-NEXT: store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4 ; IR-NEXT: br label [[LOOP]] -; entry: br label %loop @@ -59,7 +58,6 @@ define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) { ; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void -; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond = icmp eq i32 %tmp, 1 @@ -119,7 +117,6 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) { ; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]] ; IR: DummyReturnBlock: ; IR-NEXT: ret void -; entry: br i1 undef, label %loop1, label %loop2 @@ -140,33 +137,29 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) { ; SI-NEXT: s_cbranch_execz BB3_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 -; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: BB3_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 ; SI-NEXT: ; Child Loop BB3_3 Depth 2 -; SI-NEXT: s_and_b64 s[8:9], exec, vcc -; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: BB3_3: ; %inner_loop ; SI-NEXT: ; Parent Loop BB3_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 -; SI-NEXT: s_and_b64 s[10:11], exec, s[0:1] -; SI-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] +; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz BB3_3 -; SI-NEXT: ; %bb.4: ; %Flow -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; 
SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz BB3_2 +; SI-NEXT: s_cbranch_execnz BB3_3 +; SI-NEXT: ; %bb.4: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_and_b64 vcc, exec, 0 +; SI-NEXT: s_cbranch_vccz BB3_2 ; SI-NEXT: BB3_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( @@ -184,7 +177,6 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) { ; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]] ; IR: UnifiedReturnBlock: ; IR-NEXT: ret void -; entry: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %cond1 = icmp eq i32 %tmp, 1 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 932e6ce11045..c34ae7a99df5 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -46,52 +46,47 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) { ; ; GCN-LABEL: multi_else_break: ; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: s_mov_b64 s[2:3], 0 +; GCN-NEXT: s_mov_b64 s[0:1], 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_branch BB0_2 -; GCN-NEXT: BB0_1: ; %Flow2 +; GCN-NEXT: BB0_1: ; %loop.exit.guard ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_and_b64 s[0:1], exec, s[8:9] -; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN-NEXT: s_and_b64 s[2:3], exec, s[2:3] +; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN-NEXT: s_cbranch_execz BB0_6 ; GCN-NEXT: BB0_2: ; %LOOP.outer ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_4 Depth 2 ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GCN-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_branch BB0_4 ; GCN-NEXT: BB0_3: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 -; GCN-NEXT: s_or_b64 exec, exec, s[10:11] -; GCN-NEXT: s_and_b64 s[0:1], exec, s[6:7] -; GCN-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7] +; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execz BB0_1 ; GCN-NEXT: BB0_4: ; %LOOP ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v2 -; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v2, v4 -; GCN-NEXT: s_or_b64 s[8:9], s[8:9], exec +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4 +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], exec ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: s_cbranch_execz BB0_3 ; GCN-NEXT: ; %bb.5: ; %ENDIF ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v1 -; GCN-NEXT: s_andn2_b64 s[8:9], s[8:9], exec +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, v5, v0 +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_and_b64 s[12:13], vcc, exec -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[0:1] -; 
GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_and_b64 s[10:11], vcc, exec +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GCN-NEXT: s_branch BB0_3 ; GCN-NEXT: BB0_6: ; %IF ; GCN-NEXT: s_endpgm @@ -204,7 +199,10 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; GCN-NEXT: s_mov_b64 s[6:7], -1 ; GCN-NEXT: s_and_b64 vcc, exec, vcc +; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GCN-NEXT: s_mov_b64 s[10:11], -1 ; GCN-NEXT: s_cbranch_vccnz BB1_6 ; GCN-NEXT: ; %bb.3: ; %LeafBlock1 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -223,15 +221,11 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; GCN-NEXT: BB1_5: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_mov_b64 s[10:11], 0 +; GCN-NEXT: BB1_6: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_and_b64 vcc, exec, s[10:11] ; GCN-NEXT: s_cbranch_vccz BB1_1 -; GCN-NEXT: s_branch BB1_7 -; GCN-NEXT: BB1_6: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: s_mov_b64 s[8:9], 0 -; GCN-NEXT: s_mov_b64 s[6:7], -1 -; GCN-NEXT: s_and_b64 vcc, exec, -1 -; GCN-NEXT: s_cbranch_execz BB1_1 -; GCN-NEXT: BB1_7: ; %LeafBlock +; GCN-NEXT: ; %bb.7: ; %LeafBlock ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-NEXT: s_and_b64 vcc, exec, vcc @@ -247,9 +241,10 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_and_b64 s[10:11], vcc, exec ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GCN-NEXT: s_branch BB1_1 -; GCN-NEXT: BB1_9: ; %Flow6 +; GCN-NEXT: BB1_9: ; %loop.exit.guard ; GCN-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[0:1], s[4:5] +; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index 66847f146bd3..8bdc05bafacd 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -14,28 +14,36 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_mov_b32 m0, -1 +; GCN-NEXT: s_mov_b64 s[2:3], -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_and_b64 vcc, exec, 0 -; GCN-NEXT: BB0_1: ; %bb5 +; GCN-NEXT: s_and_b64 s[0:1], exec, -1 +; GCN-NEXT: s_branch BB0_2 +; GCN-NEXT: BB0_1: ; %bb10 +; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN-NEXT: s_cbranch_vccz BB0_4 +; GCN-NEXT: BB0_2: ; %bb5 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_cmp_lg_u32 s0, 1 -; GCN-NEXT: s_cbranch_scc0 BB0_3 -; GCN-NEXT: ; %bb.2: ; %bb10 -; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; GCN-NEXT: ; implicit-def: $sgpr0 +; GCN-NEXT: s_mov_b64 vcc, s[0:1] ; GCN-NEXT: s_cbranch_vccnz BB0_1 -; GCN-NEXT: s_branch BB0_5 -; GCN-NEXT: BB0_3: ; %bb8 +; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1 +; GCN-NEXT: s_mov_b64 s[4:5], -1 +; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN-NEXT: s_cbranch_vccnz BB0_2 +; GCN-NEXT: BB0_4: ; %loop.exit.guard +; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_cbranch_vccz BB0_7 +; GCN-NEXT: ; %bb.5: ; %bb8 ; GCN-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NEXT: ds_read_b32 v0, v0 ; GCN-NEXT: s_and_b64 vcc, exec, 0 -; GCN-NEXT: BB0_4: ; %bb9 +; GCN-NEXT: BB0_6: ; %bb9 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: s_cbranch_vccz BB0_4 -; GCN-NEXT: BB0_5: ; %DummyReturnBlock +; GCN-NEXT: s_cbranch_vccz BB0_6 +; GCN-NEXT: BB0_7: ; %DummyReturnBlock ; GCN-NEXT: s_endpgm ; IR-LABEL: @reduced_nested_loop_conditions( ; IR-NEXT: bb: @@ -84,7 +92,6 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca ; IR: bb23: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void -; bb: %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %my.tmp @@ -268,7 +275,6 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef ; IR-NEXT: ret void -; bb: %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %my.tmp1 = zext i32 %my.tmp to i64 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index fd3d3857404f..7387e98ae864 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -166,45 +166,72 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SI-NEXT: s_load_dword s4, s[0:1], 0xc -; SI-NEXT: s_brev_b32 s5, 44 +; SI-NEXT: s_load_dword s8, s[0:1], 0xc +; SI-NEXT: s_brev_b32 s9, 44 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, 0 -; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], s3, 4 -; SI-NEXT: s_or_b64 s[8:9], s[0:1], s[2:3] -; SI-NEXT: s_and_b64 s[0:1], exec, s[2:3] -; SI-NEXT: s_and_b64 s[2:3], exec, s[8:9] +; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s3, 3 +; SI-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; SI-NEXT: s_and_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_f32_e64 s[8:9], |v0|, s5 +; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s9 +; SI-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, 3 -; SI-NEXT: BB3_1: ; %while.cond -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_mov_b64 vcc, s[0:1] -; SI-NEXT: s_cbranch_vccz BB3_5 -; SI-NEXT: ; %bb.2: ; %convex.exit -; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1 -; SI-NEXT: s_mov_b64 vcc, s[2:3] +; SI-NEXT: s_branch BB3_4 +; SI-NEXT: BB3_1: ; %Flow6 +; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; SI-NEXT: s_mov_b64 s[10:11], 0 +; SI-NEXT: BB3_2: ; %Flow5 +; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; SI-NEXT: s_mov_b64 s[14:15], 0 +; SI-NEXT: BB3_3: ; %Flow +; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[12:13] ; SI-NEXT: s_cbranch_vccnz BB3_8 -; SI-NEXT: ; %bb.3: ; %if.end -; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1 -; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccnz BB3_1 -; SI-NEXT: ; %bb.4: ; %if.else -; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; SI-NEXT: BB3_4: ; %while.cond +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_mov_b64 s[14:15], -1 +; SI-NEXT: s_mov_b64 s[10:11], -1 +; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz BB3_3 +; SI-NEXT: ; %bb.5: ; %convex.exit +; SI-NEXT: 
; in Loop: Header=BB3_4 Depth=1 +; SI-NEXT: s_mov_b64 s[10:11], -1 +; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 vcc, s[2:3] +; SI-NEXT: s_cbranch_vccz BB3_2 +; SI-NEXT: ; %bb.6: ; %if.end +; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 vcc, s[4:5] +; SI-NEXT: s_cbranch_vccz BB3_1 +; SI-NEXT: ; %bb.7: ; %if.else +; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; SI-NEXT: s_mov_b64 s[12:13], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_branch BB3_1 -; SI-NEXT: BB3_5: ; %for.cond.preheader +; SI-NEXT: BB3_8: ; %loop.exit.guard4 +; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[10:11] +; SI-NEXT: s_cbranch_vccz BB3_4 +; SI-NEXT: ; %bb.9: ; %loop.exit.guard +; SI-NEXT: s_and_b64 vcc, exec, s[14:15] +; SI-NEXT: s_cbranch_vccz BB3_13 +; SI-NEXT: ; %bb.10: ; %for.cond.preheader ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e8 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, s8, v0 ; SI-NEXT: s_and_b64 vcc, exec, vcc -; SI-NEXT: s_cbranch_vccz BB3_8 -; SI-NEXT: ; %bb.6: ; %for.body +; SI-NEXT: s_cbranch_vccz BB3_13 +; SI-NEXT: ; %bb.11: ; %for.body ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: BB3_7: ; %self.loop +; SI-NEXT: BB3_12: ; %self.loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_cbranch_vccz BB3_7 -; SI-NEXT: BB3_8: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz BB3_12 +; SI-NEXT: BB3_13: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: loop_land_info_assert: @@ -213,44 +240,71 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s4, s[0:1], 0x30 -; FLAT-NEXT: s_brev_b32 s5, 44 +; FLAT-NEXT: s_load_dword s8, s[0:1], 0x30 +; FLAT-NEXT: s_brev_b32 s9, 44 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, 0 -; FLAT-NEXT: v_cmp_lt_i32_e64 s[2:3], s3, 4 -; FLAT-NEXT: s_or_b64 s[8:9], s[0:1], s[2:3] -; FLAT-NEXT: s_and_b64 s[0:1], exec, s[2:3] -; FLAT-NEXT: s_and_b64 s[2:3], exec, s[8:9] +; FLAT-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1 +; FLAT-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4 +; FLAT-NEXT: v_cmp_gt_i32_e64 s[2:3], s3, 3 +; FLAT-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] +; FLAT-NEXT: s_and_b64 s[2:3], exec, s[2:3] ; FLAT-NEXT: s_waitcnt vmcnt(0) -; FLAT-NEXT: v_cmp_lt_f32_e64 s[8:9], |v0|, s5 +; FLAT-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s9 +; FLAT-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; FLAT-NEXT: v_mov_b32_e32 v0, 3 -; FLAT-NEXT: BB3_1: ; %while.cond -; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_mov_b64 vcc, s[0:1] -; FLAT-NEXT: s_cbranch_vccz BB3_5 -; FLAT-NEXT: ; %bb.2: ; %convex.exit -; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1 -; FLAT-NEXT: s_mov_b64 vcc, s[2:3] +; FLAT-NEXT: s_branch BB3_4 +; FLAT-NEXT: BB3_1: ; %Flow6 +; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; FLAT-NEXT: s_mov_b64 s[10:11], 0 +; FLAT-NEXT: BB3_2: ; %Flow5 +; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; FLAT-NEXT: s_mov_b64 s[14:15], 0 +; FLAT-NEXT: BB3_3: ; %Flow +; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; FLAT-NEXT: s_and_b64 vcc, exec, s[12:13] ; FLAT-NEXT: s_cbranch_vccnz BB3_8 -; FLAT-NEXT: ; %bb.3: ; %if.end -; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1 -; FLAT-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; FLAT-NEXT: s_cbranch_vccnz BB3_1 -; FLAT-NEXT: ; %bb.4: ; 
%if.else -; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1 +; FLAT-NEXT: BB3_4: ; %while.cond +; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLAT-NEXT: s_mov_b64 s[14:15], -1 +; FLAT-NEXT: s_mov_b64 s[10:11], -1 +; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 vcc, s[0:1] +; FLAT-NEXT: s_cbranch_vccz BB3_3 +; FLAT-NEXT: ; %bb.5: ; %convex.exit +; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; FLAT-NEXT: s_mov_b64 s[10:11], -1 +; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 vcc, s[2:3] +; FLAT-NEXT: s_cbranch_vccz BB3_2 +; FLAT-NEXT: ; %bb.6: ; %if.end +; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 vcc, s[4:5] +; FLAT-NEXT: s_cbranch_vccz BB3_1 +; FLAT-NEXT: ; %bb.7: ; %if.else +; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; FLAT-NEXT: s_mov_b64 s[12:13], 0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_branch BB3_1 -; FLAT-NEXT: BB3_5: ; %for.cond.preheader +; FLAT-NEXT: BB3_8: ; %loop.exit.guard4 +; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; FLAT-NEXT: s_and_b64 vcc, exec, s[10:11] +; FLAT-NEXT: s_cbranch_vccz BB3_4 +; FLAT-NEXT: ; %bb.9: ; %loop.exit.guard +; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15] +; FLAT-NEXT: s_cbranch_vccz BB3_13 +; FLAT-NEXT: ; %bb.10: ; %for.cond.preheader ; FLAT-NEXT: v_mov_b32_e32 v0, 0x3e8 -; FLAT-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; FLAT-NEXT: v_cmp_lt_i32_e32 vcc, s8, v0 ; FLAT-NEXT: s_and_b64 vcc, exec, vcc -; FLAT-NEXT: s_cbranch_vccz BB3_8 -; FLAT-NEXT: ; %bb.6: ; %for.body +; FLAT-NEXT: s_cbranch_vccz BB3_13 +; FLAT-NEXT: ; %bb.11: ; %for.body ; FLAT-NEXT: s_and_b64 vcc, exec, 0 -; FLAT-NEXT: BB3_7: ; %self.loop +; FLAT-NEXT: BB3_12: ; %self.loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_cbranch_vccz BB3_7 -; FLAT-NEXT: BB3_8: ; %DummyReturnBlock +; FLAT-NEXT: s_cbranch_vccz BB3_12 +; FLAT-NEXT: BB3_13: ; %DummyReturnBlock ; FLAT-NEXT: s_endpgm entry: %cmp = icmp sgt i32 %c0, 0
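
Note on opting out: the change above only flips the default of the hidden
cl::opt<bool> flag, so the previous StructurizeCFG behavior remains
reachable from the command line without rebuilding the compiler. A minimal
example invocation (not a RUN line from this patch; standard cl::opt
boolean syntax is assumed):

  llc -march=amdgcn -verify-machineinstrs \
      -amdgpu-enable-structurizer-workarounds=0 < input.ll

Passing =0 (or =false) restores the pre-patch default; omitting the flag
now enables the workarounds, which is what the updated test expectations
in this patch reflect.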