llvm-project/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s

; GCN-LABEL: {{^}}test_loop:
; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
; GCN: ds_read_b32
; GCN: ds_write_b32
; GCN: s_branch [[LABEL]]
; GCN: s_endpgm
define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
  %cmp = icmp eq i32 %n, -1
  br i1 %cmp, label %for.exit, label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br label %for.body
}

; GCN-LABEL: @loop_const_true
; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
; GCN: ds_read_b32
; GCN: ds_write_b32
; GCN: s_branch [[LABEL]]
define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
  br label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 true, label %for.body, label %for.exit
}

; GCN-LABEL: {{^}}loop_const_false:
; GCN-NOT: s_branch
; GCN: s_endpgm
define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
  br label %for.body

for.exit:
  ret void

; XXX - Should there be an S_ENDPGM?
for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 false, label %for.body, label %for.exit
}

; GCN-LABEL: {{^}}loop_const_undef:
; GCN-NOT: s_branch
; GCN: s_endpgm
define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
  br label %for.body

for.exit:
  ret void

; XXX - Should there be an s_endpgm?
for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 undef, label %for.body, label %for.exit
}

; GCN-LABEL: {{^}}loop_arg_0:
; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
; GCN: v_cmp_eq_u32{{[^,]*}}, 1,

; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]
; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80
; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 4

; GCN: s_cbranch_vccnz [[LOOPBB]]
; GCN-NEXT: ; %bb.2
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
entry:
  br label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 %cond, label %for.body, label %for.exit
}
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s`
			`; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s`

			`; GCN-LABEL: {{^}}test_loop:`
Codegen: Make chains from trellis-shaped CFGs Lay out trellis-shaped CFGs optimally. A trellis of the shape below: A B \|\ /\| \| \ / \| \| X \| \| / \ \| \|/ \\| C D would be laid out A; B->C ; D by the current layout algorithm. Now we identify trellises and lay them out either A->C; B->D or A->D; B->C. This scales with an increasing number of predecessors. A trellis is a a group of 2 or more predecessor blocks that all have the same successors. because of this we can tail duplicate to extend existing trellises. As an example consider the following CFG: B D F H / \ / \ / \ / \ A---C---E---G---Ret Where A,C,E,G are all small (Currently 2 instructions). The CFG preserving layout is then A,B,C,D,E,F,G,H,Ret. The current code will copy C into B, E into D and G into F and yield the layout A,C,B(C),E,D(E),F(G),G,H,ret define void @straight_test(i32 %tag) { entry: br label %test1 test1: ; A %tagbit1 = and i32 %tag, 1 %tagbit1eq0 = icmp eq i32 %tagbit1, 0 br i1 %tagbit1eq0, label %test2, label %optional1 optional1: ; B call void @a() br label %test2 test2: ; C %tagbit2 = and i32 %tag, 2 %tagbit2eq0 = icmp eq i32 %tagbit2, 0 br i1 %tagbit2eq0, label %test3, label %optional2 optional2: ; D call void @b() br label %test3 test3: ; E %tagbit3 = and i32 %tag, 4 %tagbit3eq0 = icmp eq i32 %tagbit3, 0 br i1 %tagbit3eq0, label %test4, label %optional3 optional3: ; F call void @c() br label %test4 test4: ; G %tagbit4 = and i32 %tag, 8 %tagbit4eq0 = icmp eq i32 %tagbit4, 0 br i1 %tagbit4eq0, label %exit, label %optional4 optional4: ; H call void @d() br label %exit exit: ret void } here is the layout after D27742: straight_test: # @straight_test ; ... Prologue elided ; BB#0: # %entry ; A (merged with test1) ; ... More prologue elided mr 30, 3 andi. 3, 30, 1 bc 12, 1, .LBB0_2 ; BB#1: # %test2 ; C rlwinm. 3, 30, 0, 30, 30 beq 0, .LBB0_3 b .LBB0_4 .LBB0_2: # %optional1 ; B (copy of C) bl a nop rlwinm. 3, 30, 0, 30, 30 bne 0, .LBB0_4 .LBB0_3: # %test3 ; E rlwinm. 3, 30, 0, 29, 29 beq 0, .LBB0_5 b .LBB0_6 .LBB0_4: # %optional2 ; D (copy of E) bl b nop rlwinm. 3, 30, 0, 29, 29 bne 0, .LBB0_6 .LBB0_5: # %test4 ; G rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 b .LBB0_7 .LBB0_6: # %optional3 ; F (copy of G) bl c nop rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 .LBB0_7: # %optional4 ; H bl d nop .LBB0_8: # %exit ; Ret ld 30, 96(1) # 8-byte Folded Reload addi 1, 1, 112 ld 0, 16(1) mtlr 0 blr The tail-duplication has produced some benefit, but it has also produced a trellis which is not laid out optimally. With this patch, we improve the layouts of such trellises, and decrease the cost calculation for tail-duplication accordingly. This patch produces the layout A,C,E,G,B,D,F,H,Ret. This layout does have back edges, which is a negative, but it has a bigger compensating positive, which is that it handles the case where there are long strings of skipped blocks much better than the original layout. Both layouts handle runs of executed blocks equally well. Branch prediction also improves if there is any correlation between subsequent optional blocks. Here is the resulting concrete layout: straight_test: # @straight_test ; BB#0: # %entry ; A (merged with test1) mr 30, 3 andi. 3, 30, 1 bc 12, 1, .LBB0_4 ; BB#1: # %test2 ; C rlwinm. 3, 30, 0, 30, 30 bne 0, .LBB0_5 .LBB0_2: # %test3 ; E rlwinm. 3, 30, 0, 29, 29 bne 0, .LBB0_6 .LBB0_3: # %test4 ; G rlwinm. 3, 30, 0, 28, 28 bne 0, .LBB0_7 b .LBB0_8 .LBB0_4: # %optional1 ; B (Copy of C) bl a nop rlwinm. 3, 30, 0, 30, 30 beq 0, .LBB0_2 .LBB0_5: # %optional2 ; D (Copy of E) bl b nop rlwinm. 3, 30, 0, 29, 29 beq 0, .LBB0_3 .LBB0_6: # %optional3 ; F (Copy of G) bl c nop rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 .LBB0_7: # %optional4 ; H bl d nop .LBB0_8: # %exit Differential Revision: https://reviews.llvm.org/D28522 llvm-svn: 295223 2017-02-16 03:49:14 +08:00			`; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}`
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`; GCN: ds_read_b32`
			`; GCN: ds_write_b32`
			`; GCN: s_branch [[LABEL]]`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {`
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`entry:`
			`%cmp = icmp eq i32 %n, -1`
			`br i1 %cmp, label %for.exit, label %for.body`

			`for.exit:`
			`ret void`

			`for.body:`
			`%indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]`
			`%tmp = add i32 %indvar, 32`
			`%arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp`
			`%vecload = load float, float addrspace(3)* %arrayidx, align 4`
			`%add = fadd float %vecload, 1.0`
			`store float %add, float addrspace(3)* %arrayidx, align 8`
			`%inc = add i32 %indvar, 1`
			`br label %for.body`
			`}`

			`; GCN-LABEL: @loop_const_true`
			`; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:`
			`; GCN: ds_read_b32`
			`; GCN: ds_write_b32`
			`; GCN: s_branch [[LABEL]]`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {`
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`entry:`
			`br label %for.body`

			`for.exit:`
			`ret void`

			`for.body:`
			`%indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]`
			`%tmp = add i32 %indvar, 32`
			`%arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp`
			`%vecload = load float, float addrspace(3)* %arrayidx, align 4`
			`%add = fadd float %vecload, 1.0`
			`store float %add, float addrspace(3)* %arrayidx, align 8`
			`%inc = add i32 %indvar, 1`
			`br i1 true, label %for.body, label %for.exit`
			`}`

			`; GCN-LABEL: {{^}}loop_const_false:`
			`; GCN-NOT: s_branch`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {`
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`entry:`
			`br label %for.body`

			`for.exit:`
			`ret void`

			`; XXX - Should there be an S_ENDPGM?`
			`for.body:`
			`%indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]`
			`%tmp = add i32 %indvar, 32`
			`%arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp`
			`%vecload = load float, float addrspace(3)* %arrayidx, align 4`
			`%add = fadd float %vecload, 1.0`
			`store float %add, float addrspace(3)* %arrayidx, align 8`
			`%inc = add i32 %indvar, 1`
			`br i1 false, label %for.body, label %for.exit`
			`}`

			`; GCN-LABEL: {{^}}loop_const_undef:`
			`; GCN-NOT: s_branch`
			`; GCN: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {`
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`entry:`
			`br label %for.body`

			`for.exit:`
			`ret void`

			`; XXX - Should there be an s_endpgm?`
			`for.body:`
			`%indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]`
			`%tmp = add i32 %indvar, 32`
			`%arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp`
			`%vecload = load float, float addrspace(3)* %arrayidx, align 4`
			`%add = fadd float %vecload, 1.0`
			`store float %add, float addrspace(3)* %arrayidx, align 8`
			`%inc = add i32 %indvar, 1`
			`br i1 undef, label %for.body, label %for.exit`
			`}`

			`; GCN-LABEL: {{^}}loop_arg_0:`
			`; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}`
[AMDGPU] Fixed incorrect uniform branch condition Summary: I had a case where multiple nested uniform ifs resulted in code that did v_cmp comparisons, combining the results with s_and_b64, s_or_b64 and s_xor_b64 and using the resulting mask in s_cbranch_vccnz, without first ensuring that bits for inactive lanes were clear. There was already code for inserting an "s_and_b64 vcc, exec, vcc" to clear bits for inactive lanes in the case that the branch is instruction selected as s_cbranch_scc1 and is then changed to s_cbranch_vccnz in SIFixSGPRCopies. I have added the same code into SILowerControlFlow for the case that the branch is instruction selected as s_cbranch_vccnz. This de-optimizes the code in some cases where the s_and is not needed, because vcc is the result of a v_cmp, or multiple v_cmp instructions combined by s_and/s_or. We should add a pass to re-optimize those cases. Reviewers: arsenm, kzhuravl Subscribers: wdng, yaxunl, t-tye, llvm-commits, dstuttard, timcorringham, nhaehnle Differential Revision: https://reviews.llvm.org/D41292 llvm-svn: 322119 2018-01-10 05:34:43 +08:00			`; GCN: v_cmp_eq_u32{{[^,]*}}, 1,`
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00
			`; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]`
AMDGPU/SI: Avoid moving PHIs to VALU when phi values are defined in scalar branches Reviewers: arsenm Subscribers: arsenm, llvm-commits, kzhuravl Differential Revision: https://reviews.llvm.org/D23417 llvm-svn: 288095 2016-11-29 08:46:46 +08:00			`; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80`
			`; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, 4`

AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`; GCN: s_cbranch_vccnz [[LOOPBB]]`
[CodeGen] Unify MBB reference format in both MIR and debug output As part of the unification of the debug format and the MIR format, print MBB references as '%bb.5'. The MIR printer prints the IR name of a MBB only for block definitions. * find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)->getNumber\(\)/" << printMBBReference(\1)/g' find . \( -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#" << ([a-zA-Z0-9_]+)\.getNumber\(\)/" << printMBBReference(\1)/g' * find . \( -name ".txt" -o -name ".s" -o -name ".mir" -o -name ".cpp" -o -name ".h" -o -name ".ll" \) -type f -print0 \| xargs -0 sed -i '' -E 's/BB#([0-9]+)/%bb.\1/g' * grep -nr 'BB#' and fix Differential Revision: https://reviews.llvm.org/D40422 llvm-svn: 319665 2017-12-05 01:18:51 +08:00			`; GCN-NEXT: ; %bb.2`
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`; GCN-NEXT: s_endpgm`
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {`
AMDGPU: Un-xfail and add tests Un XFAIL a few tests plus a few more I had lying around in my tree, which seem to all work now but I don't see tests that quite test the same things. llvm-svn: 273655 2016-06-24 14:58:01 +08:00			`entry:`
			`br label %for.body`

			`for.exit:`
			`ret void`

			`for.body:`
			`%indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]`
			`%tmp = add i32 %indvar, 32`
			`%arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp`
			`%vecload = load float, float addrspace(3)* %arrayidx, align 4`
			`%add = fadd float %vecload, 1.0`
			`store float %add, float addrspace(3)* %arrayidx, align 8`
			`%inc = add i32 %indvar, 1`
			`br i1 %cond, label %for.body, label %for.exit`
			`}`