; llvm-project/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}simple_nested_if:
; GCN:      s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
; GCN-NEXT: s_cbranch_execz [[ENDIF]]
; GCN:      s_and_b64 exec, exec, vcc
; GCN-NEXT: ; mask branch [[ENDIF]]
; GCN-NEXT: {{^BB[0-9_]+}}:
; GCN:      store_dword
; GCN-NEXT: {{^}}[[ENDIF]]:
; GCN-NEXT: s_endpgm
; Two nested ifs that share one join block.
; Outer condition: workitem.id.x > 1 (unsigned). Inner condition: id != 2
; (the true edge of the `icmp eq` skips the inner body).
; Both the inner skip edge and bb.inner.then branch directly to
; bb.outer.end, so there is no code between the inner and the outer join.
define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = icmp ugt i32 %tmp, 1
  br i1 %tmp1, label %bb.outer.then, label %bb.outer.end

bb.outer.then:                                    ; preds = %bb
  ; Outer body: arg[id] = 0, then decide whether to run the inner body.
  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
  store i32 0, i32 addrspace(1)* %tmp4, align 4
  %tmp5 = icmp eq i32 %tmp, 2
  br i1 %tmp5, label %bb.outer.end, label %bb.inner.then

bb.inner.then:                                    ; preds = %bb.outer.then
  ; Inner body: arg[id + 1] = 1.
  %tmp7 = add i32 %tmp, 1
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
  store i32 1, i32 addrspace(1)* %tmp9, align 4
  br label %bb.outer.end

bb.outer.end:                                     ; preds = %bb.outer.then, %bb.inner.then, %bb
  ; Shared join for both ifs; nothing but the return follows.
  ret void
}

; GCN-LABEL: {{^}}uncollapsable_nested_if:
; GCN:      s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
; GCN:      s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
; GCN-NEXT: ; mask branch [[ENDIF_INNER:BB[0-9_]+]]
; GCN-NEXT: {{^BB[0-9_]+}}:
; GCN:      store_dword
; GCN-NEXT: {{^}}[[ENDIF_INNER]]:
; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER]]
; GCN:      store_dword
; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
; GCN-NEXT: s_endpgm
; Nested ifs whose joins are separate blocks with real work in between.
; Outer condition: workitem.id.x > 1 (unsigned). Inner condition: id != 2.
; Unlike @simple_nested_if, the inner if rejoins at bb.inner.end, which
; performs a store before falling through to the outer join, so the inner
; and outer joins are not adjacent.
define amdgpu_kernel void @uncollapsable_nested_if(i32 addrspace(1)* nocapture %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = icmp ugt i32 %tmp, 1
  br i1 %tmp1, label %bb.outer.then, label %bb.outer.end

bb.outer.then:                                    ; preds = %bb
  ; Outer body: arg[id] = 0, then decide whether to run the inner body.
  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
  store i32 0, i32 addrspace(1)* %tmp4, align 4
  %tmp5 = icmp eq i32 %tmp, 2
  br i1 %tmp5, label %bb.inner.end, label %bb.inner.then

bb.inner.then:                                    ; preds = %bb.outer.then
  ; Inner body: arg[id + 1] = 1.
  %tmp7 = add i32 %tmp, 1
  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
  store i32 1, i32 addrspace(1)* %tmp8, align 4
  br label %bb.inner.end

bb.inner.end:                                     ; preds = %bb.inner.then, %bb.outer.then
  ; Inner join: arg[id + 2] = 2 runs for every lane that entered the outer if.
  %tmp9 = add i32 %tmp, 2
  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp9
  store i32 2, i32 addrspace(1)* %tmp10, align 4
  br label %bb.outer.end

bb.outer.end:                                     ; preds = %bb.inner.end, %bb
  ret void
}

; GCN-LABEL: {{^}}nested_if_if_else:
; GCN:      s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
; GCN:      s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
; GCN-NEXT: s_xor_b64 [[SAVEEXEC_INNER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_INNER]]
; GCN-NEXT: ; mask branch [[THEN_INNER:BB[0-9_]+]]
; GCN-NEXT: {{^BB[0-9_]+}}:
; GCN:      store_dword
; GCN-NEXT: {{^}}[[THEN_INNER]]:
; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]]
; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]]
; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]
; GCN:      store_dword
; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
; GCN-NEXT: s_endpgm
; An if that contains an if/else, all three paths ending in one join block.
; The entry block stores arg[id] = 0 unconditionally.
; Outer condition: workitem.id.x > 1 (unsigned).
; Inner condition: id == 2 selects bb.then, otherwise bb.else.
define amdgpu_kernel void @nested_if_if_else(i32 addrspace(1)* nocapture %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
  store i32 0, i32 addrspace(1)* %tmp1, align 4
  %tmp2 = icmp ugt i32 %tmp, 1
  br i1 %tmp2, label %bb.outer.then, label %bb.outer.end

bb.outer.then:                                       ; preds = %bb
  ; No stores here: the outer body only dispatches to the inner if/else.
  %tmp5 = icmp eq i32 %tmp, 2
  br i1 %tmp5, label %bb.then, label %bb.else

bb.then:                                             ; preds = %bb.outer.then
  ; Inner then: arg[id + 1] = 1.
  %tmp3 = add i32 %tmp, 1
  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp3
  store i32 1, i32 addrspace(1)* %tmp4, align 4
  br label %bb.outer.end

bb.else:                                             ; preds = %bb.outer.then
  ; Inner else: arg[id + 2] = 2.
  %tmp7 = add i32 %tmp, 2
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7
  store i32 2, i32 addrspace(1)* %tmp9, align 4
  br label %bb.outer.end

bb.outer.end:                                        ; preds = %bb, %bb.then, %bb.else
  ret void
}

; GCN-LABEL: {{^}}nested_if_else_if:
; GCN:      s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]]
; GCN-NEXT: ; mask branch [[THEN_OUTER:BB[0-9_]+]]
; GCN-NEXT: s_cbranch_execz [[THEN_OUTER]]
; GCN-NEXT: {{^BB[0-9_]+}}:
; GCN:      store_dword
; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_ELSE:s\[[0-9:]+\]]]
; GCN-NEXT: ; mask branch [[THEN_OUTER_FLOW:BB[0-9_]+]]
; GCN-NEXT: {{^BB[0-9_]+}}:
; GCN:      store_dword
; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]:
; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]]
; GCN-NEXT: {{^}}[[THEN_OUTER]]:
; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], [[SAVEEXEC_OUTER2]]
; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]]
; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]
; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
; GCN-NEXT: {{^BB[0-9_]+}}:
; GCN:      store_dword
; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_THEN:s\[[0-9:]+\]]]
; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]
; GCN-NEXT: {{^BB[0-9_]+}}:
; GCN:      store_dword
; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
; GCN-NEXT: s_endpgm
; An if/else where each arm contains its own nested if; every path ends in
; bb.outer.end.
; The entry block stores 0 at %tmp1 (= &arg[id]) unconditionally.
; Outer condition: workitem.id.x > 1 (unsigned).
; Each arm stores one value, then conditionally (id == 2) stores a second.
; All later addresses are fixed element offsets from %tmp1.
define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
  store i32 0, i32 addrspace(1)* %tmp1, align 4
  %cc1 = icmp ugt i32 %tmp, 1
  br i1 %cc1, label %bb.outer.then, label %bb.outer.else

; preds = %bb
; Outer then: store 1 at %tmp1 + 1, then test the nested condition.
bb.outer.then:
  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 1
  store i32 1, i32 addrspace(1)* %tmp2, align 4
  %cc2 = icmp eq i32 %tmp, 2
  br i1 %cc2, label %bb.inner.then, label %bb.outer.end

; preds = %bb.outer.then
; Nested if inside the outer then: store 2 at %tmp1 + 2.
bb.inner.then:
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 2
  store i32 2, i32 addrspace(1)* %tmp3, align 4
  br label %bb.outer.end

; preds = %bb
; Outer else: store 3 at %tmp1 + 3, then test the nested condition.
; NOTE(review): this arm is only reached when id <= 1, so id == 2 cannot hold
; at run time; presumably the test depends on the branch not being folded
; away -- confirm before simplifying.
bb.outer.else:
  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 3
  store i32 3, i32 addrspace(1)* %tmp4, align 4
  %cc3 = icmp eq i32 %tmp, 2
  br i1 %cc3, label %bb.inner.then2, label %bb.outer.end

; preds = %bb.outer.else
; Nested if inside the outer else: store 4 at %tmp1 + 4.
bb.inner.then2:
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 4
  store i32 4, i32 addrspace(1)* %tmp5, align 4
  br label %bb.outer.end

; preds = %bb.outer.then, %bb.inner.then, %bb.outer.else, %bb.inner.then2
bb.outer.end:
  ret void
}

; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier:
; GCN:      s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
; GCN-NEXT: {{^BB[0-9_]+}}:
; GCN:      store_dword
; GCN-NEXT: {{^}}[[ENDIF]]:
; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
; GCN:      s_barrier
; GCN-NEXT: s_endpgm
; A single if whose join block calls the barrier intrinsic before returning.
; Condition: workitem.id.x > 1 (unsigned); the body stores arg[id] = 0.
; The barrier call sits between the join and the end of the kernel, so the
; join is not immediately followed by the return.
define amdgpu_kernel void @s_endpgm_unsafe_barrier(i32 addrspace(1)* nocapture %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = icmp ugt i32 %tmp, 1
  br i1 %tmp1, label %bb.then, label %bb.end

bb.then:                                          ; preds = %bb
  ; If body: arg[id] = 0.
  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
  store i32 0, i32 addrspace(1)* %tmp4, align 4
  br label %bb.end

bb.end:                                           ; preds = %bb.then, %bb
  ; Join block: barrier first, then return.
  call void @llvm.amdgcn.s.barrier()
  ret void
}

; Make sure scc liveness is updated if s_or_b64 is removed.
; GCN-LABEL: {{^}}scc_liveness:

; GCN: [[BB1_LOOP:BB[0-9]+_[0-9]+]]:
; GCN: s_andn2_b64 exec, exec,
; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]

; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
; GCN: s_and_b64 exec, exec, vcc

; GCN-NOT: s_or_b64 exec, exec

; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
; GCN: s_andn2_b64
; GCN-NEXT: s_cbranch_execnz

; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: s_setpc_b64
; Non-kernel function (plain `define void`) with a loop around nested
; conditionals. Every branch condition derives from the argument %arg or from
; a float loaded through an undef addrspace(5) pointer.
; Shape of the control flow:
;   bb1   - loop header; repeats itself until %arg < 519 (signed).
;   bb2   - computes %tmp3 = (%arg == 0) and picks bb4 or bb10.
;   bb4/bb8/Flow - inner if that selects a <4 x float> value.
;   bb10  - merges the vector value, then reuses %tmp3 to either exit through
;           bb12 or take the back edge through Flow1 to bb1.
; NOTE(review): the function carries attribute group #0, which includes
; readnone, although bb12 performs a volatile store; presumably an artifact of
; test reduction -- confirm before changing, since it may affect codegen.
define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
bb:
  br label %bb1

bb1:                                              ; preds = %Flow1, %bb1, %bb
  ; Loop header: leave the self-loop only when %arg < 519.
  %tmp = icmp slt i32 %arg, 519
  br i1 %tmp, label %bb2, label %bb1

bb2:                                              ; preds = %bb1
  ; %tmp3 is used twice: here and again in bb10.
  %tmp3 = icmp eq i32 %arg, 0
  br i1 %tmp3, label %bb4, label %bb10

bb4:                                              ; preds = %bb2
  ; Load through an undef pointer; only the branch structure matters here.
  %tmp6 = load float, float addrspace(5)* undef
  %tmp7 = fcmp olt float %tmp6, 0.0
  br i1 %tmp7, label %bb8, label %Flow

bb8:                                              ; preds = %bb4
  ; Vector with 0.0 inserted at element 1; the other elements stay undef.
  %tmp9 = insertelement <4 x float> undef, float 0.0, i32 1
  br label %Flow

Flow:                                             ; preds = %bb8, %bb4
  ; Join of the inner if: bb8's vector or all zeros.
  %tmp8 = phi <4 x float> [ %tmp9, %bb8 ], [ zeroinitializer, %bb4 ]
  br label %bb10

bb10:                                             ; preds = %Flow, %bb2
  ; Join of the outer if, immediately followed by the exit/back-edge decision.
  %tmp11 = phi <4 x float> [ zeroinitializer, %bb2 ], [ %tmp8, %Flow ]
  br i1 %tmp3, label %bb12, label %Flow1

Flow1:                                            ; preds = %bb10
  ; Back edge to the loop header.
  br label %bb1

bb12:                                             ; preds = %bb10
  ; Exit: volatile store of the merged vector through an undef pointer.
  store volatile <4 x float> %tmp11, <4 x float> addrspace(5)* undef, align 16
  ret void
}

; Intrinsics used by the functions in this file.
; workitem.id.x supplies the per-lane value that the kernels branch on.
declare i32 @llvm.amdgcn.workitem.id.x() #0
; The barrier is called only from @s_endpgm_unsafe_barrier.
declare void @llvm.amdgcn.s.barrier() #1

; #0: attached to the workitem.id.x declaration and to @scc_liveness.
attributes #0 = { nounwind readnone speculatable }
; #1: attached to the barrier declaration.
attributes #1 = { nounwind convergent }
; #2: not referenced by any declaration or definition in the visible part of
; this file.
attributes #2 = { nounwind }
AMDGPU: Cleanup subtarget features Try to avoid mutually exclusive features. Don't use a real default GPU, and use a fake "generic". The goal is to make it easier to see which set of features are incompatible between feature strings. Most of the test changes are due to random scheduling changes from not having a default fullspeed model. llvm-svn: 310258 2017-08-07 22:58:04 +08:00			`; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s`
[AMDGPU] Collapse adjacent SI_END_CF Add a pass to remove redundant S_OR_B64 instructions enabling lanes in the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any vector instructions between them we can only keep outer SI_END_CF, given that CFG is structured and exec bits of the outer end statement are always not less than exec bit of the inner one. This needs to be done before the RA to eliminate saved exec bits registers but after register coalescer to have no vector registers copies in between of different end cf statements. Differential Revision: https://reviews.llvm.org/D35967 llvm-svn: 309762 2017-08-02 07:14:32 +08:00
			`; GCN-LABEL: {{^}}simple_nested_if:`
			`; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]`
			`; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]`
			`; GCN-NEXT: s_cbranch_execz [[ENDIF]]`
[AMDGPU] Turn s_and_saveexec_b64 into s_and_b64 if result is unused With SI_END_CF elimination for some nested control flow we can now eliminate saved exec register completely by turning a saveexec version of instruction into just a logical instruction. Differential Revision: https://reviews.llvm.org/D36007 llvm-svn: 309766 2017-08-02 07:44:35 +08:00			`; GCN: s_and_b64 exec, exec, vcc`
[AMDGPU] Collapse adjacent SI_END_CF Add a pass to remove redundant S_OR_B64 instructions enabling lanes in the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any vector instructions between them we can only keep outer SI_END_CF, given that CFG is structured and exec bits of the outer end statement are always not less than exec bit of the inner one. This needs to be done before the RA to eliminate saved exec bits registers but after register coalescer to have no vector registers copies in between of different end cf statements. Differential Revision: https://reviews.llvm.org/D35967 llvm-svn: 309762 2017-08-02 07:14:32 +08:00			`; GCN-NEXT: ; mask branch [[ENDIF]]`
			`; GCN-NEXT: {{^BB[0-9_]+}}:`
			`; GCN: store_dword`
			`; GCN-NEXT: {{^}}[[ENDIF]]:`
			`; GCN-NEXT: s_endpgm`
			`define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {`
			`bb:`
			`%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%tmp1 = icmp ugt i32 %tmp, 1`
			`br i1 %tmp1, label %bb.outer.then, label %bb.outer.end`

			`bb.outer.then: ; preds = %bb`
			`%tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp`
			`store i32 0, i32 addrspace(1)* %tmp4, align 4`
			`%tmp5 = icmp eq i32 %tmp, 2`
			`br i1 %tmp5, label %bb.outer.end, label %bb.inner.then`

			`bb.inner.then: ; preds = %bb.outer.then`
			`%tmp7 = add i32 %tmp, 1`
			`%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7`
			`store i32 1, i32 addrspace(1)* %tmp9, align 4`
			`br label %bb.outer.end`

			`bb.outer.end: ; preds = %bb.outer.then, %bb.inner.then, %bb`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}uncollapsable_nested_if:`
			`; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]`
			`; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]`
			`; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]`
			`; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]`
			`; GCN-NEXT: ; mask branch [[ENDIF_INNER:BB[0-9_]+]]`
			`; GCN-NEXT: {{^BB[0-9_]+}}:`
			`; GCN: store_dword`
			`; GCN-NEXT: {{^}}[[ENDIF_INNER]]:`
			`; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER]]`
			`; GCN: store_dword`
			`; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:`
			`; GCN-NEXT: s_endpgm`
			`define amdgpu_kernel void @uncollapsable_nested_if(i32 addrspace(1)* nocapture %arg) {`
			`bb:`
			`%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%tmp1 = icmp ugt i32 %tmp, 1`
			`br i1 %tmp1, label %bb.outer.then, label %bb.outer.end`

			`bb.outer.then: ; preds = %bb`
			`%tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp`
			`store i32 0, i32 addrspace(1)* %tmp4, align 4`
			`%tmp5 = icmp eq i32 %tmp, 2`
			`br i1 %tmp5, label %bb.inner.end, label %bb.inner.then`

			`bb.inner.then: ; preds = %bb.outer.then`
			`%tmp7 = add i32 %tmp, 1`
			`%tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7`
			`store i32 1, i32 addrspace(1)* %tmp8, align 4`
			`br label %bb.inner.end`

			`bb.inner.end: ; preds = %bb.inner.then, %bb.outer.then`
			`%tmp9 = add i32 %tmp, 2`
			`%tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp9`
			`store i32 2, i32 addrspace(1)* %tmp10, align 4`
			`br label %bb.outer.end`

			`bb.outer.end: ; preds = %bb.inner.then, %bb`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}nested_if_if_else:`
			`; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]`
			`; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]`
			`; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]`
			`; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]`
			`; GCN-NEXT: s_xor_b64 [[SAVEEXEC_INNER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_INNER]]`
			`; GCN-NEXT: ; mask branch [[THEN_INNER:BB[0-9_]+]]`
			`; GCN-NEXT: {{^BB[0-9_]+}}:`
			`; GCN: store_dword`
			`; GCN-NEXT: {{^}}[[THEN_INNER]]:`
			`; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]]`
			`; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]]`
			`; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]`
			`; GCN: store_dword`
			`; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:`
			`; GCN-NEXT: s_endpgm`
			`define amdgpu_kernel void @nested_if_if_else(i32 addrspace(1)* nocapture %arg) {`
			`bb:`
			`%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp`
			`store i32 0, i32 addrspace(1)* %tmp1, align 4`
			`%tmp2 = icmp ugt i32 %tmp, 1`
			`br i1 %tmp2, label %bb.outer.then, label %bb.outer.end`

			`bb.outer.then: ; preds = %bb`
			`%tmp5 = icmp eq i32 %tmp, 2`
			`br i1 %tmp5, label %bb.then, label %bb.else`

			`bb.then: ; preds = %bb.outer.then`
			`%tmp3 = add i32 %tmp, 1`
			`%tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp3`
			`store i32 1, i32 addrspace(1)* %tmp4, align 4`
			`br label %bb.outer.end`

			`bb.else: ; preds = %bb.outer.then`
			`%tmp7 = add i32 %tmp, 2`
			`%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp7`
			`store i32 2, i32 addrspace(1)* %tmp9, align 4`
			`br label %bb.outer.end`

			`bb.outer.end: ; preds = %bb, %bb.then, %bb.else`
			`ret void`
			`}`

			`; GCN-LABEL: {{^}}nested_if_else_if:`
			`; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]`
			`; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]]`
			`; GCN-NEXT: ; mask branch [[THEN_OUTER:BB[0-9_]+]]`
			`; GCN-NEXT: s_cbranch_execz [[THEN_OUTER]]`
			`; GCN-NEXT: {{^BB[0-9_]+}}:`
			`; GCN: store_dword`
			`; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_ELSE:s\[[0-9:]+\]]]`
			`; GCN-NEXT: ; mask branch [[THEN_OUTER_FLOW:BB[0-9_]+]]`
			`; GCN-NEXT: {{^BB[0-9_]+}}:`
			`; GCN: store_dword`
			`; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]:`
			`; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]]`
			`; GCN-NEXT: {{^}}[[THEN_OUTER]]:`
			`; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], [[SAVEEXEC_OUTER2]]`
			`; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]]`
			`; GCN-NEXT: ; mask branch [[ENDIF_OUTER:BB[0-9_]+]]`
			`; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]`
			`; GCN-NEXT: {{^BB[0-9_]+}}:`
			`; GCN: store_dword`
			`; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_THEN:s\[[0-9:]+\]]]`
[AMDGPU] Eliminate no effect instructions before s_endpgm Differential Revision: https://reviews.llvm.org/D36585 llvm-svn: 310987 2017-08-16 12:43:49 +08:00			`; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]`
[AMDGPU] Collapse adjacent SI_END_CF Add a pass to remove redundant S_OR_B64 instructions enabling lanes in the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any vector instructions between them we can only keep outer SI_END_CF, given that CFG is structured and exec bits of the outer end statement are always not less than exec bit of the inner one. This needs to be done before the RA to eliminate saved exec bits registers but after register coalescer to have no vector registers copies in between of different end cf statements. Differential Revision: https://reviews.llvm.org/D35967 llvm-svn: 309762 2017-08-02 07:14:32 +08:00			`; GCN-NEXT: {{^BB[0-9_]+}}:`
			`; GCN: store_dword`
			`; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:`
			`; GCN-NEXT: s_endpgm`
			`define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {`
			`bb:`
			`%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp`
			`store i32 0, i32 addrspace(1)* %tmp1, align 4`
			`%cc1 = icmp ugt i32 %tmp, 1`
			`br i1 %cc1, label %bb.outer.then, label %bb.outer.else`

			`bb.outer.then:`
			`%tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 1`
			`store i32 1, i32 addrspace(1)* %tmp2, align 4`
			`%cc2 = icmp eq i32 %tmp, 2`
			`br i1 %cc2, label %bb.inner.then, label %bb.outer.end`

			`bb.inner.then:`
			`%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 2`
			`store i32 2, i32 addrspace(1)* %tmp3, align 4`
			`br label %bb.outer.end`

			`bb.outer.else:`
			`%tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 3`
			`store i32 3, i32 addrspace(1)* %tmp4, align 4`
			`%cc3 = icmp eq i32 %tmp, 2`
			`br i1 %cc3, label %bb.inner.then2, label %bb.outer.end`

			`bb.inner.then2:`
			`%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %tmp1, i32 4`
			`store i32 4, i32 addrspace(1)* %tmp5, align 4`
			`br label %bb.outer.end`

			`bb.outer.end:`
			`ret void`
			`}`

[AMDGPU] Eliminate no effect instructions before s_endpgm Differential Revision: https://reviews.llvm.org/D36585 llvm-svn: 310987 2017-08-16 12:43:49 +08:00			`; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier:`
			`; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]`
			`; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]`
			`; GCN-NEXT: {{^BB[0-9_]+}}:`
			`; GCN: store_dword`
			`; GCN-NEXT: {{^}}[[ENDIF]]:`
			`; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]`
			`; GCN: s_barrier`
			`; GCN-NEXT: s_endpgm`
			`define amdgpu_kernel void @s_endpgm_unsafe_barrier(i32 addrspace(1)* nocapture %arg) {`
			`bb:`
			`%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%tmp1 = icmp ugt i32 %tmp, 1`
			`br i1 %tmp1, label %bb.then, label %bb.end`

			`bb.then: ; preds = %bb`
			`%tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp`
			`store i32 0, i32 addrspace(1)* %tmp4, align 4`
			`br label %bb.end`

			`bb.end: ; preds = %bb.then, %bb`
			`call void @llvm.amdgcn.s.barrier()`
			`ret void`
			`}`

AMDGPU: Recompute scc liveness The various scalar bit operations set SCC, so one is erased or moved it needs to be recomputed. Not sure why the existing tests don't fail on this. llvm-svn: 312819 2017-09-09 02:51:26 +08:00			`; Make sure scc liveness is updated if sor_b64 is removed`
			`; GCN-LABEL: {{^}}scc_liveness:`

			`; GCN: [[BB1_LOOP:BB[0-9]+_[0-9]+]]:`
			`; GCN: s_andn2_b64 exec, exec,`
			`; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]`

			`; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen`
			`; GCN: s_and_b64 exec, exec, vcc`

			`; GCN-NOT: s_or_b64 exec, exec`

			`; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}`
			`; GCN: s_andn2_b64`
			`; GCN-NEXT: s_cbranch_execnz`

			`; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}`
			`; GCN: buffer_store_dword`
			`; GCN: buffer_store_dword`
			`; GCN: buffer_store_dword`
			`; GCN: buffer_store_dword`
			`; GCN: s_setpc_b64`
			`define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {`
			`bb:`
			`br label %bb1`

			`bb1: ; preds = %Flow1, %bb1, %bb`
			`%tmp = icmp slt i32 %arg, 519`
			`br i1 %tmp, label %bb2, label %bb1`

			`bb2: ; preds = %bb1`
			`%tmp3 = icmp eq i32 %arg, 0`
			`br i1 %tmp3, label %bb4, label %bb10`

			`bb4: ; preds = %bb2`
[AMDGPU] Switch to the new addr space mapping by default This requires corresponding clang change. Differential Revision: https://reviews.llvm.org/D40955 llvm-svn: 324101 2018-02-03 00:07:16 +08:00			`%tmp6 = load float, float addrspace(5)* undef`
AMDGPU: Recompute scc liveness The various scalar bit operations set SCC, so one is erased or moved it needs to be recomputed. Not sure why the existing tests don't fail on this. llvm-svn: 312819 2017-09-09 02:51:26 +08:00			`%tmp7 = fcmp olt float %tmp6, 0.0`
			`br i1 %tmp7, label %bb8, label %Flow`

			`bb8: ; preds = %bb4`
			`%tmp9 = insertelement <4 x float> undef, float 0.0, i32 1`
			`br label %Flow`

			`Flow: ; preds = %bb8, %bb4`
			`%tmp8 = phi <4 x float> [ %tmp9, %bb8 ], [ zeroinitializer, %bb4 ]`
			`br label %bb10`

			`bb10: ; preds = %Flow, %bb2`
			`%tmp11 = phi <4 x float> [ zeroinitializer, %bb2 ], [ %tmp8, %Flow ]`
			`br i1 %tmp3, label %bb12, label %Flow1`

			`Flow1: ; preds = %bb10`
			`br label %bb1`

			`bb12: ; preds = %bb10`
[AMDGPU] Switch to the new addr space mapping by default This requires corresponding clang change. Differential Revision: https://reviews.llvm.org/D40955 llvm-svn: 324101 2018-02-03 00:07:16 +08:00			`store volatile <4 x float> %tmp11, <4 x float> addrspace(5)* undef, align 16`
AMDGPU: Recompute scc liveness The various scalar bit operations set SCC, so one is erased or moved it needs to be recomputed. Not sure why the existing tests don't fail on this. llvm-svn: 312819 2017-09-09 02:51:26 +08:00			`ret void`
			`}`

[AMDGPU] Collapse adjacent SI_END_CF Add a pass to remove redundant S_OR_B64 instructions enabling lanes in the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any vector instructions between them we can only keep outer SI_END_CF, given that CFG is structured and exec bits of the outer end statement are always not less than exec bit of the inner one. This needs to be done before the RA to eliminate saved exec bits registers but after register coalescer to have no vector registers copies in between of different end cf statements. Differential Revision: https://reviews.llvm.org/D35967 llvm-svn: 309762 2017-08-02 07:14:32 +08:00			`declare i32 @llvm.amdgcn.workitem.id.x() #0`
[AMDGPU] Eliminate no effect instructions before s_endpgm Differential Revision: https://reviews.llvm.org/D36585 llvm-svn: 310987 2017-08-16 12:43:49 +08:00			`declare void @llvm.amdgcn.s.barrier() #1`
[AMDGPU] Collapse adjacent SI_END_CF Add a pass to remove redundant S_OR_B64 instructions enabling lanes in the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any vector instructions between them we can only keep outer SI_END_CF, given that CFG is structured and exec bits of the outer end statement are always not less than exec bit of the inner one. This needs to be done before the RA to eliminate saved exec bits registers but after register coalescer to have no vector registers copies in between of different end cf statements. Differential Revision: https://reviews.llvm.org/D35967 llvm-svn: 309762 2017-08-02 07:14:32 +08:00
			`attributes #0 = { nounwind readnone speculatable }`
[AMDGPU] Eliminate no effect instructions before s_endpgm Differential Revision: https://reviews.llvm.org/D36585 llvm-svn: 310987 2017-08-16 12:43:49 +08:00			`attributes #1 = { nounwind convergent }`
AMDGPU: Recompute scc liveness The various scalar bit operations set SCC, so one is erased or moved it needs to be recomputed. Not sure why the existing tests don't fail on this. llvm-svn: 312819 2017-09-09 02:51:26 +08:00			`attributes #2 = { nounwind }`