llvm-project/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s

; SI-LABEL: {{^}}br_i1_phi:

; SI: ; %bb
; SI:    s_mov_b64           [[TMP:s\[[0-9]+:[0-9]+\]]], 0

; SI: ; %bb2
; SI:    s_mov_b64           [[TMP]], exec

; SI: ; %bb3
; SI:    s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]

; Kernel with an i1 phi whose incoming values are the constants true and false.
; The phi in %bb3 is true exactly when control arrived through %bb2, and %bb2 is
; only entered when %cmp is true, so %tmp carries the same value as %cmp.
define amdgpu_kernel void @br_i1_phi(i32 %arg) {
bb:
  %tidig = call i32 @llvm.amdgcn.workitem.id.x()
  ; The branch condition is the low bit of the workitem id.
  %cmp = trunc i32 %tidig to i1
  br i1 %cmp, label %bb2, label %bb3

; Empty block: it exists only to give the phi below a second predecessor.
bb2:                                              ; preds = %bb
  br label %bb3

bb3:                                              ; preds = %bb2, %bb
  ; i1 phi of two constants, then immediately used as a branch condition.
  %tmp = phi i1 [ true, %bb2 ], [ false, %bb ]
  br i1 %tmp, label %bb4, label %bb6

bb4:                                              ; preds = %bb3
  ; Volatile load from an undef global (addrspace(1)) pointer.
  ; NOTE(review): presumably volatile so the load is not deleted and this
  ; block stays non-empty - confirm.
  %val = load volatile i32, i32 addrspace(1)* undef
  ; %tmp5 is never used after this point.
  %tmp5 = mul i32 %val, %arg
  br label %bb6

bb6:                                              ; preds = %bb4, %bb3
  ret void
}

; Intrinsic called in @br_i1_phi to obtain the value its branch condition is
; truncated from; it carries attribute group #0.
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Attribute group #0: nounwind + readnone.
attributes #0 = { nounwind readnone }

; Make sure this won't crash.
; A <2 x i1> phi receives undef from %entry and the fcmp result from %false,
; and its value drives a vector select. Two v_cndmask_b32_e64 instructions are
; expected (presumably one per vector element - confirm).
; The label pattern ends in ':' so that it matches only this function's label
; line and not a longer symbol that merely starts with the same name.
; SI-LABEL: {{^}}vcopy_i1_undef:
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
define <2 x float> @vcopy_i1_undef(<2 x float> addrspace(1)* %p) {
entry:
  ; Branch on an undef condition: either successor may be taken.
  br i1 undef, label %exit, label %false

false:
  %x = load <2 x float>, <2 x float> addrspace(1)* %p
  ; Element-wise "ordered and not equal" compare against zero.
  %cmp = fcmp one <2 x float> %x, zeroinitializer
  br label %exit

exit:
  ; The incoming value from %entry is undef; this is the i1 copy under test.
  %c = phi <2 x i1> [ undef, %entry ], [ %cmp, %false ]
  ; Per element: 2.0 where %c is true, 4.0 otherwise.
  %ret = select <2 x i1> %c, <2 x float> <float 2.0, float 2.0>, <2 x float> <float 4.0, float 4.0>
  ret <2 x float> %ret
}
Enable FeatureFlatForGlobal on Volcanic Islands This switches to the workaround that HSA defaults to for the mesa path. This should be applied to the 4.0 branch. Patch by Vedran Miletić <vedran@miletic.net> llvm-svn: 292982 2017-01-25 06:02:15 +08:00			`; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s`
R600/SI: Enable all tests that pass on VI without changes llvm-svn: 227214 2015-01-28 01:27:15 +08:00			`; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s`
R600/SI: Add another failing testcase for i1 copies It's not handling phis. llvm-svn: 220371 2014-10-22 13:30:42 +08:00
R600/SI: Move SIFixSGPRCopies to inst selector passes This should expose more of the actually used VALU instructions to the machine optimization passes. This also should help getting i1 handling into a better state. For not entirely understood reasons, this fixes the split-scalar-i64-add.ll test where a 64-bit add would only partially be moved to the VALU resulting in use of undefined VCC. llvm-svn: 222256 2014-11-19 05:06:58 +08:00			`; SI-LABEL: {{^}}br_i1_phi:`
AMDGPU: Rewrite SILowerI1Copies to always stay on SALU Summary: Instead of writing boolean values temporarily into 32-bit VGPRs if they are involved in PHIs or are observed from outside a loop, we use bitwise masking operations to combine lane masks in a way that is consistent with wave control flow. Move SIFixSGPRCopies to before this pass, since that pass incorrectly attempts to move SGPR phis to VGPRs. This should recover most of the code quality that was lost with the bug fix in "AMDGPU: Remove PHI loop condition optimization". There are still some relevant cases where code quality could be improved, in particular: - We often introduce redundant masks with EXEC. Ideally, we'd have a generic computeKnownBits-like analysis to determine whether masks are already masked by EXEC, so we can avoid this masking both here and when lowering uniform control flow. - The criterion we use to determine whether a def is observed from outside a loop is conservative: it doesn't check whether (loop) branch conditions are uniform. Change-Id: Ibabdb373a7510e426b90deef00f5e16c5d56e64b Reviewers: arsenm, rampitec, tpr Subscribers: kzhuravl, jvesely, wdng, mgorny, yaxunl, dstuttard, t-tye, eraman, llvm-commits Differential Revision: https://reviews.llvm.org/D53496 llvm-svn: 345719 2018-10-31 21:27:08 +08:00
			`; SI: ; %bb`
			`; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], 0`

			`; SI: ; %bb2`
			`; SI: s_mov_b64 [[TMP]], exec`

			`; SI: ; %bb3`
			`; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]`

AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel. Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing in one place that actually wanted a non-kernel). llvm-svn: 298444 2017-03-22 05:39:51 +08:00			`define amdgpu_kernel void @br_i1_phi(i32 %arg) {`
R600/SI: Add another failing testcase for i1 copies It's not handling phis. llvm-svn: 220371 2014-10-22 13:30:42 +08:00			`bb:`
AMDGPU: Remove old intrinsic uses llvm-svn: 303305 2017-05-18 05:38:21 +08:00			`%tidig = call i32 @llvm.amdgcn.workitem.id.x()`
AMDGPU/SI: Detect uniform branches and emit s_cbranch instructions Reviewers: arsenm Subscribers: mareko, MatzeB, qcolombet, arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D16603 llvm-svn: 260765 2016-02-13 07:45:29 +08:00			`%cmp = trunc i32 %tidig to i1`
			`br i1 %cmp, label %bb2, label %bb3`
R600/SI: Add another failing testcase for i1 copies It's not handling phis. llvm-svn: 220371 2014-10-22 13:30:42 +08:00
			`bb2: ; preds = %bb`
			`br label %bb3`

			`bb3: ; preds = %bb2, %bb`
			`%tmp = phi i1 [ true, %bb2 ], [ false, %bb ]`
			`br i1 %tmp, label %bb4, label %bb6`

			`bb4: ; preds = %bb3`
AMDGPU: Fix a few slightly broken tests Fix minor bugs and uses of undef which break when pointer related optimization passes are run. llvm-svn: 269944 2016-05-18 23:48:44 +08:00			`%val = load volatile i32, i32 addrspace(1)* undef`
			`%tmp5 = mul i32 %val, %arg`
R600/SI: Add another failing testcase for i1 copies It's not handling phis. llvm-svn: 220371 2014-10-22 13:30:42 +08:00			`br label %bb6`

			`bb6: ; preds = %bb4, %bb3`
			`ret void`
			`}`
AMDGPU/SI: Detect uniform branches and emit s_cbranch instructions Reviewers: arsenm Subscribers: mareko, MatzeB, qcolombet, arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D16603 llvm-svn: 260765 2016-02-13 07:45:29 +08:00
AMDGPU: Remove old intrinsic uses llvm-svn: 303305 2017-05-18 05:38:21 +08:00			`declare i32 @llvm.amdgcn.workitem.id.x() #0`
AMDGPU/SI: Detect uniform branches and emit s_cbranch instructions Reviewers: arsenm Subscribers: mareko, MatzeB, qcolombet, arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D16603 llvm-svn: 260765 2016-02-13 07:45:29 +08:00
AMDGPU: Remove old intrinsic uses llvm-svn: 303305 2017-05-18 05:38:21 +08:00			`attributes #0 = { nounwind readnone }`
[SelectionDAG] Enhance the simplification of `copyto` from `implicit-def`. Summary: - The current implementation simplifies the case where the source of `copyto` is `implicit-def`ed. However, it only works when that `implicit-def` is single-used since it detects that from `implicit-def` and cannot determine which destination vreg should be used if there are multiple uses. - This patch changes that detection when `copyto` is being emitted. If that `copyto`'s source is defined from `implicit-def`, it simplifies it. Hence, it works even when that `implicit-def` is multi-used. - Apart from simplifying the internal IR, it won't improve the quality of code generation. However, it helps to detect `implicit-def` in a straightforward manner in some passes, such as `si-i1-copies`. A test case is added. Reviewers: sunfish, nhaehnle Subscribers: jvesely, hiraditya, asbirlea, llvm-commits, yaxunl Tags: #llvm Differential Revision: https://reviews.llvm.org/D62342 llvm-svn: 361777 2019-05-28 02:26:29 +08:00
			`; Make sure this won't crash.`
			`; SI-LABEL: {{^}}vcopy_i1_undef`
			`; SI: v_cndmask_b32_e64`
			`; SI: v_cndmask_b32_e64`
			`define <2 x float> @vcopy_i1_undef(<2 x float> addrspace(1)* %p) {`
			`entry:`
			`br i1 undef, label %exit, label %false`

			`false:`
			`%x = load <2 x float>, <2 x float> addrspace(1)* %p`
			`%cmp = fcmp one <2 x float> %x, zeroinitializer`
			`br label %exit`

			`exit:`
			`%c = phi <2 x i1> [ undef, %entry ], [ %cmp, %false ]`
			`%ret = select <2 x i1> %c, <2 x float> <float 2.0, float 2.0>, <2 x float> <float 4.0, float 4.0>`
			`ret <2 x float> %ret`
			`}`