llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s

; VI-LABEL: {{^}}dpp_test:
; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
; VI: v_mov_b32_e32 v1, s{{[0-9]+}}
; VI-OPT: s_nop 1
; VI-NOOPT: s_nop 0
; VI-NOOPT: s_nop 0
; VI: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]
define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 1) #0
  store i32 %tmp0, i32 addrspace(1)* %out
  ret void
}

; VI-LABEL: {{^}}dpp_test1:
; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
; VI-NEXT: s_nop 0
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp
  %tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4
  fence syncscope("workgroup-one-as") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup-one-as") acquire
  %tmp4 = add nsw i32 %tmp3, %tmp3
  %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
  %tmp6 = add nsw i32 %tmp5, %tmp4
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  store i32 %tmp6, i32* %tmp7, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0

attributes #0 = { nounwind readnone convergent }
[AMDGPU]: Turn on the DPP combiner by default Differential revision: https://reviews.llvm.org/D55314 llvm-svn: 348371 2018-12-05 23:21:17 +08:00			`; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s \| FileCheck -check-prefix=VI -check-prefix=VI-OPT %s`
			`; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s \| FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s`
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic Summary: Now that we've made all the necessary backend changes, we can add a new intrinsic which exposes the new capabilities to IR producers. Since llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we should deprecate the former. We also add tests for all the functionality that was added in previous changes, now that we can access it via an IR construct. Reviewers: tstellar, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34718 llvm-svn: 310399 2017-08-09 02:52:22 +08:00
			`; VI-LABEL: {{^}}dpp_test:`
			`; VI: v_mov_b32_e32 v0, s{{[0-9]+}}`
			`; VI: v_mov_b32_e32 v1, s{{[0-9]+}}`
run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 18:02:41 +08:00			`; VI-OPT: s_nop 1`
			`; VI-NOOPT: s_nop 0`
			`; VI-NOOPT: s_nop 0`
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic Summary: Now that we've made all the necessary backend changes, we can add a new intrinsic which exposes the new capabilities to IR producers. Since llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we should deprecate the former. We also add tests for all the functionality that was added in previous changes, now that we can access it via an IR construct. Reviewers: tstellar, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34718 llvm-svn: 310399 2017-08-09 02:52:22 +08:00			`; VI: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]`
			`define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {`
			`%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 1) #0`
			`store i32 %tmp0, i32 addrspace(1)* %out`
			`ret void`
			`}`

run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 18:02:41 +08:00			`; VI-LABEL: {{^}}dpp_test1:`
AMDGPU: Select VOP3 form of add The VOP3 form should always be the preferred selection, to be shrunk later. This should only be an optimization issue, but this partially works around a problem from clobbering VCC when SIFixSGPRCopies rewrites an SCC defining operation directly to VCC. 3 of the testcases are regressions from failing to fold the immediate in cases it should. These can be avoided by improving the VCC liveness handling in SIFoldOperands. Simply increasing the threshold to computeRegisterLiveness works, although this is common enough that VCC liveness should probably be tracked throughout the pass. The hack of leaving behind an implicit_def instruction to avoid breaking iterator wastes instruction count, which inhibits finding the VCC def in long chains of adds. Doing this however exposes different, worse looking regressions from poor scheduling behavior. This could probably be avoided around by forcing the shrink of the addc here, but the scheduler should probably be fixed. The r600 add test needs to be split out because it asserts on the arguments in the new test during the calling convention lowering. llvm-svn: 360293 2019-05-09 06:09:57 +08:00			`; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}`
			`; VI-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}`
run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 18:02:41 +08:00			`; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0`
			`; VI-NEXT: s_nop 0`
			`; VI-NEXT: s_nop 0`
Relax fast register allocator related test cases; NFC - Relex hard coded registers and stack frame sizes - Some test cleanups - Change phi-dbg.ll to match on mir output after phi elimination instead of going through the whole codegen pipeline. This is in preparation for https://reviews.llvm.org/D52010 I'm committing all the test changes upfront that work before and after independently. llvm-svn: 345532 2018-10-30 04:10:42 +08:00			`; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf`
run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 18:02:41 +08:00			`@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4`
			`define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {`
			`bb:`
			`%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%tmp1 = zext i32 %tmp to i64`
			`%tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp`
			`%tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4`
AMDGPU: Add support for cross address space synchronization scopes Differential Revision: https://reviews.llvm.org/D59517 llvm-svn: 356946 2019-03-26 04:50:21 +08:00			`fence syncscope("workgroup-one-as") release`
run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 18:02:41 +08:00			`tail call void @llvm.amdgcn.s.barrier()`
AMDGPU: Add support for cross address space synchronization scopes Differential Revision: https://reviews.llvm.org/D59517 llvm-svn: 356946 2019-03-26 04:50:21 +08:00			`fence syncscope("workgroup-one-as") acquire`
run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 18:02:41 +08:00			`%tmp4 = add nsw i32 %tmp3, %tmp3`
			`%tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)`
			`%tmp6 = add nsw i32 %tmp5, %tmp4`
			`%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1`
			`store i32 %tmp6, i32* %tmp7, align 4`
			`ret void`
			`}`

			`declare i32 @llvm.amdgcn.workitem.id.x()`
			`declare void @llvm.amdgcn.s.barrier()`
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic Summary: Now that we've made all the necessary backend changes, we can add a new intrinsic which exposes the new capabilities to IR producers. Since llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we should deprecate the former. We also add tests for all the functionality that was added in previous changes, now that we can access it via an IR construct. Reviewers: tstellar, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34718 llvm-svn: 310399 2017-08-09 02:52:22 +08:00			`declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0`

			`attributes #0 = { nounwind readnone convergent }`