; FIXME: The si scheduler crashes when lane mask tracking is enabled, so
; we need to disable this when the si scheduler is being used.
; The only way the subtarget knows that the si machine scheduler is being used
; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend
; won't know what scheduler we are using.
; RUN: llc -march=amdgcn --misched=si -mattr=si-scheduler < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 --misched=si -mattr=si-scheduler < %s | FileCheck %s

; The test checks that the "si" machine scheduler pass works correctly.

; CHECK-LABEL: {{^}}main:
; CHECK: s_wqm
; CHECK: s_load_dwordx8
; CHECK: s_load_dwordx4
; CHECK: s_waitcnt lgkmcnt(0)
; CHECK: image_sample
; CHECK: s_waitcnt vmcnt(0)
; CHECK: exp
; CHECK: s_endpgm
define amdgpu_ps void @main([6 x <16 x i8>] addrspace(4)* inreg %arg, [17 x <16 x i8>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
main_body:
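; Load the image resource (<8 x i32>) and sampler (<4 x i32>) descriptors used
; by the image_sample below.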
%tmp = bitcast [34 x <8 x i32>] addrspace(4)* %arg3 to <32 x i8> addrspace(4)*
%tmp22 = load <32 x i8>, <32 x i8> addrspace(4)* %tmp, align 32, !tbaa !0
%tmp23 = bitcast [17 x <4 x i32>] addrspace(4)* %arg2 to <16 x i8> addrspace(4)*
%tmp24 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp23, align 16, !tbaa !0
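; Interpolate the two texture coordinates from the barycentrics in %arg11.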
%i.i = extractelement <2 x i32> %arg11, i32 0
%j.i = extractelement <2 x i32> %arg11, i32 1
%i.f.i = bitcast i32 %i.i to float
%j.f.i = bitcast i32 %j.i to float
%p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #1
%p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #1
%i.i1 = extractelement <2 x i32> %arg11, i32 0
%j.i2 = extractelement <2 x i32> %arg11, i32 1
%i.f.i3 = bitcast i32 %i.i1 to float
%j.f.i4 = bitcast i32 %j.i2 to float
%p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #1
%p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #1
%tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32>
%tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32>
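; Sample the 2D image using the interpolated coordinates and the descriptors
; loaded above.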
%tmp31 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %p2.i, float %p2.i6, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i1 0, i32 0, i32 0)
%tmp32 = extractelement <4 x float> %tmp31, i32 0
%tmp33 = extractelement <4 x float> %tmp31, i32 1
%tmp34 = extractelement <4 x float> %tmp31, i32 2
%tmp35 = extractelement <4 x float> %tmp31, i32 3
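; Pack the four sampled channels to half precision and export the result.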
%tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp32, float %tmp33)
%tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp34, float %tmp35)
call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 false) #0
ret void
}

declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readonly }

!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", !2}
!2 = !{!"tbaa root"}

; CHECK-LABEL: amdgpu_ps_main:
; CHECK: s_buffer_load_dword
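; The second test builds a buffer descriptor from a scalar argument and checks
; that the scalar buffer load is emitted.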
define amdgpu_ps void @_amdgpu_ps_main(i32 %arg) local_unnamed_addr {
.entry:
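; Materialize a descriptor pointer from the 32-bit argument, load the
; <4 x i32> descriptor, and issue a scalar buffer load.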
%tmp = insertelement <2 x i32> zeroinitializer, i32 %arg, i32 0
%tmp1 = bitcast <2 x i32> %tmp to i64
%tmp2 = inttoptr i64 %tmp1 to <4 x i32> addrspace(4)*
%tmp3 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp2, align 16
%tmp4 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp3, i32 0, i32 0) #0
switch i32 %tmp4, label %bb [
i32 0, label %bb5
i32 1, label %bb6
]

bb: ; preds = %.entry
unreachable

bb5: ; preds = %.entry
unreachable

bb6: ; preds = %.entry
unreachable
}

; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) #1