; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s
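
; Test exec-mask handling for pixel shaders: whole quad mode (WQM) must be
; enabled for computations that feed image sample instructions, whole
; wavefront mode (WWM) for cross-lane computations, and the exact mask must
; be restored for stores and exports.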

; Check that WQM isn't triggered by image load/store intrinsics.
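; (Unlike image sample instructions, image loads and stores compute no
; implicit derivatives, so the helper lanes that WQM enables are not needed.)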
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: interp
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NOT: interp
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: .size test2
define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but disabled for stores (and, in this simple case, not re-enabled) ...
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; ... and disabled for export.
;
;CHECK-LABEL: {{^}}test3x:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: exp
;CHECK-NOT: exec
;CHECK: .size test3x
define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex.0 = extractelement <4 x float> %tex, i32 0
  %tex.1 = extractelement <4 x float> %tex, i32 1
  %tex.2 = extractelement <4 x float> %tex, i32 2
  %tex.3 = extractelement <4 x float> %tex, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
  ret void
}

; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_u32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  %c.1 = mul i32 %c, %d
  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
  %c.1.bc = bitcast i32 %c.1 to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}
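
; The llvm.amdgcn.wqm intrinsic lets a frontend explicitly mark a value as
; requiring whole quad mode, rather than having the backend infer WQM from
; the kind of instruction that produced it.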

; Check that WQM is triggered by the wqm intrinsic.
;
;CHECK-LABEL: {{^}}test5:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
; does not happen - the v_add should write the return reg directly.
;CHECK-NOT: v_mov_b32_e32
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that the wqm intrinsic works correctly for integers.
;
;CHECK-LABEL: {{^}}test6:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}
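
; Whole wavefront mode (WWM) is similar to WQM, except that all lanes are
; enabled regardless of control flow, so that wavefront reductions can use
; the inactive lanes to propagate intermediate results.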

; Check that WWM is triggered by the wwm intrinsic.
;
;CHECK-LABEL: {{^}}test_wwm1:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}

; Same as above, but with an integer type.
;
;CHECK-LABEL: {{^}}test_wwm2:
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}

; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
;
;CHECK-LABEL: {{^}}test_wwm3:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
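  ; (mbcnt.lo/hi of an all-ones mask returns the lane index, so the compare
  ; below gives different results in different lanes)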
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}

; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
;
;CHECK-LABEL: {{^}}test_wwm4:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK-NEXT: v_mov_b32_e32
define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Make sure the transition from Exact to WWM then WQM works properly.
;
;CHECK-LABEL: {{^}}test_wwm5:
;CHECK: buffer_load_dword
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: s_wqm_b64 exec, exec
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that WWM is turned on correctly across basic block boundaries.
; if..then..endif version
;
;CHECK-LABEL: {{^}}test_wwm6_then:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;SI: buffer_load_dword
;VI: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: %if
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
;SI: buffer_load_dword
;VI: flat_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG2]]
define amdgpu_ps float @test_wwm6_then() {
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Check that WWM is turned on correctly across basic block boundaries.
; loop version
;
;CHECK-LABEL: {{^}}test_wwm6_loop:
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
;SI: buffer_load_dword
;VI: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG]]
;CHECK: %loop
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
;SI: buffer_load_dword
;VI: flat_load_dword
;CHECK: s_mov_b64 exec, [[ORIG2]]
define amdgpu_ps float @test_wwm6_loop() {
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  br label %loop

loop:
  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %counter.1 = sub i32 %counter, 1
  %cc = icmp ne i32 %counter.1, 0
  br i1 %cc, label %loop, label %endloop

endloop:
  ret float %out.0
}
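
; llvm.amdgcn.set.inactive copies its first operand into the active lanes and
; its second operand into the inactive lanes (via the paired s_not of exec
; below), so its source must be computed with WWM disabled.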

; Check that @llvm.amdgcn.set.inactive disables WWM.
;
;CHECK-LABEL: {{^}}test_set_inactive1:
;CHECK: buffer_load_dword
;CHECK: s_not_b64 exec, exec
;CHECK: v_mov_b32_e32
;CHECK: s_not_b64 exec, exec
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: v_add_{{[iu]}}32_e32
define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
main_body:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %src.0 = bitcast float %src to i32
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ret void
}

; Check that enabling WQM anywhere enables WQM for the set.inactive source.
;
;CHECK-LABEL: {{^}}test_set_inactive2:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src1.0 = bitcast float %src1 to i32
  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  %out = add i32 %src0.1, %src1.1
  %out.0 = bitcast i32 %out to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ret void
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: s_cbranch_execz [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: ; %bb.{{[0-9]+}}: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
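; (The buffer load below produces the value the branch condition is computed
; from, and the coordinate selected by the branch feeds an image sample, so
; the load itself must execute in WQM.)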
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
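  ; Hence the exact store / WQM load / exact store / WQM compare sequence
  ; in the CHECK lines above.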
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0)

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %coord.END.bc = bitcast i32 %coord.END to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but only if they really do need it.
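; (Here the branch condition only feeds exact code, so the compare does not
; force another switch into WQM.)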
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK-DAG: v_cmp
;CHECK-DAG: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %dtex.1 = extractelement <4 x float> %dtex, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
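; (Note in the CHECK lines that v_cmpx_ comes right after re-entering WQM,
; so the kill is applied to helper lanes as well.)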
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: buffer_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex2.0 = extractelement <4 x float> %tex2, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
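; (All sampling is finished before the store and the kill, so the pass must
; not re-enable WQM afterwards: hence the CHECK-NOT below.)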
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  ret <4 x float> %dtex
}

; Check prolog shaders.
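; (A function carrying the "amdgpu-ps-wqm-outputs" attribute, #5 below, has
; its VGPR outputs computed in whole quad mode even though nothing samples.)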
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}
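
; A loop whose body contains an image sample: the image_store in the entry
; forces exact mode, after which WQM must be re-entered and stay live across
; the whole loop.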
; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000

; CHECK: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %loop
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
; CHECK: s_cbranch_vccz

; CHECK: s_cbranch_vccnz [[LOOPHDR]]

; CHECK: ; %break
; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
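; (Below, the buffer intrinsics switch to exact mode, while the scratch
; stores and the scratch load for the alloca stay in WQM.)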
;
; CHECK-LABEL: {{^}}test_alloca:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
  %array = alloca [32 x i32], align 4, addrspace(5)

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4

  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)

  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
  %c = load i32, i32 addrspace(5)* %c.gep, align 4
  %c.bc = bitcast i32 %c to float
  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0)

  ret void
}

; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %cc = icmp sgt i32 %c, 0
  br i1 %cc, label %if, label %else

if:
  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
  unreachable

else:
  ret <4 x float> %dtex
}

; Test awareness that s_wqm_b64 clobbers SCC.
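; (The s_wqm_b64 must not be placed between the s_cmp_ and the
; s_cbranch_scc that consumes SCC, which the CHECK-NEXT pair verifies.)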
;
; CHECK-LABEL: {{^}}test_scc:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_cmp_
; CHECK-NEXT: s_cbranch_scc
; CHECK: ; %else
; CHECK: image_sample
; CHECK: ; %if
; CHECK: image_sample
; CHECK: ; %end
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
main_body:
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else

if:
  %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  br label %end

else:
  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  br label %end

end:
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ret <4 x float> %r
}

; Check a case of a block being entirely WQM except for a bit of WWM.
; There was a bug where it forgot to enter and leave WWM.
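; (@llvm.amdgcn.set.inactive opens the WWM computation and @llvm.amdgcn.wwm
; closes it; the s_or_saveexec {{.*}}, -1 checked below is the WWM entry
; that enables all lanes.)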
;
;CHECK-LABEL: {{^}}test_wwm_within_wqm:
;CHECK: %IF
;CHECK: s_or_saveexec_b64 {{.*}}, -1
;CHECK: ds_swizzle
;
define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
  %data4f = sitofp i32 %data4 to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1

declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3

declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)

attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }