; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s

; Check that WQM isn't triggered by image load/store intrinsics.
;
; The shader only loads a texel at %c and stores it back to the same
; coordinate. No image sample is involved, so the generated code must not
; contain any s_wqm instruction.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by image samples and left untouched for loads...
;
; The first component of the sample result is used as an index into %ptr; the
; loaded float is then exported. The checks expect WQM to be entered right at
; the top of main_body and the exec mask not to be touched again afterwards.
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK-NOT: exec
define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) {
main_body:
  %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
  %c.3 = extractelement <4 x i32> %c.2, i32 0
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
  %data = load float, float addrspace(1)* %gep
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
  ret void
}

; ... but disabled for stores (and, in this simple case, not re-enabled).
;
; A sample result is written out with a buffer store intrinsic and also
; returned. The checks expect the original exec mask to be saved, WQM to be
; entered, and exec to be and-ed back with the saved mask; after the store no
; further exec manipulation may appear before the end of the function.
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x float> %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)

  ret <4 x float> %tex
}

; Check that WQM is re-enabled when required.
;
; The product %c * %d feeds both a buffer store and, afterwards, two dependent
; image samples. The checks expect the multiply to happen in WQM, exec to be
; restricted to the saved mask for the store, and WQM to be switched on again
; before the samples.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
  %c.1.bc = bitcast i32 %c.1 to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  ret <4 x float> %dtex
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
; The IF block performs two dependent image samples; the ELSE block performs a
; buffer store. The checks expect the ELSE block to be emitted first, with the
; store bracketed by an s_and_saveexec / s_mov pair on the exec mask.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to test_control_flow_0.
;
; Same IF (two image samples) and ELSE (buffer store) blocks, but the
; conditional branch targets are swapped. The checks expect the IF block to be
; emitted first, and the Flow block to restrict exec to the saved original
; mask before the ELSE block's store is reached.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
;
; A buffer load sits between two buffer stores; the loaded value decides the
; branch, and the branch selects the coordinate used by the final image
; sample. The checks expect exec to alternate: saved mask for each store, WQM
; for the load and for the compare that follows the second store.
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %coord.END.bc = bitcast i32 %coord.END to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  ret <4 x float> %tex
}

; ... but only if they really do need it.
;
; Here the branch condition is derived from a sample result, and nothing after
; the branch samples again. The checks expect a single switch from WQM back to
; the saved exec mask (after the first sample) and no second s_wqm before the
; compare and the store.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: v_cmp
;CHECK: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex.1 = extractelement <4 x float> %dtex, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
; The IF block copies a float with a buffer load and a buffer store, then
; falls through to END, which performs two dependent image samples. The checks
; expect the load/store pair to be bracketed by s_and_saveexec / s_mov on exec.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  ret <4 x float> %dtex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
; Sequence in the shader: sample, buffer store, kill on %z, buffer store, two
; more samples. The checks expect WQM to be re-entered before the v_cmpx that
; implements the kill, and the second store to be bracketed by
; s_and_saveexec / s_mov on exec.
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: buffer_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
;
; Both samples come before the buffer store and the kill, and nothing samples
; afterwards. The checks expect that no wqm instruction appears between the
; store and the v_cmpx that implements the kill.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  ret <4 x float> %dtex
}

; Check prolog shaders.
;
; The function carries the "amdgpu-ps-wqm-outputs" attribute (group #4). Its
; presence indicates that VGPR outputs should be computed in whole quad mode;
; it is intended for prolog pixel shaders, so that derivatives can be taken of
; shader inputs computed by the prolog.
; Background: https://bugs.freedesktop.org/show_bug.cgi?id=95130 and
; http://reviews.llvm.org/D20839 (llvm-svn 272063).
;
; The checks expect the add that produces the return value to be executed
; between s_wqm and the restore of the saved exec mask.
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; Loop whose body samples an image and whose exit condition compares a float
; counter (stepped by 2.0) against 7.0; an image store precedes the loop.
; The checks expect the store to run with the saved exec mask, WQM to be
; re-entered before the loop, and the loop branch to be an s_cbranch_vccz fed
; by a v_cmp that writes vcc (0x40e00000 is the bit pattern of 7.0).
;
; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000

; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]
; CHECK: ; %break

; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c.iv, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
;
; The shader interleaves buffer store intrinsics with a plain volatile store
; to, and a dynamically indexed load from, a local alloca array; the loaded
; value is the coordinate of an image sample. The checks expect the saved exec
; mask around each intrinsic store and WQM around the alloca store and load.
;
; CHECK-LABEL: {{^}}test_alloca:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
  %array = alloca [32 x i32], align 4

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
  store volatile i32 %a, i32* %s.gep, align 4

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)

  %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
  %c = load i32, i32* %c.gep, align 4
  %c.bc = bitcast i32 %c to float
  %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  ret void
}

; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; The shader consists of two dependent image samples and a return. The checks
; expect exec to be and-ed with the saved mask and not touched afterwards.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  ret <4 x float> %dtex
}

; Variant of the non-void return test in which one successor of the branch
; ends in `unreachable` (after a volatile store of the sample result) and only
; the other successor returns. The checks are the same: exec is and-ed with
; the saved mask once and not touched afterwards.
;
; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  %cc = icmp sgt i32 %c, 0
  br i1 %cc, label %if, label %else

if:
  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
  unreachable

else:
  ret <4 x float> %dtex
}

; Test awareness that s_wqm_b64 clobbers SCC.
;
; The branch condition is a scalar compare on the inreg argument %sel; each
; successor performs one image sample, and the merged result is followed by a
; buffer store. The checks expect s_wqm to come before the s_cmp, with the
; s_cbranch_scc immediately after the compare, and exec to be and-ed with the
; saved mask inside both the if and the else block.
;
; CHECK-LABEL: {{^}}test_scc:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_cmp_
; CHECK-NEXT: s_cbranch_scc
; CHECK: ; %if
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: ; %else
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: ; %end
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
main_body:
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else

if:
  %r.if = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float 0.0, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  br label %end

else:
  %r.else = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0.0, float bitcast (i32 1 to float)>, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
  br label %end

end:
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  ret <4 x float> %r
}

; Intrinsic declarations used by the tests in this file.
;
; Attribute groups are assigned by memory behaviour: exports, stores and kill
; are only nounwind (#1); loads are nounwind readonly (#2); image samples are
; declared nounwind readnone (#3). The buffer stores were previously tagged
; with the readonly group and the loads with the readnone group, which
; contradicts what those operations do.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
declare void @llvm.AMDGPU.kill(float) #1

; #0 is referenced by the image sample call sites in this file but had no
; definition; define it conservatively so the reference is not dangling.
; NOTE(review): confirm no stronger call-site attributes were intended.
attributes #0 = { nounwind }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
; "amdgpu-ps-wqm-outputs" indicates that VGPR outputs should be computed in
; whole quad mode; it is meant for prolog pixel shaders (used by
; test_prolog_1).
attributes #4 = { "amdgpu-ps-wqm-outputs" }