; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s

; Pixel shader that calls llvm.amdgcn.wqm.demote with a constant `false`
; condition in the entry block and then exports a value selected on
; %arg0 < 0.0. The expected code clears exec unconditionally and keeps an
; early-exit block (null export followed by s_endpgm) on the scc0 path.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
; SI-LABEL: static_exact:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT:    s_andn2_b64 exec, exec, exec
; SI-NEXT:    s_cbranch_scc0 BB0_2
; SI-NEXT:  ; %bb.1: ; %.entry
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  BB0_2:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: static_exact:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
; GFX9-NEXT:    s_cbranch_scc0 BB0_2
; GFX9-NEXT:  ; %bb.1: ; %.entry
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  BB0_2:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: static_exact:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 BB0_2
; GFX10-32-NEXT:  ; %bb.1: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  BB0_2:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: static_exact:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
; GFX10-64-NEXT:    s_cbranch_scc0 BB0_2
; GFX10-64-NEXT:  ; %bb.1: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  BB0_2:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  %c0 = fcmp olt float %arg0, 0.000000e+00
  ; %c1 is computed but has no user in this function.
  %c1 = fcmp oge float %arg1, 0.0
  call void @llvm.amdgcn.wqm.demote(i1 false)
  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}
|
|
|
|
; Pixel shader that calls llvm.amdgcn.wqm.demote in the entry block with a
; run-time condition (%arg1 >= 0.0) and then exports a value selected on
; %arg0 < 0.0. The expected code copies exec into a scalar register, removes
; the lanes whose condition is false from that copy, and ANDs the copy into
; exec before the export; the scc0 path ends in a null export + s_endpgm.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
; SI-LABEL: dynamic_exact:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
; SI-NEXT:    s_mov_b64 s[2:3], exec
; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT:    s_cbranch_scc0 BB1_2
; SI-NEXT:  ; %bb.1: ; %.entry
; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  BB1_2:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: dynamic_exact:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_cbranch_scc0 BB1_2
; GFX9-NEXT:  ; %bb.1: ; %.entry
; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  BB1_2:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: dynamic_exact:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_xor_b32 s0, s0, exec_lo
; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
; GFX10-32-NEXT:    s_cbranch_scc0 BB1_2
; GFX10-32-NEXT:  ; %bb.1: ; %.entry
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  BB1_2:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: dynamic_exact:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX10-64-NEXT:    s_cbranch_scc0 BB1_2
; GFX10-64-NEXT:  ; %bb.1: ; %.entry
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  BB1_2:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  %c0 = fcmp olt float %arg0, 0.000000e+00
  %c1 = fcmp oge float %arg1, 0.0
  ; The demote condition is the run-time compare on %arg1.
  call void @llvm.amdgcn.wqm.demote(i1 %c1)
  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}
|
|
|
|
; Pixel shader where llvm.amdgcn.wqm.demote(false) sits in its own basic
; block (%.demote), reached only when bit 0 of (fptosi(%arg0) | fptosi(%arg1))
; is set; both paths join in %.continue, which performs the export.
; The expected code keeps a copy of the entry exec mask in a scalar register,
; clears the currently active lanes from it inside %.demote, and branches to
; a null-export + s_endpgm block on the scc0 path.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps void @branch(float %arg0, float %arg1) {
; SI-LABEL: branch:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
; SI-NEXT:    s_mov_b64 s[2:3], exec
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_and_b32_e32 v1, 1, v0
; SI-NEXT:    v_and_b32_e32 v0, 1, v0
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
; SI-NEXT:    s_cbranch_execz BB2_3
; SI-NEXT:  ; %bb.1: ; %.demote
; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
; SI-NEXT:    s_cbranch_scc0 BB2_4
; SI-NEXT:  ; %bb.2: ; %.demote
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:  BB2_3: ; %.continue
; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  BB2_4:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: branch:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX9-NEXT:    v_and_b32_e32 v1, 1, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execz BB2_3
; GFX9-NEXT:  ; %bb.1: ; %.demote
; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
; GFX9-NEXT:    s_cbranch_scc0 BB2_4
; GFX9-NEXT:  ; %bb.2: ; %.demote
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:  BB2_3: ; %.continue
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  BB2_4:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
;
; GFX10-32-LABEL: branch:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-32-NEXT:    v_and_b32_e32 v1, 1, v0
; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
; GFX10-32-NEXT:    s_cbranch_execz BB2_3
; GFX10-32-NEXT:  ; %bb.1: ; %.demote
; GFX10-32-NEXT:    s_andn2_b32 s1, s1, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
; GFX10-32-NEXT:  ; %bb.2: ; %.demote
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:  BB2_3: ; %.continue
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  BB2_4:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
;
; GFX10-64-LABEL: branch:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-64-NEXT:    v_and_b32_e32 v1, 1, v0
; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
; GFX10-64-NEXT:    s_cbranch_execz BB2_3
; GFX10-64-NEXT:  ; %bb.1: ; %.demote
; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
; GFX10-64-NEXT:  ; %bb.2: ; %.demote
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:  BB2_3: ; %.continue
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  BB2_4:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
.entry:
  %i0 = fptosi float %arg0 to i32
  %i1 = fptosi float %arg1 to i32
  %c0 = or i32 %i0, %i1
  %c1 = and i32 %c0, 1
  ; %c2 is true when bit 0 of the OR-ed integer values is clear.
  %c2 = icmp eq i32 %c1, 0
  br i1 %c2, label %.continue, label %.demote

.demote:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue

.continue:
  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}
|
|
|
|
|
|
; Pixel shader where llvm.amdgcn.wqm.demote(false) is called in a conditional
; block (%.demote, taken when %z is not ordered-less-than 0.0) that precedes
; two dependent 1D image samples in %.continue.
; The expected code saves exec in s[12:13] (s12 in the wave32 run), switches
; exec with s_wqm at entry, removes the active lanes from the saved mask in
; %.demote, and ANDs the saved mask into exec before the final sample.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; SI-LABEL: wqm_demote_1:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    s_mov_b64 s[12:13], exec
; SI-NEXT:    s_wqm_b64 exec, exec
; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; SI-NEXT:    s_cbranch_execz BB3_3
; SI-NEXT:  ; %bb.1: ; %.demote
; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; SI-NEXT:    s_cbranch_scc0 BB3_4
; SI-NEXT:  ; %bb.2: ; %.demote
; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
; SI-NEXT:  BB3_3: ; %.continue
; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_branch BB3_5
; SI-NEXT:  BB3_4:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  BB3_5:
;
; GFX9-LABEL: wqm_demote_1:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-NEXT:    s_wqm_b64 exec, exec
; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-NEXT:    s_cbranch_execz BB3_3
; GFX9-NEXT:  ; %bb.1: ; %.demote
; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; GFX9-NEXT:    s_cbranch_scc0 BB3_4
; GFX9-NEXT:  ; %bb.2: ; %.demote
; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
; GFX9-NEXT:  BB3_3: ; %.continue
; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_branch BB3_5
; GFX9-NEXT:  BB3_4:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  BB3_5:
;
; GFX10-32-LABEL: wqm_demote_1:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
; GFX10-32-NEXT:    s_cbranch_execz BB3_3
; GFX10-32-NEXT:  ; %bb.1: ; %.demote
; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
; GFX10-32-NEXT:  ; %bb.2: ; %.demote
; GFX10-32-NEXT:    s_wqm_b32 s28, s12
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
; GFX10-32-NEXT:  BB3_3: ; %.continue
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    s_branch BB3_5
; GFX10-32-NEXT:  BB3_4:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  BB3_5:
;
; GFX10-64-LABEL: wqm_demote_1:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
; GFX10-64-NEXT:    s_wqm_b64 exec, exec
; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
; GFX10-64-NEXT:    s_cbranch_execz BB3_3
; GFX10-64-NEXT:  ; %bb.1: ; %.demote
; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
; GFX10-64-NEXT:  ; %bb.2: ; %.demote
; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT:  BB3_3: ; %.continue
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    s_branch BB3_5
; GFX10-64-NEXT:  BB3_4:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  BB3_5:
.entry:
  %z.cmp = fcmp olt float %z, 0.0
  br i1 %z.cmp, label %.continue, label %.demote

.demote:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue

.continue:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0, so %coord1 is that element added to itself.
  %tex0 = extractelement <4 x float> %tex, i32 0
  %tex1 = extractelement <4 x float> %tex, i32 0
  %coord1 = fadd float %tex0, %tex1
  ; The second sample's coordinate depends on the first sample's result.
  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

  ret <4 x float> %rtex
}
|
|
|
|
; Pixel shader where the first 1D image sample happens in the entry block and
; its element 0 feeds the branch condition; llvm.amdgcn.wqm.demote(false) is
; called in %.demote (taken when that element is not ordered-less-than 0.0),
; and a second, dependent image sample follows in %.continue.
; The expected code saves exec in s[12:13] (s12 in the wave32 run), switches
; exec with s_wqm at entry, removes the active lanes from the saved mask in
; %.demote, and ANDs the saved mask into exec before the final sample.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; SI-LABEL: wqm_demote_2:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    s_mov_b64 s[12:13], exec
; SI-NEXT:    s_wqm_b64 exec, exec
; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; SI-NEXT:    s_cbranch_execz BB4_3
; SI-NEXT:  ; %bb.1: ; %.demote
; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; SI-NEXT:    s_cbranch_scc0 BB4_4
; SI-NEXT:  ; %bb.2: ; %.demote
; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
; SI-NEXT:  BB4_3: ; %.continue
; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_branch BB4_5
; SI-NEXT:  BB4_4:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  BB4_5:
;
; GFX9-LABEL: wqm_demote_2:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-NEXT:    s_wqm_b64 exec, exec
; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-NEXT:    s_cbranch_execz BB4_3
; GFX9-NEXT:  ; %bb.1: ; %.demote
; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; GFX9-NEXT:    s_cbranch_scc0 BB4_4
; GFX9-NEXT:  ; %bb.2: ; %.demote
; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
; GFX9-NEXT:  BB4_3: ; %.continue
; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_branch BB4_5
; GFX9-NEXT:  BB4_4:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  BB4_5:
;
; GFX10-32-LABEL: wqm_demote_2:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
; GFX10-32-NEXT:    s_cbranch_execz BB4_3
; GFX10-32-NEXT:  ; %bb.1: ; %.demote
; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
; GFX10-32-NEXT:  ; %bb.2: ; %.demote
; GFX10-32-NEXT:    s_wqm_b32 s28, s12
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
; GFX10-32-NEXT:  BB4_3: ; %.continue
; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    s_branch BB4_5
; GFX10-32-NEXT:  BB4_4:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  BB4_5:
;
; GFX10-64-LABEL: wqm_demote_2:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
; GFX10-64-NEXT:    s_wqm_b64 exec, exec
; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
; GFX10-64-NEXT:    s_cbranch_execz BB4_3
; GFX10-64-NEXT:  ; %bb.1: ; %.demote
; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
; GFX10-64-NEXT:  ; %bb.2: ; %.demote
; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
; GFX10-64-NEXT:  BB4_3: ; %.continue
; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    s_branch BB4_5
; GFX10-64-NEXT:  BB4_4:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  BB4_5:
.entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0 of the first sample.
  %tex0 = extractelement <4 x float> %tex, i32 0
  %tex1 = extractelement <4 x float> %tex, i32 0
  ; The branch condition is derived from the sampled value itself.
  %z.cmp = fcmp olt float %tex0, 0.0
  br i1 %z.cmp, label %.continue, label %.demote

.demote:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue

.continue:
  %coord1 = fadd float %tex0, %tex1
  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

  ret <4 x float> %rtex
}
|
|
|
|
; Straight-line pixel shader: a 1D image sample, then
; llvm.amdgcn.wqm.demote with a run-time condition computed from the sampled
; value (element 0 ordered-less-than 0.0), then a second, dependent sample.
; The expected code saves exec in s[12:13] (s12 in the wave32 run), switches
; exec with s_wqm at entry, removes the lanes whose condition is false from
; the saved mask, narrows exec to the s_wqm of that mask, and ANDs the saved
; mask into exec before the final sample.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; SI-LABEL: wqm_demote_dynamic:
; SI:       ; %bb.0: ; %.entry
; SI-NEXT:    s_mov_b64 s[12:13], exec
; SI-NEXT:    s_wqm_b64 exec, exec
; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT:    s_xor_b64 s[14:15], vcc, exec
; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
; SI-NEXT:    s_cbranch_scc0 BB5_2
; SI-NEXT:  ; %bb.1: ; %.entry
; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_branch BB5_3
; SI-NEXT:  BB5_2:
; SI-NEXT:    s_mov_b64 exec, 0
; SI-NEXT:    exp null off, off, off, off done vm
; SI-NEXT:    s_endpgm
; SI-NEXT:  BB5_3:
;
; GFX9-LABEL: wqm_demote_dynamic:
; GFX9:       ; %bb.0: ; %.entry
; GFX9-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-NEXT:    s_wqm_b64 exec, exec
; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT:    s_xor_b64 s[14:15], vcc, exec
; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX9-NEXT:    s_cbranch_scc0 BB5_2
; GFX9-NEXT:  ; %bb.1: ; %.entry
; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_branch BB5_3
; GFX9-NEXT:  BB5_2:
; GFX9-NEXT:    s_mov_b64 exec, 0
; GFX9-NEXT:    exp null off, off, off, off done vm
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  BB5_3:
;
; GFX10-32-LABEL: wqm_demote_dynamic:
; GFX10-32:       ; %bb.0: ; %.entry
; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT:    s_xor_b32 s13, vcc_lo, exec_lo
; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
; GFX10-32-NEXT:    s_cbranch_scc0 BB5_2
; GFX10-32-NEXT:  ; %bb.1: ; %.entry
; GFX10-32-NEXT:    s_wqm_b32 s13, s12
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-32-NEXT:    s_branch BB5_3
; GFX10-32-NEXT:  BB5_2:
; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-32-NEXT:    exp null off, off, off, off done vm
; GFX10-32-NEXT:    s_endpgm
; GFX10-32-NEXT:  BB5_3:
;
; GFX10-64-LABEL: wqm_demote_dynamic:
; GFX10-64:       ; %bb.0: ; %.entry
; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
; GFX10-64-NEXT:    s_wqm_b64 exec, exec
; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT:    s_xor_b64 s[14:15], vcc, exec
; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
; GFX10-64-NEXT:    s_cbranch_scc0 BB5_2
; GFX10-64-NEXT:  ; %bb.1: ; %.entry
; GFX10-64-NEXT:    s_wqm_b64 s[28:29], s[12:13]
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[28:29]
; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
; GFX10-64-NEXT:    s_branch BB5_3
; GFX10-64-NEXT:  BB5_2:
; GFX10-64-NEXT:    s_mov_b64 exec, 0
; GFX10-64-NEXT:    exp null off, off, off, off done vm
; GFX10-64-NEXT:    s_endpgm
; GFX10-64-NEXT:  BB5_3:
.entry:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  ; Both extracts read element 0 of the first sample.
  %tex0 = extractelement <4 x float> %tex, i32 0
  %tex1 = extractelement <4 x float> %tex, i32 0
  %z.cmp = fcmp olt float %tex0, 0.0
  ; No control flow here: the demote condition is passed directly.
  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp)
  %coord1 = fadd float %tex0, %tex1
  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0

  ret <4 x float> %rtex
}
|
|
|
|
|
|
define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
|
|
; SI-LABEL: wqm_deriv:
|
|
; SI: ; %bb.0: ; %.entry
|
|
; SI-NEXT: s_mov_b64 s[0:1], exec
|
|
; SI-NEXT: s_wqm_b64 exec, exec
|
|
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
|
|
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
|
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
|
|
; SI-NEXT: s_cbranch_execz BB6_3
|
|
; SI-NEXT: ; %bb.1: ; %.demote0
|
|
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
|
; SI-NEXT: s_cbranch_scc0 BB6_7
|
|
; SI-NEXT: ; %bb.2: ; %.demote0
|
|
; SI-NEXT: s_wqm_b64 s[4:5], s[0:1]
|
|
; SI-NEXT: s_and_b64 exec, exec, s[4:5]
|
|
; SI-NEXT: BB6_3: ; %.continue0
|
|
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; SI-NEXT: s_mov_b64 s[2:3], s[0:1]
|
|
; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
|
|
; SI-NEXT: v_mov_b32_e32 v1, v0
|
|
; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1
|
|
; SI-NEXT: s_nop 0
|
|
; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
|
; SI-NEXT: s_nop 1
|
|
; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
|
; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
|
|
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
|
|
; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
|
|
; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc
|
|
; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
|
|
; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
|
|
; SI-NEXT: s_cbranch_execz BB6_6
|
|
; SI-NEXT: ; %bb.4: ; %.demote1
|
|
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
|
; SI-NEXT: s_cbranch_scc0 BB6_7
|
|
; SI-NEXT: ; %bb.5: ; %.demote1
|
|
; SI-NEXT: s_mov_b64 exec, 0
|
|
; SI-NEXT: BB6_6: ; %.continue1
|
|
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; SI-NEXT: v_bfrev_b32_e32 v0, 60
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x3c00
|
|
; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm
|
|
; SI-NEXT: s_endpgm
|
|
; SI-NEXT: BB6_7:
|
|
; SI-NEXT: s_mov_b64 exec, 0
|
|
; SI-NEXT: exp null off, off, off, off done vm
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: wqm_deriv:
|
|
; GFX9: ; %bb.0: ; %.entry
|
|
; GFX9-NEXT: s_mov_b64 s[0:1], exec
|
|
; GFX9-NEXT: s_wqm_b64 exec, exec
|
|
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
|
|
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
|
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
|
|
; GFX9-NEXT: s_cbranch_execz BB6_3
|
|
; GFX9-NEXT: ; %bb.1: ; %.demote0
|
|
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
|
; GFX9-NEXT: s_cbranch_scc0 BB6_7
|
|
; GFX9-NEXT: ; %bb.2: ; %.demote0
|
|
; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1]
|
|
; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: BB6_3: ; %.continue0
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1]
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
|
; GFX9-NEXT: s_nop 1
|
|
; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
|
; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
|
|
; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
|
|
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
|
|
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc
|
|
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
|
|
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execz BB6_6
|
|
; GFX9-NEXT: ; %bb.4: ; %.demote1
|
|
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
|
; GFX9-NEXT: s_cbranch_scc0 BB6_7
|
|
; GFX9-NEXT: ; %bb.5: ; %.demote1
|
|
; GFX9-NEXT: s_mov_b64 exec, 0
|
|
; GFX9-NEXT: BB6_6: ; %.continue1
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
|
|
; GFX9-NEXT: v_bfrev_b32_e32 v1, 60
|
|
; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
|
|
; GFX9-NEXT: s_endpgm
|
|
; GFX9-NEXT: BB6_7:
|
|
; GFX9-NEXT: s_mov_b64 exec, 0
|
|
; GFX9-NEXT: exp null off, off, off, off done vm
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; GFX10-32-LABEL: wqm_deriv:
|
|
; GFX10-32: ; %bb.0: ; %.entry
|
|
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
|
|
; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
|
|
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
|
|
; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
|
; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo
|
|
; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1
|
|
; GFX10-32-NEXT: s_cbranch_execz BB6_3
|
|
; GFX10-32-NEXT: ; %bb.1: ; %.demote0
|
|
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
|
|
; GFX10-32-NEXT: s_cbranch_scc0 BB6_7
|
|
; GFX10-32-NEXT: ; %bb.2: ; %.demote0
|
|
; GFX10-32-NEXT: s_wqm_b32 s2, s0
|
|
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2
|
|
; GFX10-32-NEXT: BB6_3: ; %.continue0
|
|
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
|
|
; GFX10-32-NEXT: s_mov_b32 s1, s0
|
|
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1
|
|
; GFX10-32-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
|
; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
|
; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
|
|
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
|
|
; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0
|
|
; GFX10-32-NEXT: s_xor_b32 s1, s0, -1
|
|
; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo
|
|
; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1
|
|
; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2
|
|
; GFX10-32-NEXT: s_cbranch_execz BB6_6
|
|
; GFX10-32-NEXT: ; %bb.4: ; %.demote1
|
|
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
|
|
; GFX10-32-NEXT: s_cbranch_scc0 BB6_7
|
|
; GFX10-32-NEXT: ; %bb.5: ; %.demote1
|
|
; GFX10-32-NEXT: s_mov_b32 exec_lo, 0
|
|
; GFX10-32-NEXT: BB6_6: ; %.continue1
|
|
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
|
|
; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
|
|
; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
|
|
; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
|
|
; GFX10-32-NEXT: s_endpgm
|
|
; GFX10-32-NEXT: BB6_7:
|
|
; GFX10-32-NEXT: s_mov_b32 exec_lo, 0
|
|
; GFX10-32-NEXT: exp null off, off, off, off done vm
|
|
; GFX10-32-NEXT: s_endpgm
|
|
;
|
|
; GFX10-64-LABEL: wqm_deriv:
|
|
; GFX10-64: ; %bb.0: ; %.entry
|
|
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
|
|
; GFX10-64-NEXT: s_wqm_b64 exec, exec
|
|
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
|
|
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
|
; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
|
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
|
|
; GFX10-64-NEXT: s_cbranch_execz BB6_3
|
|
; GFX10-64-NEXT: ; %bb.1: ; %.demote0
|
|
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
|
; GFX10-64-NEXT: s_cbranch_scc0 BB6_7
|
|
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
|
|
; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1]
|
|
; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5]
|
|
; GFX10-64-NEXT: BB6_3: ; %.continue0
|
|
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1]
|
|
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
|
|
; GFX10-64-NEXT: v_mov_b32_e32 v1, v0
|
|
; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
|
; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
|
|
; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
|
|
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
|
|
; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
|
|
; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1
|
|
; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc
|
|
; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
|
|
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
|
|
; GFX10-64-NEXT: s_cbranch_execz BB6_6
|
|
; GFX10-64-NEXT: ; %bb.4: ; %.demote1
|
|
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
|
; GFX10-64-NEXT: s_cbranch_scc0 BB6_7
|
|
; GFX10-64-NEXT: ; %bb.5: ; %.demote1
|
|
; GFX10-64-NEXT: s_mov_b64 exec, 0
|
|
; GFX10-64-NEXT: BB6_6: ; %.continue1
|
|
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
|
|
; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
|
|
; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
|
|
; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
|
|
; GFX10-64-NEXT: s_endpgm
|
|
; GFX10-64-NEXT: BB6_7:
|
|
; GFX10-64-NEXT: s_mov_b64 exec, 0
|
|
; GFX10-64-NEXT: exp null off, off, off, off done vm
|
|
; GFX10-64-NEXT: s_endpgm
|
|
.entry:
|
|
%p0 = extractelement <2 x float> %input, i32 0
|
|
%p1 = extractelement <2 x float> %input, i32 1
|
|
%x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
|
|
%x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
|
|
%argi = fptosi float %arg to i32
|
|
%cond0 = icmp eq i32 %argi, 0
|
|
br i1 %cond0, label %.continue0, label %.demote0
|
|
|
|
.demote0:
|
|
call void @llvm.amdgcn.wqm.demote(i1 false)
|
|
br label %.continue0
|
|
|
|
.continue0:
|
|
%live = call i1 @llvm.amdgcn.live.mask()
|
|
%live.cond = select i1 %live, i32 0, i32 1065353216
|
|
%live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
|
|
%live.v0f = bitcast i32 %live.v0 to float
|
|
%live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
|
|
%live.v1f = bitcast i32 %live.v1 to float
|
|
%v0 = fsub float %live.v0f, %live.v1f
|
|
%v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
|
|
%cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
|
|
%cond2 = and i1 %live, %cond1
|
|
br i1 %cond2, label %.continue1, label %.demote1
|
|
|
|
.demote1:
|
|
call void @llvm.amdgcn.wqm.demote(i1 false)
|
|
br label %.continue1
|
|
|
|
.continue1:
|
|
call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
|
|
ret void
|
|
}
|
|
|
|
; wqm_deriv_loop: loop variant of the demote + quad-derivative test.
;
; Inputs: %arg is converted to i32 and, when non-zero, triggers one
; llvm.amdgcn.wqm.demote(false) before the loop. %limit bounds the loop: the
; counter starts at 0 and the loop repeats while %count + 1 is signed-less-than
; %limit, so the body always runs at least once. %input and %index only feed
; the interp.p1/p2 calls, whose results are never used.
;
; Each iteration reads llvm.amdgcn.live.mask, selects 0 for live lanes and the
; loop counter otherwise, takes two mov.dpp views of that value (dpp_ctrl 85
; and 0, printed as quad_perm:[1,1,1,1] and quad_perm:[0,0,0,0] in the checked
; output), subtracts them as floats, passes the difference through
; llvm.amdgcn.wqm.f32 and demotes again unless the lane is live and the
; difference equals 0.0. After the loop a compressed export is issued.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them.
define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
; SI-LABEL: wqm_deriv_loop:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: s_wqm_b64 exec, exec
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz BB7_3
; SI-NEXT: ; %bb.1: ; %.demote0
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 BB7_9
; SI-NEXT: ; %bb.2: ; %.demote0
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
; SI-NEXT: BB7_3: ; %.continue0.preheader
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch BB7_5
; SI-NEXT: BB7_4: ; %.continue1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[6:7]
; SI-NEXT: s_add_i32 s2, s2, 1
; SI-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz BB7_8
; SI-NEXT: BB7_5: ; %.continue0
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mov_b64 s[6:7], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7]
; SI-NEXT: v_mov_b32_e32 v2, v0
; SI-NEXT: s_xor_b64 s[6:7], s[0:1], -1
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: s_nop 1
; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; SI-NEXT: s_or_b64 s[6:7], s[6:7], vcc
; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
; SI-NEXT: s_cbranch_execz BB7_4
; SI-NEXT: ; %bb.6: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 BB7_9
; SI-NEXT: ; %bb.7: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
; SI-NEXT: s_branch BB7_4
; SI-NEXT: BB7_8: ; %.return
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
; SI-NEXT: v_bfrev_b32_e32 v0, 60
; SI-NEXT: v_mov_b32_e32 v1, 0x3c00
; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm
; SI-NEXT: s_endpgm
; SI-NEXT: BB7_9:
; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX9-LABEL: wqm_deriv_loop:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz BB7_3
; GFX9-NEXT: ; %bb.1: ; %.demote0
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 BB7_9
; GFX9-NEXT: ; %bb.2: ; %.demote0
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX9-NEXT: BB7_3: ; %.continue0.preheader
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch BB7_5
; GFX9-NEXT: BB7_4: ; %.continue1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_add_i32 s2, s2, 1
; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz BB7_8
; GFX9-NEXT: BB7_5: ; %.continue0
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
; GFX9-NEXT: s_cbranch_execz BB7_4
; GFX9-NEXT: ; %bb.6: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 BB7_9
; GFX9-NEXT: ; %bb.7: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_branch BB7_4
; GFX9-NEXT: BB7_8: ; %.return
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT: v_bfrev_b32_e32 v1, 60
; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: BB7_9:
; GFX9-NEXT: s_mov_b64 exec, 0
; GFX9-NEXT: exp null off, off, off, off done vm
; GFX9-NEXT: s_endpgm
;
; GFX10-32-LABEL: wqm_deriv_loop:
; GFX10-32: ; %bb.0: ; %.entry
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT: s_mov_b32 s1, 0
; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz BB7_3
; GFX10-32-NEXT: ; %bb.1: ; %.demote0
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB7_9
; GFX10-32-NEXT: ; %bb.2: ; %.demote0
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: BB7_3: ; %.continue0.preheader
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: s_mov_b32 s2, 0
; GFX10-32-NEXT: s_branch BB7_5
; GFX10-32-NEXT: BB7_4: ; %.continue1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: s_add_i32 s2, s2, 1
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1
; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_cbranch_execz BB7_8
; GFX10-32-NEXT: BB7_5: ; %.continue0
; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-32-NEXT: s_mov_b32 s3, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3
; GFX10-32-NEXT: s_xor_b32 s3, s0, -1
; GFX10-32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo
; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3
; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4
; GFX10-32-NEXT: s_cbranch_execz BB7_4
; GFX10-32-NEXT: ; %bb.6: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB7_9
; GFX10-32-NEXT: ; %bb.7: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_wqm_b32 s4, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
; GFX10-32-NEXT: s_branch BB7_4
; GFX10-32-NEXT: BB7_8: ; %.return
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-32-NEXT: s_endpgm
; GFX10-32-NEXT: BB7_9:
; GFX10-32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-32-NEXT: exp null off, off, off, off done vm
; GFX10-32-NEXT: s_endpgm
;
; GFX10-64-LABEL: wqm_deriv_loop:
; GFX10-64: ; %bb.0: ; %.entry
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT: s_mov_b32 s2, 0
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX10-64-NEXT: s_cbranch_execz BB7_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote0
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 BB7_9
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: s_mov_b64 s[4:5], 0
; GFX10-64-NEXT: s_branch BB7_5
; GFX10-64-NEXT: BB7_4: ; %.continue1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: s_add_i32 s2, s2, 1
; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: s_cbranch_execz BB7_8
; GFX10-64-NEXT: BB7_5: ; %.continue0
; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s2, 0, s[6:7]
; GFX10-64-NEXT: s_xor_b64 s[6:7], s[0:1], -1
; GFX10-64-NEXT: v_mov_b32_e32 v2, v0
; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_or_b64 s[6:7], s[6:7], vcc
; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
; GFX10-64-NEXT: s_cbranch_execz BB7_4
; GFX10-64-NEXT: ; %bb.6: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 BB7_9
; GFX10-64-NEXT: ; %bb.7: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX10-64-NEXT: s_branch BB7_4
; GFX10-64-NEXT: BB7_8: ; %.return
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-64-NEXT: s_endpgm
; GFX10-64-NEXT: BB7_9:
; GFX10-64-NEXT: s_mov_b64 exec, 0
; GFX10-64-NEXT: exp null off, off, off, off done vm
; GFX10-64-NEXT: s_endpgm
.entry:
  ; The interpolation results %x0/%x1 are never used below.
  %p0 = extractelement <2 x float> %input, i32 0
  %p1 = extractelement <2 x float> %input, i32 1
  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
  ; Lanes whose converted %arg is non-zero take the first demote.
  %argi = fptosi float %arg to i32
  %cond0 = icmp eq i32 %argi, 0
  br i1 %cond0, label %.continue0, label %.demote0

.demote0:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  ; Loop header: %count is 0 on entry from either predecessor outside the
  ; loop and %next on the back edge from .continue1.
  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
  %live = call i1 @llvm.amdgcn.live.mask()
  ; Live lanes contribute 0, all other lanes contribute the loop counter.
  %live.cond = select i1 %live, i32 0, i32 %count
  ; dpp_ctrl 85 (0b01010101) and 0 show up as quad_perm:[1,1,1,1] and
  ; quad_perm:[0,0,0,0] in the checked assembly.
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  ; Skip the second demote only when the lane is live and the difference is 0.0.
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  ; Loop latch: continue while the incremented counter is signed-less-than %limit.
  %next = add i32 %count, 1
  %loop.cond = icmp slt i32 %next, %limit
  br i1 %loop.cond, label %.continue0, label %.return

.return:
  ; Compressed export; the checked output prints it as
  ; "exp mrt0 ... done compr vm".
  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
  ret void
}
|
|
|
|
; Intrinsic declarations and attribute groups used by the test functions in
; this file.

; Called with `i1 false` from the .demote* blocks of the tests.
declare void @llvm.amdgcn.wqm.demote(i1) #0
; Produces the i1 that the tests name %live.
declare i1 @llvm.amdgcn.live.mask() #0
; Not called by @wqm_deriv or @wqm_deriv_loop; presumably used by tests
; earlier in the file (TODO confirm).
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
; Not called by @wqm_deriv or @wqm_deriv_loop; presumably used by tests
; earlier in the file (TODO confirm).
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
; Wraps the float difference computed from the two mov.dpp results.
declare float @llvm.amdgcn.wqm.f32(float) #1
; interp.p1 feeds interp.p2; the wqm_deriv tests leave the p2 result unused.
declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
; Final export of the wqm_deriv tests; printed as "exp mrt0 ... done compr vm"
; in the checked output.
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
; The tests call this with dpp_ctrl 85 and 0, row_mask 15, bank_mask 15 and
; bound_ctrl true.
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4

; #0: demote, live.mask and exp.f32.
attributes #0 = { nounwind }
; #1: image.sample and wqm.f32.
attributes #1 = { nounwind readnone }
; #2: interp.p1 / interp.p2.
attributes #2 = { nounwind readnone speculatable }
; #3: exp.compr.v2f16.
attributes #3 = { inaccessiblememonly nounwind }
; #4: mov.dpp; marked convergent.
attributes #4 = { convergent nounwind readnone }
|