[AMDGPU][SDag] Better lowering for 32-bit ctlz/cttz

Differential Revision: https://reviews.llvm.org/D107566
This commit is contained in:
Jay Foad 2021-08-05 14:32:25 +01:00
parent e6c364a624
commit 2b63933115
6 changed files with 273 additions and 369 deletions

View File

@ -2386,8 +2386,16 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
if (Src.getValueType() == MVT::i32) {
assert(ZeroUndef);
return DAG.getNode(NewOpc, SL, MVT::i32, Src);
// (ctlz src) -> (umin (ffbh src), 32)
// (cttz src) -> (umin (ffbl src), 32)
// (ctlz_zero_undef src) -> (ffbh src)
// (cttz_zero_undef src) -> (ffbl src)
SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
if (!ZeroUndef) {
const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
}
return NewOpr;
}
SDValue Lo, Hi;

View File

@ -465,11 +465,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (!Subtarget->hasBCNT(64))
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
if (Subtarget->hasFFBH())
if (Subtarget->hasFFBH()) {
setOperationAction(ISD::CTLZ, MVT::i32, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
}
if (Subtarget->hasFFBL())
if (Subtarget->hasFFBL()) {
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
}
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//

View File

@ -22,15 +22,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_ctlz_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_flbit_i32_b32 s5, s4
; SI-NEXT: s_flbit_i32_b32 s2, s2
; SI-NEXT: s_min_u32 s4, s2, 32
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@ -41,9 +40,8 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_flbit_i32_b32 s1, s0
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b32 s0, s1, 32
; VI-NEXT: s_flbit_i32_b32 s0, s0
; VI-NEXT: s_min_u32 s0, s0, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@ -68,8 +66,7 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_flbit_i32_b32 s0, s4
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s0, s0, 32
; GFX10-NEXT: s_min_u32 s0, s0, 32
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
@ -98,17 +95,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -126,9 +122,8 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@ -157,14 +152,13 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32:
@ -203,12 +197,10 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v2, v1
; SI-NEXT: v_ffbh_u32_e32 v3, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v1, 32, v1
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -226,12 +218,10 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
; VI-NEXT: v_ffbh_u32_e32 v3, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v1, 32, v1
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@ -263,17 +253,15 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_ffbh_u32_e32 v3, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_v2i32:
@ -315,18 +303,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v4, v3
; SI-NEXT: v_ffbh_u32_e32 v5, v2
; SI-NEXT: v_ffbh_u32_e32 v6, v1
; SI-NEXT: v_ffbh_u32_e32 v7, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
; SI-NEXT: v_ffbh_u32_e32 v3, v3
; SI-NEXT: v_ffbh_u32_e32 v2, v2
; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v3, 32, v3
; SI-NEXT: v_min_u32_e32 v2, 32, v2
; SI-NEXT: v_min_u32_e32 v1, 32, v1
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -344,18 +328,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v4, v3
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; VI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
; VI-NEXT: v_ffbh_u32_e32 v5, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
; VI-NEXT: v_ffbh_u32_e32 v6, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
; VI-NEXT: v_ffbh_u32_e32 v7, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
; VI-NEXT: v_ffbh_u32_e32 v3, v3
; VI-NEXT: v_ffbh_u32_e32 v2, v2
; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v3, 32, v3
; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_min_u32_e32 v1, 32, v1
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@ -397,18 +377,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: v_ffbh_u32_e32 v6, v2
; GFX10-NEXT: v_ffbh_u32_e32 v7, v1
; GFX10-NEXT: v_ffbh_u32_e32 v8, v0
; GFX10-NEXT: v_cndmask_b32_e32 v3, 32, v5, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v6, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v7, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v8, vcc_lo
; GFX10-NEXT: v_ffbh_u32_e32 v3, v3
; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@ -455,9 +431,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@ -474,9 +449,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
; VI-NEXT: v_add_u16_e32 v0, -8, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@ -520,9 +494,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
; GFX10-NEXT: v_add_nc_u16 v1, v1, -8
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
@ -1152,9 +1125,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -1174,9 +1146,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@ -1211,13 +1182,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@ -1263,9 +1233,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -1285,9 +1254,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@ -1322,13 +1290,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@ -1493,11 +1460,11 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 32, v1, s[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[0:1]
; VI-NEXT: v_min_u32_e32 v1, 32, v1
; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_mov_b32_e32 v0, 0xffff
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@ -1538,9 +1505,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v1, vcc_lo
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;

View File

@ -22,15 +22,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_cttz_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_ff1_i32_b32 s5, s4
; SI-NEXT: s_ff1_i32_b32 s2, s2
; SI-NEXT: s_min_u32 s4, s2, 32
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@ -41,9 +40,8 @@ define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ff1_i32_b32 s1, s0
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b32 s0, s1, 32
; VI-NEXT: s_ff1_i32_b32 s0, s0
; VI-NEXT: s_min_u32 s0, s0, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@ -68,8 +66,7 @@ define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ff1_i32_b32 s0, s4
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s0, s0, 32
; GFX10-NEXT: s_min_u32 s0, s0, 32
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
@ -98,17 +95,16 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -126,9 +122,8 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@ -157,14 +152,13 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32:
@ -203,12 +197,10 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v2, v1
; SI-NEXT: v_ffbl_b32_e32 v3, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
; SI-NEXT: v_ffbl_b32_e32 v1, v1
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v1, 32, v1
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -226,12 +218,10 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v2, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
; VI-NEXT: v_ffbl_b32_e32 v3, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
; VI-NEXT: v_ffbl_b32_e32 v1, v1
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v1, 32, v1
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@ -263,17 +253,15 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v2, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_ffbl_b32_e32 v3, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_v2i32:
@ -315,18 +303,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v4, v3
; SI-NEXT: v_ffbl_b32_e32 v5, v2
; SI-NEXT: v_ffbl_b32_e32 v6, v1
; SI-NEXT: v_ffbl_b32_e32 v7, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
; SI-NEXT: v_ffbl_b32_e32 v3, v3
; SI-NEXT: v_ffbl_b32_e32 v2, v2
; SI-NEXT: v_ffbl_b32_e32 v1, v1
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v3, 32, v3
; SI-NEXT: v_min_u32_e32 v2, 32, v2
; SI-NEXT: v_min_u32_e32 v1, 32, v1
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
@ -344,18 +328,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v4, v3
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; VI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
; VI-NEXT: v_ffbl_b32_e32 v5, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
; VI-NEXT: v_ffbl_b32_e32 v6, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
; VI-NEXT: v_ffbl_b32_e32 v7, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
; VI-NEXT: v_ffbl_b32_e32 v3, v3
; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: v_ffbl_b32_e32 v1, v1
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v3, 32, v3
; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_min_u32_e32 v1, 32, v1
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@ -397,18 +377,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v5, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: v_ffbl_b32_e32 v6, v2
; GFX10-NEXT: v_ffbl_b32_e32 v7, v1
; GFX10-NEXT: v_ffbl_b32_e32 v8, v0
; GFX10-NEXT: v_cndmask_b32_e32 v3, 32, v5, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v6, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v7, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v8, vcc_lo
; GFX10-NEXT: v_ffbl_b32_e32 v3, v3
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@ -1141,9 +1117,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -1163,9 +1138,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@ -1200,13 +1174,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@ -1252,9 +1225,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -1274,9 +1246,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@ -1311,13 +1282,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@ -1482,9 +1452,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
; VI-NEXT: v_ffbl_b32_e32 v3, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
@ -1526,10 +1495,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
; GFX10-NEXT: v_ffbl_b32_e32 v3, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm

View File

@ -782,9 +782,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_ffbl_b32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@ -820,9 +819,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_ffbl_b32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v2, 32, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
@ -1365,9 +1363,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_ffbl_b32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@ -1405,9 +1402,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_ffbl_b32_e32 v1, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s2
@ -1605,9 +1601,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
; VI-NEXT: v_ffbl_b32_e32 v3, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s2

View File

@ -9,21 +9,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_flbit_i32_b32 s8, s3
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s3, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
; GFX6-NEXT: v_lshl_b64 v[0:1], s[2:3], v2
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_flbit_i32_b32 s0, s3
; GFX6-NEXT: s_min_u32 s8, s0, 32
; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
; GFX6-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX6-NEXT: v_or_b32_e32 v0, s1, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_sub_i32 s0, 32, s8
; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
@ -33,8 +31,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_flbit_i32_b32 s4, s3
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
; GFX8-NEXT: s_cselect_b32 s6, s4, 32
; GFX8-NEXT: s_min_u32 s6, s4, 32
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@ -67,8 +64,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_ffbh_u32_e32 v0, v4
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@ -93,8 +89,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v4, v2
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX8-NEXT: v_cndmask_b32_e32 v4, 32, v4, vcc
; GFX8-NEXT: v_min_u32_e32 v4, 32, v4
; GFX8-NEXT: v_lshlrev_b64 v[1:2], v4, v[1:2]
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@ -122,21 +117,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_flbit_i32_b32 s8, s3
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s3, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
; GFX6-NEXT: v_lshl_b64 v[0:1], s[2:3], v2
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: s_flbit_i32_b32 s0, s3
; GFX6-NEXT: s_min_u32 s8, s0, 32
; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
; GFX6-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX6-NEXT: v_or_b32_e32 v0, s1, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX6-NEXT: s_sub_i32 s0, 32, s8
; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
@ -145,8 +138,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_flbit_i32_b32 s4, s3
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
; GFX8-NEXT: s_cselect_b32 s6, s4, 32
; GFX8-NEXT: s_min_u32 s6, s4, 32
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@ -178,8 +170,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_ffbh_u32_e32 v0, v4
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@ -203,8 +194,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v2
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX8-NEXT: v_cndmask_b32_e32 v5, 32, v0, vcc
; GFX8-NEXT: v_min_u32_e32 v5, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[1:2]
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@ -236,56 +226,50 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)*
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_flbit_i32_b32 s8, s7
; GFX6-NEXT: s_flbit_i32_b32 s9, s5
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s7, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s5, 0
; GFX6-NEXT: v_cndmask_b32_e32 v4, 32, v0, vcc
; GFX6-NEXT: v_lshl_b64 v[0:1], s[6:7], v2
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 32, v2
; GFX6-NEXT: v_lshl_b64 v[2:3], s[4:5], v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_or_b32_e32 v1, v3, v2
; GFX6-NEXT: s_min_u32 s8, s8, 32
; GFX6-NEXT: s_min_u32 s9, s9, 32
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; GFX6-NEXT: s_sub_i32 s10, 32, s8
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
; GFX6-NEXT: s_sub_i32 s11, 32, s9
; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s6, 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s4, 0
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
; GFX6-NEXT: v_or_b32_e32 v0, s7, v0
; GFX6-NEXT: v_or_b32_e32 v1, s5, v1
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v1
; GFX6-NEXT: v_ldexp_f32_e32 v1, v0, v5
; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v4
; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s10
; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s11
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_flbit_i32_b32 s2, s7
; GFX8-NEXT: s_cmp_lg_u32 s7, 0
; GFX8-NEXT: s_cselect_b32 s9, s2, 32
; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
; GFX8-NEXT: s_sub_i32 s9, 32, s9
; GFX8-NEXT: s_flbit_i32_b32 s6, s3
; GFX8-NEXT: s_min_u32 s8, s6, 32
; GFX8-NEXT: s_flbit_i32_b32 s7, s1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
; GFX8-NEXT: s_min_u32 s9, s7, 32
; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0
; GFX8-NEXT: s_flbit_i32_b32 s8, s5
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX8-NEXT: s_cselect_b32 s6, s8, 32
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v1
; GFX8-NEXT: s_sub_i32 s2, 32, s6
; GFX8-NEXT: v_ldexp_f32 v1, v0, s9
; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_sub_i32 s0, 32, s8
; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
; GFX8-NEXT: s_sub_i32 s0, 32, s9
; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
%result = uitofp <2 x i64> %in to <2 x float>
@ -314,14 +298,10 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_ffbh_u32_e32 v12, v8
; GFX6-NEXT: v_ffbh_u32_e32 v13, v6
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX6-NEXT: v_cndmask_b32_e32 v9, 32, v9, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX6-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GFX6-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_min_u32_e32 v9, 32, v9
; GFX6-NEXT: v_min_u32_e32 v12, 32, v12
; GFX6-NEXT: v_min_u32_e32 v13, 32, v13
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
@ -374,16 +354,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
; GFX8-NEXT: v_ffbh_u32_e32 v12, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v8
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX8-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX8-NEXT: v_ffbh_u32_e32 v11, v6
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GFX8-NEXT: v_cndmask_b32_e32 v11, 32, v11, vcc
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX8-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
; GFX8-NEXT: v_ffbh_u32_e32 v13, v2
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX8-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
; GFX8-NEXT: v_min_u32_e32 v0, 32, v0
; GFX8-NEXT: v_min_u32_e32 v11, 32, v11
; GFX8-NEXT: v_min_u32_e32 v12, 32, v12
; GFX8-NEXT: v_min_u32_e32 v13, 32, v13
; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8]
; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6]
@ -433,26 +409,22 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_flbit_i32_b32 s8, s7
; GFX6-NEXT: s_flbit_i32_b32 s9, s5
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s7, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s5, 0
; GFX6-NEXT: v_cndmask_b32_e32 v4, 32, v0, vcc
; GFX6-NEXT: v_lshl_b64 v[0:1], s[6:7], v2
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 32, v2
; GFX6-NEXT: v_lshl_b64 v[2:3], s[4:5], v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_or_b32_e32 v1, v3, v2
; GFX6-NEXT: s_min_u32 s8, s8, 32
; GFX6-NEXT: s_min_u32 s9, s9, 32
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; GFX6-NEXT: s_sub_i32 s10, 32, s8
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
; GFX6-NEXT: s_sub_i32 s11, 32, s9
; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s6, 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s4, 0
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
; GFX6-NEXT: v_or_b32_e32 v0, s7, v0
; GFX6-NEXT: v_or_b32_e32 v1, s5, v1
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v5
; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v4
; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s10
; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s11
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
@ -466,24 +438,22 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_flbit_i32_b32 s2, s7
; GFX8-NEXT: s_cmp_lg_u32 s7, 0
; GFX8-NEXT: s_cselect_b32 s9, s2, 32
; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
; GFX8-NEXT: s_sub_i32 s9, 32, s9
; GFX8-NEXT: s_flbit_i32_b32 s3, s5
; GFX8-NEXT: s_min_u32 s8, s2, 32
; GFX8-NEXT: s_min_u32 s9, s3, 32
; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0
; GFX8-NEXT: s_flbit_i32_b32 s8, s5
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX8-NEXT: s_cselect_b32 s6, s8, 32
; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: s_sub_i32 s2, 32, s6
; GFX8-NEXT: v_ldexp_f32 v0, v0, s9
; GFX8-NEXT: s_sub_i32 s8, 32, s8
; GFX8-NEXT: s_sub_i32 s2, 32, s9
; GFX8-NEXT: v_ldexp_f32 v0, v0, s8
; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
@ -518,14 +488,10 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_ffbh_u32_e32 v12, v8
; GFX6-NEXT: v_ffbh_u32_e32 v13, v6
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX6-NEXT: v_cndmask_b32_e32 v9, 32, v9, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX6-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GFX6-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_min_u32_e32 v9, 32, v9
; GFX6-NEXT: v_min_u32_e32 v12, 32, v12
; GFX6-NEXT: v_min_u32_e32 v13, 32, v13
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
@ -584,16 +550,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
; GFX8-NEXT: v_ffbh_u32_e32 v13, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v8
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX8-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX8-NEXT: v_ffbh_u32_e32 v12, v6
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GFX8-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX8-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
; GFX8-NEXT: v_ffbh_u32_e32 v14, v2
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX8-NEXT: v_cndmask_b32_e32 v14, 32, v14, vcc
; GFX8-NEXT: v_min_u32_e32 v0, 32, v0
; GFX8-NEXT: v_min_u32_e32 v12, 32, v12
; GFX8-NEXT: v_min_u32_e32 v13, 32, v13
; GFX8-NEXT: v_min_u32_e32 v14, 32, v14
; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8]
; GFX8-NEXT: v_sub_u32_e32 v15, vcc, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]