forked from OSchip/llvm-project
[AMDGPU][SDag] Better lowering for 32-bit ctlz/cttz
Differential Revision: https://reviews.llvm.org/D107566
This commit is contained in:
parent
e6c364a624
commit
2b63933115
|
@ -2386,8 +2386,16 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
|
|||
Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
|
||||
|
||||
if (Src.getValueType() == MVT::i32) {
|
||||
assert(ZeroUndef);
|
||||
return DAG.getNode(NewOpc, SL, MVT::i32, Src);
|
||||
// (ctlz src) -> (umin (ffbh src), 32)
|
||||
// (cttz src) -> (umin (ffbl src), 32)
|
||||
// (ctlz_zero_undef src) -> (ffbh src)
|
||||
// (cttz_zero_undef src) -> (ffbl src)
|
||||
SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
|
||||
if (!ZeroUndef) {
|
||||
const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
|
||||
NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
|
||||
}
|
||||
return NewOpr;
|
||||
}
|
||||
|
||||
SDValue Lo, Hi;
|
||||
|
|
|
@ -465,11 +465,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|||
if (!Subtarget->hasBCNT(64))
|
||||
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
|
||||
|
||||
if (Subtarget->hasFFBH())
|
||||
if (Subtarget->hasFFBH()) {
|
||||
setOperationAction(ISD::CTLZ, MVT::i32, Custom);
|
||||
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
|
||||
}
|
||||
|
||||
if (Subtarget->hasFFBL())
|
||||
if (Subtarget->hasFFBL()) {
|
||||
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
|
||||
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
|
||||
}
|
||||
|
||||
// We only really have 32-bit BFE instructions (and 16-bit on VI).
|
||||
//
|
||||
|
|
|
@ -22,15 +22,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|||
define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
|
||||
; SI-LABEL: s_ctlz_i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_flbit_i32_b32 s5, s4
|
||||
; SI-NEXT: s_flbit_i32_b32 s2, s2
|
||||
; SI-NEXT: s_min_u32 s4, s2, 32
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -41,9 +40,8 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
|
|||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_flbit_i32_b32 s1, s0
|
||||
; VI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; VI-NEXT: s_cselect_b32 s0, s1, 32
|
||||
; VI-NEXT: s_flbit_i32_b32 s0, s0
|
||||
; VI-NEXT: s_min_u32 s0, s0, 32
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
|
@ -68,8 +66,7 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
|
|||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_flbit_i32_b32 s0, s4
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s0, s0, 32
|
||||
; GFX10-NEXT: s_min_u32 s0, s0, 32
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
|
@ -98,17 +95,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
|
|||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, 0
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
|
@ -126,9 +122,8 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -157,14 +152,13 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
|
|||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
|
||||
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-GISEL-LABEL: v_ctlz_i32:
|
||||
|
@ -203,12 +197,10 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbh_u32_e32 v2, v1
|
||||
; SI-NEXT: v_ffbh_u32_e32 v3, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
|
||||
; SI-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; SI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
|
@ -226,12 +218,10 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v1
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v3, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -263,17 +253,15 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
|
|||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v3, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo
|
||||
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-GISEL-LABEL: v_ctlz_v2i32:
|
||||
|
@ -315,18 +303,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbh_u32_e32 v4, v3
|
||||
; SI-NEXT: v_ffbh_u32_e32 v5, v2
|
||||
; SI-NEXT: v_ffbh_u32_e32 v6, v1
|
||||
; SI-NEXT: v_ffbh_u32_e32 v7, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
|
||||
; SI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
|
||||
; SI-NEXT: v_ffbh_u32_e32 v3, v3
|
||||
; SI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; SI-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; SI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v3, 32, v3
|
||||
; SI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; SI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
|
@ -344,18 +328,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v4, v3
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v5, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v6, v1
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v7, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v3, v3
|
||||
; VI-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v3, 32, v3
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; VI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -397,18 +377,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
|
|||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v6, v2
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v7, v1
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v8, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, 32, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v6, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v7, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v8, vcc_lo
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v3, v3
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
|
||||
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -455,9 +431,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
|
|||
; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
|
||||
|
@ -474,9 +449,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
|
|||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
|
||||
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
|
||||
; VI-NEXT: v_add_u16_e32 v0, -8, v0
|
||||
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
|
@ -520,9 +494,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
|
|||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
|
||||
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
|
||||
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
|
||||
; GFX10-NEXT: v_add_nc_u16 v1, v1, -8
|
||||
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
|
||||
|
@ -1152,9 +1125,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -1174,9 +1146,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
|
@ -1211,13 +1182,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
|
|||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
|
@ -1263,9 +1233,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -1285,9 +1254,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
|
@ -1322,13 +1290,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
|
@ -1493,11 +1460,11 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbh_u32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 32, v1, s[0:1]
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[0:1]
|
||||
; VI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0xffff
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1538,9 +1505,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v1, vcc_lo
|
||||
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
|
||||
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
|
|
|
@ -22,15 +22,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|||
define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
|
||||
; SI-LABEL: s_cttz_i32:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_ff1_i32_b32 s5, s4
|
||||
; SI-NEXT: s_ff1_i32_b32 s2, s2
|
||||
; SI-NEXT: s_min_u32 s4, s2, 32
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -41,9 +40,8 @@ define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val)
|
|||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_ff1_i32_b32 s1, s0
|
||||
; VI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; VI-NEXT: s_cselect_b32 s0, s1, 32
|
||||
; VI-NEXT: s_ff1_i32_b32 s0, s0
|
||||
; VI-NEXT: s_min_u32 s0, s0, 32
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
|
@ -68,8 +66,7 @@ define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val)
|
|||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_ff1_i32_b32 s0, s4
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s0, s0, 32
|
||||
; GFX10-NEXT: s_min_u32 s0, s0, 32
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
|
@ -98,17 +95,16 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
|
|||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, 0
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
|
@ -126,9 +122,8 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -157,14 +152,13 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
|
|||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
|
||||
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-GISEL-LABEL: v_cttz_i32:
|
||||
|
@ -203,12 +197,10 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbl_b32_e32 v2, v1
|
||||
; SI-NEXT: v_ffbl_b32_e32 v3, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
|
||||
; SI-NEXT: v_ffbl_b32_e32 v1, v1
|
||||
; SI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
|
@ -226,12 +218,10 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v1
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v3, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v1
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -263,17 +253,15 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
|
|||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v2, v1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v3, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo
|
||||
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-GISEL-LABEL: v_cttz_v2i32:
|
||||
|
@ -315,18 +303,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbl_b32_e32 v4, v3
|
||||
; SI-NEXT: v_ffbl_b32_e32 v5, v2
|
||||
; SI-NEXT: v_ffbl_b32_e32 v6, v1
|
||||
; SI-NEXT: v_ffbl_b32_e32 v7, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
|
||||
; SI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; SI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
|
||||
; SI-NEXT: v_ffbl_b32_e32 v3, v3
|
||||
; SI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; SI-NEXT: v_ffbl_b32_e32 v1, v1
|
||||
; SI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v3, 32, v3
|
||||
; SI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; SI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
|
@ -344,18 +328,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v4, v3
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v5, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v6, v1
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v7, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v3, v3
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v1
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v3, 32, v3
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; VI-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -397,18 +377,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
|
|||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v5, v3
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v6, v2
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v7, v1
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v8, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, 32, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v6, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v7, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v8, vcc_lo
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v3, v3
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
|
||||
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -1141,9 +1117,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -1163,9 +1138,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
|
@ -1200,13 +1174,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
|
|||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
|
@ -1252,9 +1225,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
@ -1274,9 +1246,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
|
@ -1311,13 +1282,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
|
@ -1482,9 +1452,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v3, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
||||
|
@ -1526,10 +1495,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v3, v2
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
|
||||
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
|
||||
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
|
|
|
@ -782,9 +782,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
|
|||
; SI-NEXT: v_or_b32_e32 v1, v1, v3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; SI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -820,9 +819,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
|
|||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
|
@ -1365,9 +1363,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; SI-NEXT: v_or_b32_e32 v1, v1, v3
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; SI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; SI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; SI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
|
@ -1405,9 +1402,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v1, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v0, v0
|
||||
; VI-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -1605,9 +1601,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
|
|||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
|
||||
; VI-NEXT: v_ffbl_b32_e32 v3, v2
|
||||
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
|
||||
; VI-NEXT: v_ffbl_b32_e32 v2, v2
|
||||
; VI-NEXT: v_min_u32_e32 v2, 32, v2
|
||||
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
|
|
@ -9,21 +9,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
|
|||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_flbit_i32_b32 s8, s3
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s4, s0
|
||||
; GFX6-NEXT: s_mov_b32 s5, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s3, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
|
||||
; GFX6-NEXT: v_lshl_b64 v[0:1], s[2:3], v2
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: s_flbit_i32_b32 s0, s3
|
||||
; GFX6-NEXT: s_min_u32 s8, s0, 32
|
||||
; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, s1, v0
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
|
||||
; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
|
||||
; GFX6-NEXT: s_sub_i32 s0, 32, s8
|
||||
; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
|
||||
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
|
@ -33,8 +31,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
|
|||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s4, s3
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s6, s4, 32
|
||||
; GFX8-NEXT: s_min_u32 s6, s4, 32
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
|
@ -67,8 +64,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
|
|||
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v4
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
|
@ -93,8 +89,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
|
|||
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_ffbh_u32_e32 v4, v2
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v4, 32, v4, vcc
|
||||
; GFX8-NEXT: v_min_u32_e32 v4, 32, v4
|
||||
; GFX8-NEXT: v_lshlrev_b64 v[1:2], v4, v[1:2]
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
|
||||
|
@ -122,21 +117,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
|
|||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_flbit_i32_b32 s8, s3
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s4, s0
|
||||
; GFX6-NEXT: s_mov_b32 s5, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s3, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
|
||||
; GFX6-NEXT: v_lshl_b64 v[0:1], s[2:3], v2
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: s_flbit_i32_b32 s0, s3
|
||||
; GFX6-NEXT: s_min_u32 s8, s0, 32
|
||||
; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, s1, v0
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
|
||||
; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
|
||||
; GFX6-NEXT: s_sub_i32 s0, 32, s8
|
||||
; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
|
@ -145,8 +138,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
|
|||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s4, s3
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s6, s4, 32
|
||||
; GFX8-NEXT: s_min_u32 s6, s4, 32
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
|
@ -178,8 +170,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
|
|||
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v0, v4
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
|
@ -203,8 +194,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
|
|||
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_ffbh_u32_e32 v0, v2
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v5, 32, v0, vcc
|
||||
; GFX8-NEXT: v_min_u32_e32 v5, 32, v0
|
||||
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[1:2]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
|
@ -236,56 +226,50 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)*
|
|||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_flbit_i32_b32 s8, s7
|
||||
; GFX6-NEXT: s_flbit_i32_b32 s9, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s7, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s5, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v4, 32, v0, vcc
|
||||
; GFX6-NEXT: v_lshl_b64 v[0:1], s[6:7], v2
|
||||
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 32, v2
|
||||
; GFX6-NEXT: v_lshl_b64 v[2:3], s[4:5], v4
|
||||
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: v_or_b32_e32 v1, v3, v2
|
||||
; GFX6-NEXT: s_min_u32 s8, s8, 32
|
||||
; GFX6-NEXT: s_min_u32 s9, s9, 32
|
||||
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
|
||||
; GFX6-NEXT: s_sub_i32 s10, 32, s8
|
||||
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
|
||||
; GFX6-NEXT: s_sub_i32 s11, 32, s9
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s6, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s4, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, s7, v0
|
||||
; GFX6-NEXT: v_or_b32_e32 v1, s5, v1
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v1
|
||||
; GFX6-NEXT: v_ldexp_f32_e32 v1, v0, v5
|
||||
; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v4
|
||||
; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s10
|
||||
; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s11
|
||||
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s2, s7
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s9, s2, 32
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
|
||||
; GFX8-NEXT: s_sub_i32 s9, 32, s9
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s6, s3
|
||||
; GFX8-NEXT: s_min_u32 s8, s6, 32
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s7, s1
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
|
||||
; GFX8-NEXT: s_min_u32 s9, s7, 32
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s8, s5
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
|
||||
; GFX8-NEXT: s_cselect_b32 s6, s8, 32
|
||||
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v1
|
||||
; GFX8-NEXT: s_sub_i32 s2, 32, s6
|
||||
; GFX8-NEXT: v_ldexp_f32 v1, v0, s9
|
||||
; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: s_sub_i32 s0, 32, s8
|
||||
; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
|
||||
; GFX8-NEXT: s_sub_i32 s0, 32, s9
|
||||
; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
%result = uitofp <2 x i64> %in to <2 x float>
|
||||
|
@ -314,14 +298,10 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
|
|||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v12, v8
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v13, v6
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v9, 32, v9, vcc
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v9, 32, v9
|
||||
; GFX6-NEXT: v_min_u32_e32 v12, 32, v12
|
||||
; GFX6-NEXT: v_min_u32_e32 v13, 32, v13
|
||||
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
|
||||
; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
|
||||
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
|
||||
|
@ -374,16 +354,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
|
|||
; GFX8-NEXT: v_ffbh_u32_e32 v12, v4
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_ffbh_u32_e32 v0, v8
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
|
||||
; GFX8-NEXT: v_ffbh_u32_e32 v11, v6
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v11, 32, v11, vcc
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
|
||||
; GFX8-NEXT: v_ffbh_u32_e32 v13, v2
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
|
||||
; GFX8-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX8-NEXT: v_min_u32_e32 v11, 32, v11
|
||||
; GFX8-NEXT: v_min_u32_e32 v12, 32, v12
|
||||
; GFX8-NEXT: v_min_u32_e32 v13, 32, v13
|
||||
; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8]
|
||||
; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
|
||||
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6]
|
||||
|
@ -433,26 +409,22 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
|
|||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_flbit_i32_b32 s8, s7
|
||||
; GFX6-NEXT: s_flbit_i32_b32 s9, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s7, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s5, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v4, 32, v0, vcc
|
||||
; GFX6-NEXT: v_lshl_b64 v[0:1], s[6:7], v2
|
||||
; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 32, v2
|
||||
; GFX6-NEXT: v_lshl_b64 v[2:3], s[4:5], v4
|
||||
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: v_or_b32_e32 v1, v3, v2
|
||||
; GFX6-NEXT: s_min_u32 s8, s8, 32
|
||||
; GFX6-NEXT: s_min_u32 s9, s9, 32
|
||||
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
|
||||
; GFX6-NEXT: s_sub_i32 s10, 32, s8
|
||||
; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
|
||||
; GFX6-NEXT: s_sub_i32 s11, 32, s9
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s6, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s4, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
|
||||
; GFX6-NEXT: v_or_b32_e32 v0, s7, v0
|
||||
; GFX6-NEXT: v_or_b32_e32 v1, s5, v1
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
|
||||
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, v1
|
||||
; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v5
|
||||
; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v4
|
||||
; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s10
|
||||
; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s11
|
||||
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
|
@ -466,24 +438,22 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
|
|||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s2, s7
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s9, s2, 32
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
|
||||
; GFX8-NEXT: s_sub_i32 s9, 32, s9
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s3, s5
|
||||
; GFX8-NEXT: s_min_u32 s8, s2, 32
|
||||
; GFX8-NEXT: s_min_u32 s9, s3, 32
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0
|
||||
; GFX8-NEXT: s_flbit_i32_b32 s8, s5
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
|
||||
; GFX8-NEXT: s_cselect_b32 s6, s8, 32
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
|
||||
; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
|
||||
; GFX8-NEXT: s_sub_i32 s2, 32, s6
|
||||
; GFX8-NEXT: v_ldexp_f32 v0, v0, s9
|
||||
; GFX8-NEXT: s_sub_i32 s8, 32, s8
|
||||
; GFX8-NEXT: s_sub_i32 s2, 32, s9
|
||||
; GFX8-NEXT: v_ldexp_f32 v0, v0, s8
|
||||
; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
|
||||
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
|
@ -518,14 +488,10 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
|
|||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v12, v8
|
||||
; GFX6-NEXT: v_ffbh_u32_e32 v13, v6
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v9, 32, v9, vcc
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
|
||||
; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v9, 32, v9
|
||||
; GFX6-NEXT: v_min_u32_e32 v12, 32, v12
|
||||
; GFX6-NEXT: v_min_u32_e32 v13, 32, v13
|
||||
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
|
||||
; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
|
||||
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
|
||||
|
@ -584,16 +550,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
|
|||
; GFX8-NEXT: v_ffbh_u32_e32 v13, v4
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_ffbh_u32_e32 v0, v8
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
|
||||
; GFX8-NEXT: v_ffbh_u32_e32 v12, v6
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
|
||||
; GFX8-NEXT: v_ffbh_u32_e32 v14, v2
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v14, 32, v14, vcc
|
||||
; GFX8-NEXT: v_min_u32_e32 v0, 32, v0
|
||||
; GFX8-NEXT: v_min_u32_e32 v12, 32, v12
|
||||
; GFX8-NEXT: v_min_u32_e32 v13, 32, v13
|
||||
; GFX8-NEXT: v_min_u32_e32 v14, 32, v14
|
||||
; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8]
|
||||
; GFX8-NEXT: v_sub_u32_e32 v15, vcc, 32, v0
|
||||
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
|
||||
|
|
Loading…
Reference in New Issue