AMDGPU: Custom lower i64 ctlz

llvm-svn: 257348
This commit is contained in:
Matt Arsenault 2016-01-11 16:50:29 +00:00
parent f5f9bf415c
commit f058d67643
4 changed files with 143 additions and 1 deletions

View File

@ -288,9 +288,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
if (!Subtarget->hasFFBL())
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
@ -636,6 +638,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
return LowerCTLZ(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return Op;
@ -2162,6 +2167,58 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
assert(Src.getValueType() == MVT::i64);
bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
const SDValue One = DAG.getConstant(1, SL, MVT::i32);
SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), MVT::i32);
SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
// ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
if (!ZeroUndef) {
// Test if the full 64-bit input is zero.
// FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
// which we probably don't want.
SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
// TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
// with the same cycles, otherwise it is slower.
// SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
// DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
// The instruction returns -1 for 0 input, but the defined intrinsic
// behavior is to return the number of bits.
NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
SrcIsZero, Bits32, NewCtlz);
}
return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
}
SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
SDLoc SL(Op);

View File

@ -54,6 +54,8 @@ private:
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

View File

@ -93,6 +93,16 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp
}
; FUNC-LABEL: {{^}}s_ctlz_i64:
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
store i64 %ctlz, i64 addrspace(1)* %out
@ -108,6 +118,17 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
}
; FUNC-LABEL: {{^}}v_ctlz_i64:
; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid

View File

@ -6,6 +6,12 @@ declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32:
; SI: s_load_dword [[VAL:s[0-9]+]],
; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
@ -69,3 +75,59 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
ret void
}
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
store i64 %ctlz, i64 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
%trunc = trunc i64 %ctlz to i32
store i32 %trunc, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %in.gep
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
store i64 %ctlz, i64 addrspace(1)* %out.gep
ret void
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%val = load i64, i64 addrspace(1)* %in.gep
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
%trunc = trunc i64 %ctlz to i32
store i32 %trunc, i32 addrspace(1)* %out.gep
ret void
}