forked from OSchip/llvm-project
parent
f5f9bf415c
commit
f058d67643
|
@ -288,9 +288,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
|
|||
if (!Subtarget->hasFFBL())
|
||||
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
|
||||
|
||||
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
|
||||
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
|
||||
|
||||
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
|
||||
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
|
||||
|
||||
static const MVT::SimpleValueType VectorIntTypes[] = {
|
||||
MVT::v2i32, MVT::v4i32
|
||||
};
|
||||
|
@ -636,6 +638,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
|
|||
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
|
||||
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
|
||||
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
|
||||
case ISD::CTLZ:
|
||||
case ISD::CTLZ_ZERO_UNDEF:
|
||||
return LowerCTLZ(Op, DAG);
|
||||
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
|
||||
}
|
||||
return Op;
|
||||
|
@ -2162,6 +2167,58 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
|
|||
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
|
||||
}
|
||||
|
||||
/// Custom lowering for ISD::CTLZ and ISD::CTLZ_ZERO_UNDEF on i64 sources.
///
/// The 64-bit source is bitcast to v2i32 and split into its two 32-bit
/// elements. The count is then assembled from 32-bit CTLZ_ZERO_UNDEF nodes:
///
///   ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
///
/// For plain CTLZ (zero input is defined) an extra select returns 64 when both
/// halves are zero. The i32 result is zero-extended to i64 before returning.
SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);
  assert(Src.getValueType() == MVT::i64 &&
         "LowerCTLZ only handles i64 sources");

  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  // Element 0 of the v2i32 is used as the low half, element 1 as the high half.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), MVT::i32);

  SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);

  SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
  SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);

  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
  SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);

  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
  SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);

  if (!ZeroUndef) {
    // Test if the full 64-bit input is zero.

    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
    // which we probably don't want.
    SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);

    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
    // with the same cycles, otherwise it is slower.
    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
    // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);

    // Named Bits64 (not Bits32) so it neither shadows the constant 32 above
    // nor misstates the value it holds.
    const SDValue Bits64 = DAG.getConstant(64, SL, MVT::i32);

    // The instruction returns -1 for 0 input, but the defined intrinsic
    // behavior is to return the number of bits.
    NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
                          SrcIsZero, Bits64, NewCtlz);
  }

  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
}
|
||||
|
||||
SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
|
||||
bool Signed) const {
|
||||
SDLoc SL(Op);
|
||||
|
|
|
@ -54,6 +54,8 @@ private:
|
|||
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
|
||||
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
|
|
@ -93,6 +93,16 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}s_ctlz_i64:
|
||||
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
||||
; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
|
||||
; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
|
||||
; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
|
||||
; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
|
||||
; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
|
||||
; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
|
||||
; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
|
||||
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||
; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
|
||||
define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
|
||||
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
|
||||
store i64 %ctlz, i64 addrspace(1)* %out
|
||||
|
@ -108,6 +118,17 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v_ctlz_i64:
|
||||
; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
|
||||
; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
|
||||
; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
|
||||
; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
|
||||
; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
|
||||
; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
|
||||
; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
|
||||
; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc
|
||||
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||
; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
|
||||
define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
|
||||
|
|
|
@ -6,6 +6,12 @@ declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
|
|||
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
|
||||
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
|
||||
|
||||
declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
|
||||
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
|
||||
declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
|
||||
|
||||
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32:
|
||||
; SI: s_load_dword [[VAL:s[0-9]+]],
|
||||
; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
|
||||
|
@ -69,3 +75,59 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x
|
|||
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; i64 ctlz with is_zero_undef = true on a value passed directly as a function
; argument. The checks below expect both dwords to be loaded with
; s_load_dwordx2, an s_flbit_i32_b32 on each half, a v_cndmask_b32 selecting
; between the two counts, and a zero high dword in the 64-bit store.
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
|
||||
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
||||
; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s[[HI]]
|
||||
; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
|
||||
; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
|
||||
; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
|
||||
; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
|
||||
; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
|
||||
; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
|
||||
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||
; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
|
||||
define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
|
||||
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
|
||||
store i64 %ctlz, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; i64 ctlz (is_zero_undef = true) of a function argument whose result is
; truncated to i32 before the store. Only the function label is checked here;
; no instruction patterns are matched.
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
|
||||
define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
|
||||
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
|
||||
%trunc = trunc i64 %ctlz to i32
|
||||
store i32 %trunc, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; i64 ctlz with is_zero_undef = true on a value loaded from %in at the index
; returned by llvm.r600.read.tidig.x. The checks below expect a v_ffbh_u32 on
; each loaded dword, a v_cmp_eq_i32 of the high dword against 0, a
; v_cndmask_b32 select, and a zero high dword in the 64-bit store.
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
|
||||
; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
|
||||
; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
|
||||
; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
|
||||
; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
|
||||
; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
|
||||
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||
; SI: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
|
||||
define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
|
||||
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
|
||||
%val = load i64, i64 addrspace(1)* %in.gep
|
||||
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
|
||||
store i64 %ctlz, i64 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; i64 ctlz (is_zero_undef = true) of a value loaded from %in at the index
; returned by llvm.r600.read.tidig.x, with the result truncated to i32 before
; being stored to %out at the same index. Only the function label is checked
; here; no instruction patterns are matched.
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
|
||||
define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
|
||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%val = load i64, i64 addrspace(1)* %in.gep
|
||||
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
|
||||
%trunc = trunc i64 %ctlz to i32
|
||||
store i32 %trunc, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue