Reimplement CTPOP legalization with the "best" algorithm from

http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel

In a silly microbenchmark on a 65 nm Core 2 this is 1.5x faster than the old
code in 32-bit mode and about 2x faster in 64-bit mode. It's also a lot shorter,
especially when counting a 64-bit population on a 32-bit target.

I hope this is fast enough to replace Kernighan-style counting loops even when
the input is rather sparse.

llvm-svn: 123547
This commit is contained in:
Benjamin Kramer 2011-01-15 20:30:30 +00:00
parent b587180fa7
commit fff2517edc
1 changed files with 45 additions and 18 deletions

View File

@ -2388,6 +2388,17 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, DebugLoc dl) {
} }
} }
/// SplatByte - Distribute ByteVal over NumBits bits.
///
/// Builds a NumBits-wide APInt whose low byte is ByteVal and then replicates
/// that pattern upwards by doubling: each step ORs in a copy of the current
/// value shifted left by the number of bits already filled (8, 16, 32, ...).
///
/// \param NumBits  Bit width of the resulting APInt.
/// \param ByteVal  The 8-bit pattern to repeat.
/// \returns The splatted constant.
static APInt SplatByte(unsigned NumBits, uint8_t ByteVal) {
  APInt Val = APInt(NumBits, ByteVal);
  // Keep doubling until the filled region reaches NumBits. Bounding the loop
  // by the filled width (rather than by repeatedly halving NumBits) ensures
  // that widths which are not a power of two, e.g. 17 or 33, are covered
  // completely instead of stopping one doubling short. It also keeps every
  // shift amount strictly below the value's bit width.
  for (unsigned Shift = 8; Shift < NumBits; Shift <<= 1)
    Val = (Val << Shift) | Val;
  return Val;
}
/// ExpandBitCount - Expand the specified bitcount instruction into operations. /// ExpandBitCount - Expand the specified bitcount instruction into operations.
/// ///
SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
@ -2395,26 +2406,42 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
switch (Opc) { switch (Opc) {
default: assert(0 && "Cannot expand this yet!"); default: assert(0 && "Cannot expand this yet!");
case ISD::CTPOP: { case ISD::CTPOP: {
static const uint64_t mask[6] = {
0x5555555555555555ULL, 0x3333333333333333ULL,
0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
};
EVT VT = Op.getValueType(); EVT VT = Op.getValueType();
EVT ShVT = TLI.getShiftAmountTy(); EVT ShVT = TLI.getShiftAmountTy();
unsigned len = VT.getSizeInBits(); unsigned Len = VT.getSizeInBits();
for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
//x = (x & mask[i][len/8]) + (x >> (1 << i) & mask[i][len/8]) // This is the "best" algorithm from
unsigned EltSize = VT.isVector() ? // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
VT.getVectorElementType().getSizeInBits() : len;
SDValue Tmp2 = DAG.getConstant(APInt(EltSize, mask[i]), VT); SDValue Mask55 = DAG.getConstant(SplatByte(Len, 0x55), VT);
SDValue Tmp3 = DAG.getConstant(1ULL << i, ShVT); SDValue Mask33 = DAG.getConstant(SplatByte(Len, 0x33), VT);
Op = DAG.getNode(ISD::ADD, dl, VT, SDValue Mask0F = DAG.getConstant(SplatByte(Len, 0x0F), VT);
DAG.getNode(ISD::AND, dl, VT, Op, Tmp2), SDValue Mask01 = DAG.getConstant(SplatByte(Len, 0x01), VT);
DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3), // v = v - ((v >> 1) & 0x55555555...)
Tmp2)); Op = DAG.getNode(ISD::SUB, dl, VT, Op,
} DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRL, dl, VT, Op,
DAG.getConstant(1, ShVT)),
Mask55));
// v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
Op = DAG.getNode(ISD::ADD, dl, VT,
DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRL, dl, VT, Op,
DAG.getConstant(2, ShVT)),
Mask33));
// v = (v + (v >> 4)) & 0x0F0F0F0F...
Op = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::ADD, dl, VT, Op,
DAG.getNode(ISD::SRL, dl, VT, Op,
DAG.getConstant(4, ShVT))),
Mask0F);
// v = (v * 0x01010101...) >> (Len - 8)
Op = DAG.getNode(ISD::SRL, dl, VT,
DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
DAG.getConstant(Len - 8, ShVT));
return Op; return Op;
} }
case ISD::CTLZ: { case ISD::CTLZ: {