LLVM sdisel normalizes bit extractions of the form:

((x & 0xff00) >> 8) << 2
to
 (x >> 6) & 0x3fc

This is general goodness since it folds a left shift into the mask. However,
the trailing zeros in the mask prevent the ARM backend from using the bit
extraction instructions. Worse, materializing the mask may require an
additional instruction. This comes up fairly frequently when the result of
the bit twiddling is used as a memory address, e.g.

 = ptr[(x & 0xFF0000) >> 16]

We want to generate:
  ubfx   r3, r1, #16, #8
  ldr.w  r3, [r0, r3, lsl #2]

vs.
  mov.w  r9, #1020
  and.w  r2, r9, r1, lsr #14
  ldr    r2, [r0, r2]

Add a late ARM-specific isel optimization to
ARMDAGToDAGISel::PreprocessISelDAG(). It folds the left shift into the
'base + offset' address computation, changes the mask to one which doesn't
have trailing zeros, and thereby enables the use of ubfx.

Note the optimization has to be done late since it's target-specific and we
don't want to change the DAG normalization. It's also fairly restrictive,
as shifter operands are not always free: it's only done for left shifts by
1 or 2, which are known to be free on some CPUs and are the most common
shift amounts in address computation.

This is a slight win for blowfish, rijndael, etc.

rdar://12870177

llvm-svn: 170581
This commit is contained in:
Evan Cheng 2012-12-19 20:16:09 +00:00
parent 870f4fe261
commit eae6d2ccea
2 changed files with 132 additions and 2 deletions

View File

@ -78,6 +78,8 @@ public:
return "ARM Instruction Selection";
}
virtual void PreprocessISelDAG();
/// getI32Imm - Return a target constant of type i32 with the specified
/// value.
inline SDValue getI32Imm(unsigned Imm) {
@ -327,6 +329,87 @@ static bool isScaledConstantInRange(SDValue Node, int Scale,
return ScaledConstant >= RangeMin && ScaledConstant < RangeMax;
}
void ARMDAGToDAGISel::PreprocessISelDAG() {
if (!Subtarget->hasV6T2Ops())
return;
bool isThumb2 = Subtarget->isThumb();
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
if (N->getOpcode() != ISD::ADD)
continue;
// Look for (add X1, (and (srl X2, c1), c2)) where c2 is constant with
// leading zeros, followed by consecutive set bits, followed by 1 or 2
// trailing zeros, e.g. 1020.
// Transform the expression to
// (add X1, (shl (and (srl X2, c1), (c2>>tz)), tz)) where tz is the number
// of trailing zeros of c2. The left shift would be folded as an shifter
// operand of 'add' and the 'and' and 'srl' would become a bits extraction
// node (UBFX).
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned And_imm = 0;
if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) {
if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm))
std::swap(N0, N1);
}
if (!And_imm)
continue;
// Check if the AND mask is an immediate of the form: 000.....1111111100
unsigned TZ = CountTrailingZeros_32(And_imm);
if (TZ != 1 && TZ != 2)
// Be conservative here. Shifter operands aren't always free. e.g. On
// Swift, left shifter operand of 1 / 2 for free but others are not.
// e.g.
// ubfx r3, r1, #16, #8
// ldr.w r3, [r0, r3, lsl #2]
// vs.
// mov.w r9, #1020
// and.w r2, r9, r1, lsr #14
// ldr r2, [r0, r2]
continue;
And_imm >>= TZ;
if (And_imm & (And_imm + 1))
continue;
// Look for (and (srl X, c1), c2).
SDValue Srl = N1.getOperand(0);
unsigned Srl_imm = 0;
if (!isOpcWithIntImmediate(Srl.getNode(), ISD::SRL, Srl_imm) ||
(Srl_imm <= 2))
continue;
// Make sure first operand is not a shifter operand which would prevent
// folding of the left shift.
SDValue CPTmp0;
SDValue CPTmp1;
SDValue CPTmp2;
if (isThumb2) {
if (SelectT2ShifterOperandReg(N0, CPTmp0, CPTmp1))
continue;
} else {
if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) ||
SelectRegShifterOperand(N0, CPTmp0, CPTmp1, CPTmp2))
continue;
}
// Now make the transformation.
Srl = CurDAG->getNode(ISD::SRL, Srl.getDebugLoc(), MVT::i32,
Srl.getOperand(0),
CurDAG->getConstant(Srl_imm+TZ, MVT::i32));
N1 = CurDAG->getNode(ISD::AND, N1.getDebugLoc(), MVT::i32,
Srl, CurDAG->getConstant(And_imm, MVT::i32));
N1 = CurDAG->getNode(ISD::SHL, N1.getDebugLoc(), MVT::i32,
N1, CurDAG->getConstant(TZ, MVT::i32));
CurDAG->UpdateNodeOperands(N, N0, N1);
}
}
/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
/// least on current ARM implementations) which should be avoided.
@ -2119,10 +2202,10 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
if (!Subtarget->hasV6T2Ops())
return NULL;
unsigned Opc = isSigned ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
unsigned Opc = isSigned
? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
: (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX);
// For unsigned extracts, check for a shift right and mask
unsigned And_imm = 0;
if (N->getOpcode() == ISD::AND) {
@ -2140,7 +2223,29 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
// Note: The width operand is encoded as width-1.
unsigned Width = CountTrailingOnes_32(And_imm) - 1;
unsigned LSB = Srl_imm;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
if ((LSB + Width + 1) == N->getValueType(0).getSizeInBits()) {
// It's cheaper to use a right shift to extract the top bits.
if (Subtarget->isThumb()) {
Opc = isSigned ? ARM::t2ASRri : ARM::t2LSRri;
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, MVT::i32),
getAL(CurDAG), Reg0, Reg0 };
return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
}
// ARM models shift instructions as MOVsi with shifter operand.
ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(ISD::SRL);
SDValue ShOpc =
CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, LSB),
MVT::i32);
SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
getAL(CurDAG), Reg0, Reg0 };
return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops, 5);
}
SDValue Ops[] = { N->getOperand(0).getOperand(0),
CurDAG->getTargetConstant(LSB, MVT::i32),
CurDAG->getTargetConstant(Width, MVT::i32),

View File

@ -26,3 +26,28 @@ define i32 @ubfx2(i32 %a) {
ret i32 %t2
}
; rdar://12870177
; Sums three i32 entries loaded from %ctx, indexed by bits 8-15, bits 16-23
; and bits 24-31 of %x respectively.
; Expected code per the patterns below: the top-byte index is a plain
; "lsr #24", each of the two middle-byte indices is a single ubfx, and every
; load applies the 4-byte element scaling through an "lsl #2" shifter operand.
define i32 @ubfx_opt(i32* nocapture %ctx, i32 %x) nounwind readonly ssp {
entry:
; CHECK: ubfx_opt
; CHECK: lsr [[REG1:(lr|r[0-9]+)]], r1, #24
; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG1]], lsl #2]
; CHECK: ubfx [[REG2:(lr|r[0-9]+)]], r1, #16, #8
; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG2]], lsl #2]
; CHECK: ubfx [[REG3:(lr|r[0-9]+)]], r1, #8, #8
; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG3]], lsl #2]
; Index taken from bits 8-15 of %x. NOTE: the value names are swapped relative
; to the operations: %and holds the lshr result and %shr the masked value.
%and = lshr i32 %x, 8
%shr = and i32 %and, 255
; Index taken from bits 16-23 of %x (same swapped naming).
%and1 = lshr i32 %x, 16
%shr2 = and i32 %and1, 255
; Index taken from bits 24-31 of %x; the top byte needs no mask.
%shr4 = lshr i32 %x, 24
; The loads are issued top byte first, then bits 16-23, then bits 8-15, which
; is the order the patterns above expect.
%arrayidx = getelementptr inbounds i32* %ctx, i32 %shr4
%0 = load i32* %arrayidx, align 4
%arrayidx5 = getelementptr inbounds i32* %ctx, i32 %shr2
%1 = load i32* %arrayidx5, align 4
%add = add i32 %1, %0
%arrayidx6 = getelementptr inbounds i32* %ctx, i32 %shr
%2 = load i32* %arrayidx6, align 4
%add7 = add i32 %add, %2
ret i32 %add7
}