ARM64: Combine shifts and uses from different basic block to bit-extract instruction

llvm-svn: 206774
This commit is contained in:
Yi Jiang 2014-04-21 19:34:27 +00:00
parent 36f025e697
commit d069f6393a
5 changed files with 321 additions and 3 deletions

View File

@ -182,6 +182,9 @@ public:
return HasMultipleConditionRegisters; return HasMultipleConditionRegisters;
} }
/// Return true if the target has BitExtract instructions.
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
/// Return true if a vector of the given type should be split /// Return true if a vector of the given type should be split
/// (TypeSplitVector) instead of promoted (TypePromoteInteger) during type /// (TypeSplitVector) instead of promoted (TypePromoteInteger) during type
/// legalization. /// legalization.
@ -1010,6 +1013,14 @@ protected:
HasMultipleConditionRegisters = hasManyRegs; HasMultipleConditionRegisters = hasManyRegs;
} }
/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
/// combined with "shift" to BitExtract instructions.
void setHasExtractBitsInsn(bool hasExtractInsn = true) {
HasExtractBitsInsn = hasExtractInsn;
}
/// Tells the code generator not to expand sequence of operations into a /// Tells the code generator not to expand sequence of operations into a
/// separate sequences that increases the amount of flow control. /// separate sequences that increases the amount of flow control.
void setJumpIsExpensive(bool isExpensive = true) { void setJumpIsExpensive(bool isExpensive = true) {
@ -1436,6 +1447,12 @@ private:
/// the blocks of their users. /// the blocks of their users.
bool HasMultipleConditionRegisters; bool HasMultipleConditionRegisters;
/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
/// combined with "shift" to BitExtract instructions.
bool HasExtractBitsInsn;
/// Tells the code generator not to expand integer divides by constants into a /// Tells the code generator not to expand integer divides by constants into a
/// sequence of muls, adds, and shifts. This is a hack until a real cost /// sequence of muls, adds, and shifts. This is a hack until a real cost
/// model is in place. If we ever optimize for size, this will be set to true /// model is in place. If we ever optimize for size, this will be set to true

View File

@ -628,6 +628,187 @@ static bool OptimizeCmpExpression(CmpInst *CI) {
return MadeChange; return MadeChange;
} }
/// isExtractBitsCandidateUse - Check if the candidates could
/// be combined with shift instruction, which includes:
/// 1. Truncate instruction
/// 2. And instruction and the imm is a mask of the low bits:
/// imm & (imm+1) == 0
bool isExtractBitsCandidateUse(Instruction *User) {
if (!isa<TruncInst>(User)) {
if (User->getOpcode() != Instruction::And ||
!isa<ConstantInt>(User->getOperand(1)))
return false;
unsigned Cimm = dyn_cast<ConstantInt>(User->getOperand(1))->getZExtValue();
if (Cimm & (Cimm + 1))
return false;
}
return true;
}
/// SinkShiftAndTruncate - sink both shift and truncate instruction
/// to the use of truncate's BB.
bool
SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
const TargetLowering &TLI) {
BasicBlock *UserBB = User->getParent();
DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
TruncInst *TruncI = dyn_cast<TruncInst>(User);
bool MadeChange = false;
for (Value::user_iterator TruncUI = TruncI->user_begin(),
TruncE = TruncI->user_end();
TruncUI != TruncE;) {
Use &TruncTheUse = TruncUI.getUse();
Instruction *TruncUser = cast<Instruction>(*TruncUI);
// Preincrement use iterator so we don't invalidate it.
++TruncUI;
int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
if (!ISDOpcode)
continue;
// If the use is actually a legal node, there will not be an implicit
// truncate.
if (TLI.isOperationLegalOrCustom(ISDOpcode,
EVT::getEVT(TruncUser->getType())))
continue;
// Don't bother for PHI nodes.
if (isa<PHINode>(TruncUser))
continue;
BasicBlock *TruncUserBB = TruncUser->getParent();
if (UserBB == TruncUserBB)
continue;
BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
if (!InsertedShift && !InsertedTrunc) {
BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
// Sink the shift
if (ShiftI->getOpcode() == Instruction::AShr)
InsertedShift =
BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt);
else
InsertedShift =
BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt);
// Sink the trunc
BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
TruncInsertPt++;
InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
TruncI->getType(), "", TruncInsertPt);
MadeChange = true;
TruncTheUse = InsertedTrunc;
}
}
return MadeChange;
}
/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if
/// the uses could potentially be combined with this shift instruction and
/// generate BitExtract instruction. It will only be applied if the architecture
/// supports BitExtract instruction. Here is an example:
/// BB1:
/// %x.extract.shift = lshr i64 %arg1, 32
/// BB2:
/// %x.extract.trunc = trunc i64 %x.extract.shift to i16
/// ==>
///
/// BB2:
/// %x.extract.shift.1 = lshr i64 %arg1, 32
/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
///
/// CodeGen will recoginze the pattern in BB2 and generate BitExtract
/// instruction.
/// Return true if any changes are made.
static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
const TargetLowering &TLI) {
BasicBlock *DefBB = ShiftI->getParent();
/// Only insert instructions in each block once.
DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(ShiftI->getType()));
bool MadeChange = false;
for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
UI != E;) {
Use &TheUse = UI.getUse();
Instruction *User = cast<Instruction>(*UI);
// Preincrement use iterator so we don't invalidate it.
++UI;
// Don't bother for PHI nodes.
if (isa<PHINode>(User))
continue;
if (!isExtractBitsCandidateUse(User))
continue;
BasicBlock *UserBB = User->getParent();
if (UserBB == DefBB) {
// If the shift and truncate instruction are in the same BB. The use of
// the truncate(TruncUse) may still introduce another truncate if not
// legal. In this case, we would like to sink both shift and truncate
// instruction to the BB of TruncUse.
// for example:
// BB1:
// i64 shift.result = lshr i64 opnd, imm
// trunc.result = trunc shift.result to i16
//
// BB2:
// ----> We will have an implicit truncate here if the architecture does
// not have i16 compare.
// cmp i16 trunc.result, opnd2
//
if (isa<TruncInst>(User) && shiftIsLegal
// If the type of the truncate is legal, no trucate will be
// introduced in other basic blocks.
&& (!TLI.isTypeLegal(TLI.getValueType(User->getType()))))
MadeChange =
SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI);
continue;
}
// If we have already inserted a shift into this block, use it.
BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
if (!InsertedShift) {
BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
if (ShiftI->getOpcode() == Instruction::AShr)
InsertedShift =
BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt);
else
InsertedShift =
BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt);
MadeChange = true;
}
// Replace a use of the shift with a use of the new shift.
TheUse = InsertedShift;
}
// If we removed all uses, nuke the shift.
if (ShiftI->use_empty())
ShiftI->eraseFromParent();
return MadeChange;
}
namespace { namespace {
class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls { class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
protected: protected:
@ -3225,6 +3406,17 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
return false; return false;
} }
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
if (TLI && CI && TLI->hasExtractBitsInsn())
return OptimizeExtractBits(BinOp, CI, *TLI);
return false;
}
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
if (GEPI->hasAllZeroIndices()) { if (GEPI->hasAllZeroIndices()) {
/// The GEP operand must be a pointer, so must its result -> BitCast /// The GEP operand must be a pointer, so must its result -> BitCast

View File

@ -1183,6 +1183,14 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// Make sure to clamp the MSB so that we preserve the semantics of the // Make sure to clamp the MSB so that we preserve the semantics of the
// original operations. // original operations.
ClampMSB = true; ClampMSB = true;
} else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
Srl_imm)) {
// If the shift result was truncated, we can still combine them.
Opd0 = Op0->getOperand(0).getOperand(0);
// Use the type of SRL node.
VT = Opd0->getValueType(0);
} else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
Opd0 = Op0->getOperand(0); Opd0 = Op0->getOperand(0);
} else if (BiggerPattern) { } else if (BiggerPattern) {
@ -1277,8 +1285,19 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
// we're looking for a shift of a shift // we're looking for a shift of a shift
uint64_t Shl_imm = 0; uint64_t Shl_imm = 0;
uint64_t Trunc_bits = 0;
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
Opd0 = N->getOperand(0).getOperand(0); Opd0 = N->getOperand(0).getOperand(0);
} else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
// We are looking for a shift of truncate. Truncate from i64 to i32 could
// be considered as setting high 32 bits as zero. Our strategy here is to
// always generate 64bit UBFM. This consistency will help the CSE pass
// later find more redundancy.
Opd0 = N->getOperand(0).getOperand(0);
Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
VT = Opd0->getValueType(0);
assert(VT == MVT::i64 && "the promoted type should be i64");
} else if (BiggerPattern) { } else if (BiggerPattern) {
// Let's pretend a 0 shift left has been performed. // Let's pretend a 0 shift left has been performed.
// FIXME: Currently we limit this to the bigger pattern case, // FIXME: Currently we limit this to the bigger pattern case,
@ -1295,7 +1314,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
"bad amount in shift node!"); "bad amount in shift node!");
// Note: The width operand is encoded as width-1. // Note: The width operand is encoded as width-1.
unsigned Width = VT.getSizeInBits() - Srl_imm - 1; unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
int sLSB = Srl_imm - Shl_imm; int sLSB = Srl_imm - Shl_imm;
if (sLSB < 0) if (sLSB < 0)
return false; return false;
@ -1354,8 +1373,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
return NULL; return NULL;
EVT VT = N->getValueType(0); EVT VT = N->getValueType(0);
SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
CurDAG->getTargetConstant(MSB, VT) }; // If the bit extract operation is 64bit but the original type is 32bit, we
// need to add one EXTRACT_SUBREG.
if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32) {
SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
CurDAG->getTargetConstant(MSB, MVT::i64)};
SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
MachineSDNode *Node =
CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32,
SDValue(BFM, 0), SubReg);
return Node;
}
SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
CurDAG->getTargetConstant(MSB, VT)};
return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3); return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
} }

View File

@ -438,6 +438,8 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
setDivIsWellDefined(true); setDivIsWellDefined(true);
RequireStrictAlign = StrictAlign; RequireStrictAlign = StrictAlign;
setHasExtractBitsInsn(true);
} }
void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {

View File

@ -1,3 +1,4 @@
; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s | FileCheck --check-prefix=OPT %s
; RUN: llc < %s -march=arm64 | FileCheck %s ; RUN: llc < %s -march=arm64 | FileCheck %s
%struct.X = type { i8, i8, [2 x i8] } %struct.X = type { i8, i8, [2 x i8] }
%struct.Y = type { i32, i8 } %struct.Y = type { i32, i8 }
@ -404,3 +405,75 @@ define i64 @fct18(i32 %xor72) nounwind ssp {
%result = and i64 %conv82, 255 %result = and i64 %conv82, 255
ret i64 %result ret i64 %result
} }
; Using the access to the global array to keep the instruction and control flow.
@first_ones = external global [65536 x i8]
; Function Attrs: nounwind readonly ssp
define i32 @fct19(i64 %arg1) nounwind readonly ssp {
; CHECK-LABEL: fct19:
entry:
%x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
%x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
%x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
%x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
%tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
br i1 %tobool, label %if.end, label %if.then
if.then: ; preds = %entry
%arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift
%0 = load i8* %arrayidx3, align 1
%conv = zext i8 %0 to i32
br label %return
; OPT-LABEL: if.end
if.end: ; preds = %entry
; OPT: lshr
; CHECK: ubfm [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #47
%x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
%tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
; CHECK: cbz
br i1 %tobool6, label %if.end13, label %if.then7
; OPT-LABEL: if.then7
if.then7: ; preds = %if.end
; OPT: lshr
; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
; So neither of them should be in the assemble code.
; CHECK-NOT: and
; CHECK-NOT: ubfm
%idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
%arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10
%1 = load i8* %arrayidx11, align 1
%conv12 = zext i8 %1 to i32
%add = add nsw i32 %conv12, 16
br label %return
; OPT-LABEL: if.end13
if.end13: ; preds = %if.end
; OPT: lshr
; OPT: trunc
; CHECK: ubfm [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #31
%tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
; CHECK: cbz
br i1 %tobool16, label %return, label %if.then17
; OPT-LABEL: if.then17
if.then17: ; preds = %if.end13
; OPT: lshr
; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
; So neither of them should be in the assemble code.
; CHECK-NOT: and
; CHECK-NOT: ubfm
%idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
%arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20
%2 = load i8* %arrayidx21, align 1
%conv22 = zext i8 %2 to i32
%add23 = add nsw i32 %conv22, 32
br label %return
return: ; preds = %if.end13, %if.then17, %if.then7, %if.then
; CHECK: ret
%retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ]
ret i32 %retval.0
}