forked from OSchip/llvm-project
ARM64: Combine shifts and uses from different basic block to bit-extract instruction
llvm-svn: 206774
This commit is contained in:
parent
36f025e697
commit
d069f6393a
|
@ -182,6 +182,9 @@ public:
|
|||
return HasMultipleConditionRegisters;
|
||||
}
|
||||
|
||||
/// Return true if the target has BitExtract instructions. This reports the
/// current value of the HasExtractBitsInsn flag.
|
||||
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
|
||||
|
||||
/// Return true if a vector of the given type should be split
|
||||
/// (TypeSplitVector) instead of promoted (TypePromoteInteger) during type
|
||||
/// legalization.
|
||||
|
@ -1010,6 +1013,14 @@ protected:
|
|||
HasMultipleConditionRegisters = hasManyRegs;
|
||||
}
|
||||
|
||||
/// Tells the code generator that the target has BitExtract instructions.
|
||||
/// The code generator will aggressively sink "shift"s into the blocks of
|
||||
/// their users if the users will generate "and" instructions which can be
|
||||
/// combined with "shift" to BitExtract instructions.
/// \param hasExtractInsn the value stored into the HasExtractBitsInsn flag;
/// defaults to true when the argument is omitted.
|
||||
void setHasExtractBitsInsn(bool hasExtractInsn = true) {
|
||||
HasExtractBitsInsn = hasExtractInsn;
|
||||
}
|
||||
|
||||
/// Tells the code generator not to expand a sequence of operations into
|
||||
/// separate sequences that increase the amount of flow control.
|
||||
void setJumpIsExpensive(bool isExpensive = true) {
|
||||
|
@ -1436,6 +1447,12 @@ private:
|
|||
/// the blocks of their users.
|
||||
bool HasMultipleConditionRegisters;
|
||||
|
||||
/// Tells the code generator that the target has BitExtract instructions.
|
||||
/// The code generator will aggressively sink "shift"s into the blocks of
|
||||
/// their users if the users will generate "and" instructions which can be
|
||||
/// combined with "shift" to BitExtract instructions.
|
||||
bool HasExtractBitsInsn;
|
||||
|
||||
/// Tells the code generator not to expand integer divides by constants into a
|
||||
/// sequence of muls, adds, and shifts. This is a hack until a real cost
|
||||
/// model is in place. If we ever optimize for size, this will be set to true
|
||||
|
|
|
@ -628,6 +628,187 @@ static bool OptimizeCmpExpression(CmpInst *CI) {
|
|||
return MadeChange;
|
||||
}
|
||||
|
||||
/// isExtractBitsCandidateUse - Check if the candidates could
|
||||
/// be combined with shift instruction, which includes:
|
||||
/// 1. Truncate instruction
|
||||
/// 2. And instruction and the imm is a mask of the low bits:
|
||||
/// imm & (imm+1) == 0
|
||||
bool isExtractBitsCandidateUse(Instruction *User) {
|
||||
if (!isa<TruncInst>(User)) {
|
||||
if (User->getOpcode() != Instruction::And ||
|
||||
!isa<ConstantInt>(User->getOperand(1)))
|
||||
return false;
|
||||
|
||||
unsigned Cimm = dyn_cast<ConstantInt>(User->getOperand(1))->getZExtValue();
|
||||
|
||||
if (Cimm & (Cimm + 1))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// SinkShiftAndTruncate - sink both shift and truncate instruction
|
||||
/// to the use of truncate's BB.
|
||||
bool
|
||||
SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
|
||||
DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
|
||||
const TargetLowering &TLI) {
|
||||
BasicBlock *UserBB = User->getParent();
|
||||
DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
|
||||
TruncInst *TruncI = dyn_cast<TruncInst>(User);
|
||||
bool MadeChange = false;
|
||||
|
||||
for (Value::user_iterator TruncUI = TruncI->user_begin(),
|
||||
TruncE = TruncI->user_end();
|
||||
TruncUI != TruncE;) {
|
||||
|
||||
Use &TruncTheUse = TruncUI.getUse();
|
||||
Instruction *TruncUser = cast<Instruction>(*TruncUI);
|
||||
// Preincrement use iterator so we don't invalidate it.
|
||||
|
||||
++TruncUI;
|
||||
|
||||
int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
|
||||
if (!ISDOpcode)
|
||||
continue;
|
||||
|
||||
// If the use is actually a legal node, there will not be an implicit
|
||||
// truncate.
|
||||
if (TLI.isOperationLegalOrCustom(ISDOpcode,
|
||||
EVT::getEVT(TruncUser->getType())))
|
||||
continue;
|
||||
|
||||
// Don't bother for PHI nodes.
|
||||
if (isa<PHINode>(TruncUser))
|
||||
continue;
|
||||
|
||||
BasicBlock *TruncUserBB = TruncUser->getParent();
|
||||
|
||||
if (UserBB == TruncUserBB)
|
||||
continue;
|
||||
|
||||
BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
|
||||
CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
|
||||
|
||||
if (!InsertedShift && !InsertedTrunc) {
|
||||
BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
|
||||
// Sink the shift
|
||||
if (ShiftI->getOpcode() == Instruction::AShr)
|
||||
InsertedShift =
|
||||
BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt);
|
||||
else
|
||||
InsertedShift =
|
||||
BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt);
|
||||
|
||||
// Sink the trunc
|
||||
BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
|
||||
TruncInsertPt++;
|
||||
|
||||
InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
|
||||
TruncI->getType(), "", TruncInsertPt);
|
||||
|
||||
MadeChange = true;
|
||||
|
||||
TruncTheUse = InsertedTrunc;
|
||||
}
|
||||
}
|
||||
return MadeChange;
|
||||
}
|
||||
|
||||
/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if
|
||||
/// the uses could potentially be combined with this shift instruction and
|
||||
/// generate BitExtract instruction. It will only be applied if the architecture
|
||||
/// supports BitExtract instruction. Here is an example:
|
||||
/// BB1:
|
||||
/// %x.extract.shift = lshr i64 %arg1, 32
|
||||
/// BB2:
|
||||
/// %x.extract.trunc = trunc i64 %x.extract.shift to i16
|
||||
/// ==>
|
||||
///
|
||||
/// BB2:
|
||||
/// %x.extract.shift.1 = lshr i64 %arg1, 32
|
||||
/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
|
||||
///
|
||||
/// CodeGen will recoginze the pattern in BB2 and generate BitExtract
|
||||
/// instruction.
|
||||
/// Return true if any changes are made.
|
||||
static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
|
||||
const TargetLowering &TLI) {
|
||||
BasicBlock *DefBB = ShiftI->getParent();
|
||||
|
||||
/// Only insert instructions in each block once.
|
||||
DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
|
||||
|
||||
bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(ShiftI->getType()));
|
||||
|
||||
bool MadeChange = false;
|
||||
for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
|
||||
UI != E;) {
|
||||
Use &TheUse = UI.getUse();
|
||||
Instruction *User = cast<Instruction>(*UI);
|
||||
// Preincrement use iterator so we don't invalidate it.
|
||||
++UI;
|
||||
|
||||
// Don't bother for PHI nodes.
|
||||
if (isa<PHINode>(User))
|
||||
continue;
|
||||
|
||||
if (!isExtractBitsCandidateUse(User))
|
||||
continue;
|
||||
|
||||
BasicBlock *UserBB = User->getParent();
|
||||
|
||||
if (UserBB == DefBB) {
|
||||
// If the shift and truncate instruction are in the same BB. The use of
|
||||
// the truncate(TruncUse) may still introduce another truncate if not
|
||||
// legal. In this case, we would like to sink both shift and truncate
|
||||
// instruction to the BB of TruncUse.
|
||||
// for example:
|
||||
// BB1:
|
||||
// i64 shift.result = lshr i64 opnd, imm
|
||||
// trunc.result = trunc shift.result to i16
|
||||
//
|
||||
// BB2:
|
||||
// ----> We will have an implicit truncate here if the architecture does
|
||||
// not have i16 compare.
|
||||
// cmp i16 trunc.result, opnd2
|
||||
//
|
||||
if (isa<TruncInst>(User) && shiftIsLegal
|
||||
// If the type of the truncate is legal, no trucate will be
|
||||
// introduced in other basic blocks.
|
||||
&& (!TLI.isTypeLegal(TLI.getValueType(User->getType()))))
|
||||
MadeChange =
|
||||
SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI);
|
||||
|
||||
continue;
|
||||
}
|
||||
// If we have already inserted a shift into this block, use it.
|
||||
BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
|
||||
|
||||
if (!InsertedShift) {
|
||||
BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
|
||||
|
||||
if (ShiftI->getOpcode() == Instruction::AShr)
|
||||
InsertedShift =
|
||||
BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt);
|
||||
else
|
||||
InsertedShift =
|
||||
BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt);
|
||||
|
||||
MadeChange = true;
|
||||
}
|
||||
|
||||
// Replace a use of the shift with a use of the new shift.
|
||||
TheUse = InsertedShift;
|
||||
}
|
||||
|
||||
// If we removed all uses, nuke the shift.
|
||||
if (ShiftI->use_empty())
|
||||
ShiftI->eraseFromParent();
|
||||
|
||||
return MadeChange;
|
||||
}
|
||||
|
||||
namespace {
|
||||
class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
|
||||
protected:
|
||||
|
@ -3225,6 +3406,17 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
|
|||
return false;
|
||||
}
|
||||
|
||||
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
|
||||
|
||||
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
|
||||
BinOp->getOpcode() == Instruction::LShr)) {
|
||||
ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
|
||||
if (TLI && CI && TLI->hasExtractBitsInsn())
|
||||
return OptimizeExtractBits(BinOp, CI, *TLI);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
|
||||
if (GEPI->hasAllZeroIndices()) {
|
||||
/// The GEP operand must be a pointer, so must its result -> BitCast
|
||||
|
|
|
@ -1183,6 +1183,14 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
|
|||
// Make sure to clamp the MSB so that we preserve the semantics of the
|
||||
// original operations.
|
||||
ClampMSB = true;
|
||||
} else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
|
||||
isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
|
||||
Srl_imm)) {
|
||||
// If the shift result was truncated, we can still combine them.
|
||||
Opd0 = Op0->getOperand(0).getOperand(0);
|
||||
|
||||
// Use the type of SRL node.
|
||||
VT = Opd0->getValueType(0);
|
||||
} else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
|
||||
Opd0 = Op0->getOperand(0);
|
||||
} else if (BiggerPattern) {
|
||||
|
@ -1277,8 +1285,19 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
|
|||
|
||||
// we're looking for a shift of a shift
|
||||
uint64_t Shl_imm = 0;
|
||||
uint64_t Trunc_bits = 0;
|
||||
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
|
||||
Opd0 = N->getOperand(0).getOperand(0);
|
||||
} else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
|
||||
N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
|
||||
// We are looking for a shift of truncate. Truncate from i64 to i32 could
|
||||
// be considered as setting high 32 bits as zero. Our strategy here is to
|
||||
// always generate 64bit UBFM. This consistency will help the CSE pass
|
||||
// later find more redundancy.
|
||||
Opd0 = N->getOperand(0).getOperand(0);
|
||||
Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
|
||||
VT = Opd0->getValueType(0);
|
||||
assert(VT == MVT::i64 && "the promoted type should be i64");
|
||||
} else if (BiggerPattern) {
|
||||
// Let's pretend a 0 shift left has been performed.
|
||||
// FIXME: Currently we limit this to the bigger pattern case,
|
||||
|
@ -1295,7 +1314,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
|
|||
assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
|
||||
"bad amount in shift node!");
|
||||
// Note: The width operand is encoded as width-1.
|
||||
unsigned Width = VT.getSizeInBits() - Srl_imm - 1;
|
||||
unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
|
||||
int sLSB = Srl_imm - Shl_imm;
|
||||
if (sLSB < 0)
|
||||
return false;
|
||||
|
@ -1354,8 +1373,23 @@ SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
|
|||
return NULL;
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
|
||||
CurDAG->getTargetConstant(MSB, VT) };
|
||||
|
||||
// If the bit extract operation is 64bit but the original type is 32bit, we
|
||||
// need to add one EXTRACT_SUBREG.
|
||||
if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32) {
|
||||
SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
|
||||
CurDAG->getTargetConstant(MSB, MVT::i64)};
|
||||
|
||||
SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
|
||||
SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
|
||||
MachineSDNode *Node =
|
||||
CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32,
|
||||
SDValue(BFM, 0), SubReg);
|
||||
return Node;
|
||||
}
|
||||
|
||||
SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
|
||||
CurDAG->getTargetConstant(MSB, VT)};
|
||||
return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
|
||||
}
|
||||
|
||||
|
|
|
@ -438,6 +438,8 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
|
|||
setDivIsWellDefined(true);
|
||||
|
||||
RequireStrictAlign = StrictAlign;
|
||||
|
||||
setHasExtractBitsInsn(true);
|
||||
}
|
||||
|
||||
void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
; RUN: opt -codegenprepare -mtriple=arm64-apple-ios -S -o - %s | FileCheck --check-prefix=OPT %s
|
||||
; RUN: llc < %s -march=arm64 | FileCheck %s
|
||||
%struct.X = type { i8, i8, [2 x i8] }
|
||||
%struct.Y = type { i32, i8 }
|
||||
|
@ -404,3 +405,75 @@ define i64 @fct18(i32 %xor72) nounwind ssp {
|
|||
%result = and i64 %conv82, 255
|
||||
ret i64 %result
|
||||
}
|
||||
|
||||
; Using the access to the global array to keep the instruction and control flow.
|
||||
@first_ones = external global [65536 x i8]
|
||||
|
||||
; Function Attrs: nounwind readonly ssp
|
||||
define i32 @fct19(i64 %arg1) nounwind readonly ssp {
|
||||
; CHECK-LABEL: fct19:
|
||||
entry:
|
||||
%x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
|
||||
%x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
|
||||
%x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
|
||||
%x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
|
||||
%tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
|
||||
br i1 %tobool, label %if.end, label %if.then
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift
|
||||
%0 = load i8* %arrayidx3, align 1
|
||||
%conv = zext i8 %0 to i32
|
||||
br label %return
|
||||
|
||||
; OPT-LABEL: if.end
|
||||
if.end: ; preds = %entry
|
||||
; OPT: lshr
|
||||
; CHECK: ubfm [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #47
|
||||
%x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
|
||||
%tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
|
||||
; CHECK: cbz
|
||||
br i1 %tobool6, label %if.end13, label %if.then7
|
||||
|
||||
; OPT-LABEL: if.then7
|
||||
if.then7: ; preds = %if.end
|
||||
; OPT: lshr
|
||||
; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
|
||||
; So neither of them should be in the assembly code.
|
||||
; CHECK-NOT: and
|
||||
; CHECK-NOT: ubfm
|
||||
%idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
|
||||
%arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10
|
||||
%1 = load i8* %arrayidx11, align 1
|
||||
%conv12 = zext i8 %1 to i32
|
||||
%add = add nsw i32 %conv12, 16
|
||||
br label %return
|
||||
|
||||
; OPT-LABEL: if.end13
|
||||
if.end13: ; preds = %if.end
|
||||
; OPT: lshr
|
||||
; OPT: trunc
|
||||
; CHECK: ubfm [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #31
|
||||
%tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
|
||||
; CHECK: cbz
|
||||
br i1 %tobool16, label %return, label %if.then17
|
||||
|
||||
; OPT-LABEL: if.then17
|
||||
if.then17: ; preds = %if.end13
|
||||
; OPT: lshr
|
||||
; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
|
||||
; So neither of them should be in the assembly code.
|
||||
; CHECK-NOT: and
|
||||
; CHECK-NOT: ubfm
|
||||
%idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
|
||||
%arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20
|
||||
%2 = load i8* %arrayidx21, align 1
|
||||
%conv22 = zext i8 %2 to i32
|
||||
%add23 = add nsw i32 %conv22, 32
|
||||
br label %return
|
||||
|
||||
return: ; preds = %if.end13, %if.then17, %if.then7, %if.then
|
||||
; CHECK: ret
|
||||
%retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ]
|
||||
ret i32 %retval.0
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue