Add an unfolded offset field to LSR's Formula record. This is used to

model constants which can be added to base registers via add-immediate
instructions which don't require an additional register to materialize
the immediate.

llvm-svn: 130743
This commit is contained in:
Dan Gohman 2011-05-03 00:46:49 +00:00
parent 37286b2440
commit 6136e94897
5 changed files with 164 additions and 9 deletions

View File

@ -1583,6 +1583,14 @@ public:
return true;
}
/// isLegalAddImmediate - Return true if the specified immediate is legal
/// add immediate, that is the target has add instructions which can add
/// a register with the immediate without having to materialize the
/// immediate into a register.
virtual bool isLegalAddImmediate(int64_t Imm) const {
return true;
}
//===--------------------------------------------------------------------===//
// Div utility functions
//

View File

@ -6984,6 +6984,14 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return Imm >= 0 && Imm <= 255;
}
/// isLegalAddImmediate - Return true if the specified immediate is legal
/// add immediate, that is the target has add instructions which can add
/// a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return ARM_AM::getSOImmVal(Imm) != -1;
}
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
bool isSEXTLoad, SDValue &Base,
SDValue &Offset, bool &isInc,

View File

@ -264,6 +264,12 @@ namespace llvm {
/// the immediate into a register.
virtual bool isLegalICmpImmediate(int64_t Imm) const;
/// isLegalAddImmediate - Return true if the specified immediate is legal
/// add immediate, that is the target has add instructions which can
/// add a register and the immediate without having to materialize
/// the immediate into a register.
virtual bool isLegalAddImmediate(int64_t Imm) const;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.

View File

@ -209,7 +209,12 @@ struct Formula {
/// when AM.Scale is not zero.
const SCEV *ScaledReg;
Formula() : ScaledReg(0) {}
/// UnfoldedOffset - An additional constant offset which added near the
/// use. This requires a temporary register, but the offset itself can
/// live in an add immediate field rather than a register.
int64_t UnfoldedOffset;
Formula() : ScaledReg(0), UnfoldedOffset(0) {}
void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
@ -379,6 +384,10 @@ void Formula::print(raw_ostream &OS) const {
OS << "<unknown>";
OS << ')';
}
if (UnfoldedOffset != 0) {
if (!First) OS << " + "; else First = false;
OS << "imm(" << UnfoldedOffset << ')';
}
}
void Formula::dump() const {
@ -771,8 +780,10 @@ void Cost::RateFormula(const Formula &F,
RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
}
if (F.BaseRegs.size() > 1)
NumBaseAdds += F.BaseRegs.size() - 1;
// Determine how many (unfolded) adds we'll need inside the loop.
size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
if (NumBaseParts > 1)
NumBaseAdds += NumBaseParts - 1;
// Tally up the non-zero immediates.
for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
@ -1945,7 +1956,8 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
if (F.BaseRegs == OrigF.BaseRegs &&
F.ScaledReg == OrigF.ScaledReg &&
F.AM.BaseGV == OrigF.AM.BaseGV &&
F.AM.Scale == OrigF.AM.Scale) {
F.AM.Scale == OrigF.AM.Scale &&
F.UnfoldedOffset == OrigF.UnfoldedOffset) {
if (F.AM.BaseOffs == 0)
return &LU;
// This is the formula where all the registers and symbols matched;
@ -2313,8 +2325,29 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
if (InnerSum->isZero())
continue;
Formula F = Base;
F.BaseRegs[i] = InnerSum;
F.BaseRegs.push_back(*J);
// Add the remaining pieces of the add back into the new formula.
const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
if (TLI && InnerSumSC &&
SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
InnerSumSC->getValue()->getZExtValue())) {
F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
InnerSumSC->getValue()->getZExtValue();
F.BaseRegs.erase(F.BaseRegs.begin() + i);
} else
F.BaseRegs[i] = InnerSum;
// Add J as its own register, or an unfolded immediate.
const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
SC->getValue()->getZExtValue()))
F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
SC->getValue()->getZExtValue();
else
F.BaseRegs.push_back(*J);
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
// it.
@ -2482,6 +2515,13 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
continue;
}
// Check that multiplying with the unfolded offset doesn't overflow.
if (F.UnfoldedOffset != 0) {
F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
continue;
}
// If we make it here and it's legal, add it.
(void)InsertFormula(LU, LUIdx, F);
next:;
@ -2664,7 +2704,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// other orig regs.
ImmMapTy::const_iterator OtherImms[] = {
Imms.begin(), prior(Imms.end()),
Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
};
for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
ImmMapTy::const_iterator M = OtherImms[i];
@ -2738,8 +2778,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
Formula NewF = F;
NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, TLI))
continue;
LU.Kind, LU.AccessTy, TLI)) {
if (!TLI ||
!TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
continue;
NewF = F;
NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
}
NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
// If the new formula has a constant in a register, and adding the
@ -3488,6 +3533,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
}
}
// Expand the unfolded offset portion.
int64_t UnfoldedOffset = F.UnfoldedOffset;
if (UnfoldedOffset != 0) {
// Just add the immediate values.
Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
UnfoldedOffset)));
}
// Emit instructions summing all the operands.
const SCEV *FullS = Ops.empty() ?
SE.getConstant(IntTy, 0) :

View File

@ -0,0 +1,80 @@
; RUN: llc < %s | FileCheck %s
; LSR shouldn't introduce more induction variables than needed, increasing
; register pressure and therefore spilling. There is more room for improvement
; here.
; CHECK: sub sp, #32
; CHECK: ldr r{{.*}}, [sp, #4]
; CHECK-NEXT: ldr r{{.*}}, [sp, #16]
; CHECK-NEXT: ldr r{{.*}}, [sp, #12]
; CHECK-NEXT: adds
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "thumbv7-apple-macosx10.7.0"
%struct.partition_entry = type { i32, i32, i64, i64 }
define i32 @partition_overlap_check(%struct.partition_entry* nocapture %part, i32 %num_entries) nounwind readonly optsize ssp {
entry:
%cmp79 = icmp sgt i32 %num_entries, 0
br i1 %cmp79, label %outer.loop, label %for.end72
outer.loop: ; preds = %for.inc69, %entry
%overlap.081 = phi i32 [ %overlap.4, %for.inc69 ], [ 0, %entry ]
%0 = phi i32 [ %inc71, %for.inc69 ], [ 0, %entry ]
%offset = getelementptr %struct.partition_entry* %part, i32 %0, i32 2
%len = getelementptr %struct.partition_entry* %part, i32 %0, i32 3
%tmp5 = load i64* %offset, align 4, !tbaa !0
%tmp15 = load i64* %len, align 4, !tbaa !0
%add = add nsw i64 %tmp15, %tmp5
br label %inner.loop
inner.loop: ; preds = %for.inc, %outer.loop
%overlap.178 = phi i32 [ %overlap.081, %outer.loop ], [ %overlap.4, %for.inc ]
%1 = phi i32 [ 0, %outer.loop ], [ %inc, %for.inc ]
%cmp23 = icmp eq i32 %0, %1
br i1 %cmp23, label %for.inc, label %if.end
if.end: ; preds = %inner.loop
%len39 = getelementptr %struct.partition_entry* %part, i32 %1, i32 3
%offset28 = getelementptr %struct.partition_entry* %part, i32 %1, i32 2
%tmp29 = load i64* %offset28, align 4, !tbaa !0
%tmp40 = load i64* %len39, align 4, !tbaa !0
%add41 = add nsw i64 %tmp40, %tmp29
%cmp44 = icmp sge i64 %tmp29, %tmp5
%cmp47 = icmp slt i64 %tmp29, %add
%or.cond = and i1 %cmp44, %cmp47
%overlap.2 = select i1 %or.cond, i32 1, i32 %overlap.178
%cmp52 = icmp sle i64 %add41, %add
%cmp56 = icmp sgt i64 %add41, %tmp5
%or.cond74 = and i1 %cmp52, %cmp56
%overlap.3 = select i1 %or.cond74, i32 1, i32 %overlap.2
%cmp61 = icmp sgt i64 %tmp29, %tmp5
%cmp65 = icmp slt i64 %add41, %add
%or.cond75 = or i1 %cmp61, %cmp65
br i1 %or.cond75, label %for.inc, label %if.then66
if.then66: ; preds = %if.end
br label %for.inc
for.inc: ; preds = %if.end, %if.then66, %inner.loop
%overlap.4 = phi i32 [ %overlap.178, %inner.loop ], [ 1, %if.then66 ], [ %overlap.3, %if.end ]
%inc = add nsw i32 %1, 1
%exitcond = icmp eq i32 %inc, %num_entries
br i1 %exitcond, label %for.inc69, label %inner.loop
for.inc69: ; preds = %for.inc
%inc71 = add nsw i32 %0, 1
%exitcond83 = icmp eq i32 %inc71, %num_entries
br i1 %exitcond83, label %for.end72, label %outer.loop
for.end72: ; preds = %for.inc69, %entry
%overlap.0.lcssa = phi i32 [ 0, %entry ], [ %overlap.4, %for.inc69 ]
ret i32 %overlap.0.lcssa
}
!0 = metadata !{metadata !"long long", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA", null}