forked from OSchip/llvm-project
Prefer to duplicate PPC Altivec loads when expanding unaligned loads
When expanding unaligned Altivec loads, we use the decremented offset trick to prevent page faults. Unfortunately, if we have a sequence of consecutive unaligned loads, this leads to suboptimal code generation because the 'extra' load from the first unaligned load can be combined with the base load from the second (but only if the decremented offset trick is not used for the first). Search up and down the chain, through loads and token factors, looking for consecutive loads, and if one is found, don't use the offset reduction trick. These duplicate loads are later combined to yield the desired sequence (in the future, we might want a more-powerful chain search, but that will require some changes to allow the combiner routines to access the AA object). This should complete the initial implementation of the optimized unaligned Altivec load expansion. There is some refactoring that should be done, but that will happen when the unaligned store expansion is added. llvm-svn: 182719
This commit is contained in:
parent
4157b371f6
commit
7d8a691b5d
|
@ -6781,6 +6781,75 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
// Return true is there is a nearyby consecutive load to the one provided
|
||||
// (regardless of alignment). We search up and down the chain, looking though
|
||||
// token factors and other loads (but nothing else). As a result, a true
|
||||
// results indicates that it is safe to create a new consecutive load adjacent
|
||||
// to the load provided.
|
||||
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
|
||||
SDValue Chain = LD->getChain();
|
||||
EVT VT = LD->getMemoryVT();
|
||||
|
||||
SmallSet<SDNode *, 16> LoadRoots;
|
||||
SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
|
||||
SmallSet<SDNode *, 16> Visited;
|
||||
|
||||
// First, search up the chain, branching to follow all token-factor operands.
|
||||
// If we find a consecutive load, then we're done, otherwise, record all
|
||||
// nodes just above the top-level loads and token factors.
|
||||
while (!Queue.empty()) {
|
||||
SDNode *ChainNext = Queue.pop_back_val();
|
||||
if (!Visited.insert(ChainNext))
|
||||
continue;
|
||||
|
||||
if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
|
||||
if (DAG.isConsecutiveLoad(ChainLD, LD, VT.getStoreSize(), 1))
|
||||
return true;
|
||||
|
||||
if (!Visited.count(ChainLD->getChain().getNode()))
|
||||
Queue.push_back(ChainLD->getChain().getNode());
|
||||
} else if (ChainNext->getOpcode() == ISD::TokenFactor) {
|
||||
for (SDNode::op_iterator O = ChainNext->op_begin(),
|
||||
OE = ChainNext->op_end(); O != OE; ++O)
|
||||
if (!Visited.count(O->getNode()))
|
||||
Queue.push_back(O->getNode());
|
||||
} else
|
||||
LoadRoots.insert(ChainNext);
|
||||
}
|
||||
|
||||
// Second, search down the chain, starting from the top-level nodes recorded
|
||||
// in the first phase. These top-level nodes are the nodes just above all
|
||||
// loads and token factors. Starting with their uses, recursively look though
|
||||
// all loads (just the chain uses) and token factors to find a consecutive
|
||||
// load.
|
||||
Visited.clear();
|
||||
Queue.clear();
|
||||
|
||||
for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
|
||||
IE = LoadRoots.end(); I != IE; ++I) {
|
||||
Queue.push_back(*I);
|
||||
|
||||
while (!Queue.empty()) {
|
||||
SDNode *LoadRoot = Queue.pop_back_val();
|
||||
if (!Visited.insert(LoadRoot))
|
||||
continue;
|
||||
|
||||
if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
|
||||
if (DAG.isConsecutiveLoad(ChainLD, LD, VT.getStoreSize(), 1))
|
||||
return true;
|
||||
|
||||
for (SDNode::use_iterator UI = LoadRoot->use_begin(),
|
||||
UE = LoadRoot->use_end(); UI != UE; ++UI)
|
||||
if (((isa<LoadSDNode>(*UI) &&
|
||||
cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
|
||||
UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
|
||||
Queue.push_back(*UI);
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
const TargetMachine &TM = getTargetMachine();
|
||||
|
@ -7015,12 +7084,19 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
|
|||
// require the next load to appear to be aligned, even though it
|
||||
// is actually offset from the base pointer by a lesser amount.
|
||||
int IncOffset = VT.getSizeInBits() / 8;
|
||||
int IncValue = IncOffset - 1;
|
||||
int IncValue = IncOffset;
|
||||
|
||||
// Walk (both up and down) the chain looking for another load at the real
|
||||
// (aligned) offset (the alignment of the other load does not matter in
|
||||
// this case). If found, then do not use the offset reduction trick, as
|
||||
// that will prevent the loads from being later combined (as they would
|
||||
// otherwise be duplicates).
|
||||
if (!findConsecutiveLoad(LD, DAG))
|
||||
--IncValue;
|
||||
|
||||
SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
|
||||
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
|
||||
|
||||
// FIXME: We might have another load (with a slightly-different
|
||||
// real offset) that we can reuse here.
|
||||
SDValue ExtraLoad =
|
||||
DAG.getLoad(VT, dl, Chain, Ptr,
|
||||
LD->getPointerInfo().getWithOffset(IncOffset),
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
; RUN: llc < %s -mcpu=g5 | FileCheck %s
|
||||
; RUN: llc < %s -mcpu=g5 | FileCheck %s -check-prefix=CHECK-PC
|
||||
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
|
||||
target triple = "powerpc64-unknown-linux-gnu"
|
||||
|
||||
|
@ -30,20 +29,22 @@ vector.body: ; preds = %vector.body, %vecto
|
|||
br i1 %10, label %for.end, label %vector.body
|
||||
|
||||
; CHECK: @foo
|
||||
; CHECK: lvx [[CNST:[0-9]+]],
|
||||
; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[B2:[0-9]+]]
|
||||
; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[B2]]
|
||||
; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[B2]]
|
||||
; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]],
|
||||
; CHECK-DAG: li [[C0:[0-9]+]], 0
|
||||
; CHECK-DAG: li [[C16:[0-9]+]], 16
|
||||
; CHECK-DAG: li [[C31:[0-9]+]], 31
|
||||
; CHECK-DAG: lvx [[CNST:[0-9]+]],
|
||||
; CHECK: .LBB0_1:
|
||||
; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[C0]]
|
||||
; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[C0]]
|
||||
; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[C0]]
|
||||
; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]], [[C16]]
|
||||
; CHECK-DAG: lvx [[LD3:[0-9]+]], [[B3]], [[C31]]
|
||||
; CHECK-DAG: vperm [[R1:[0-9]+]], [[LD1]], [[LD2]], [[PC]]
|
||||
; CHECK: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
|
||||
; CHECK-DAG: vperm [[R2:[0-9]+]], [[LD2]], [[LD3]], [[PC]]
|
||||
; CHECK-DAG: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
|
||||
; CHECK-DAG: vaddfp {{[0-9]+}}, [[R2]], [[CNST]]
|
||||
; CHECK: blr
|
||||
|
||||
; CHECK-PC: @foo
|
||||
; CHECK-PC: lvsl
|
||||
; CHECK-PC-NOT: lvsl
|
||||
; CHECK-PC: blr
|
||||
|
||||
for.end: ; preds = %vector.body
|
||||
ret void
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue