Prefer to duplicate PPC Altivec loads when expanding unaligned loads

When expanding unaligned Altivec loads, we use the decremented offset trick to
prevent page faults. Unfortunately, if we have a sequence of consecutive
unaligned loads, this leads to suboptimal code generation because the 'extra'
load from the first unaligned load can be combined with the base load from the
second (but only if the decremented offset trick is not used for the first).
Search up and down the chain, through loads and token factors, looking for
consecutive loads, and if one is found, don't use the offset reduction trick.
These duplicate loads are later combined to yield the desired sequence (in the
future, we might want a more-powerful chain search, but that will require some
changes to allow the combiner routines to access the AA object).

This should complete the initial implementation of the optimized unaligned
Altivec load expansion. There is some refactoring that should be done, but
that will happen when the unaligned store expansion is added.

llvm-svn: 182719
This commit is contained in:
Hal Finkel 2013-05-26 18:08:30 +00:00
parent 4157b371f6
commit 7d8a691b5d
2 changed files with 92 additions and 15 deletions

View File

@ -6781,6 +6781,75 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
return SDValue();
}
// Return true if there is a nearby load that is consecutive to the one
// provided (regardless of alignment). The search walks up and then down the
// chain, looking through token factors and other loads (but nothing else).
// As a result, a true result indicates that it is safe to create a new
// consecutive load adjacent to the load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  EVT VT = LD->getMemoryVT();

  SmallSet<SDNode *, 16> Roots;
  SmallSet<SDNode *, 16> Seen;
  SmallVector<SDNode *, 8> Worklist(1, LD->getChain().getNode());

  // Phase one: walk up the chain from LD, fanning out across every operand
  // of each token factor. A consecutive load ends the search immediately;
  // any node that is neither a load nor a token factor is remembered as a
  // root (the node sitting just above the loads and token factors).
  while (!Worklist.empty()) {
    SDNode *Node = Worklist.pop_back_val();
    if (!Seen.insert(Node))
      continue; // Already processed.

    if (Node->getOpcode() == ISD::TokenFactor) {
      for (SDNode::op_iterator Op = Node->op_begin(), OpEnd = Node->op_end();
           Op != OpEnd; ++Op) {
        SDNode *Operand = Op->getNode();
        if (!Seen.count(Operand))
          Worklist.push_back(Operand);
      }
      continue;
    }

    LoadSDNode *Candidate = dyn_cast<LoadSDNode>(Node);
    if (!Candidate) {
      // Not something we look through: record it as a starting point for
      // the downward search.
      Roots.insert(Node);
      continue;
    }

    if (DAG.isConsecutiveLoad(Candidate, LD, VT.getStoreSize(), 1))
      return true;

    // Keep climbing through this load's own chain operand.
    SDNode *Above = Candidate->getChain().getNode();
    if (!Seen.count(Above))
      Worklist.push_back(Above);
  }

  // Phase two: walk down from each recorded root. Starting with the root's
  // users, recursively look through loads (followed only when the root is
  // their chain operand) and token factors to find a consecutive load.
  Seen.clear();
  Worklist.clear();

  for (SmallSet<SDNode *, 16>::iterator Root = Roots.begin(),
       RootEnd = Roots.end(); Root != RootEnd; ++Root) {
    Worklist.push_back(*Root);

    while (!Worklist.empty()) {
      SDNode *Node = Worklist.pop_back_val();
      if (!Seen.insert(Node))
        continue; // Already processed.

      if (LoadSDNode *Candidate = dyn_cast<LoadSDNode>(Node)) {
        if (DAG.isConsecutiveLoad(Candidate, LD, VT.getStoreSize(), 1))
          return true;
      }

      for (SDNode::use_iterator Use = Node->use_begin(),
           UseEnd = Node->use_end(); Use != UseEnd; ++Use) {
        SDNode *User = *Use;
        if (Seen.count(User))
          continue;

        if (LoadSDNode *UserLD = dyn_cast<LoadSDNode>(User)) {
          // Only follow a load that uses this node as its chain.
          if (UserLD->getChain().getNode() == Node)
            Worklist.push_back(User);
        } else if (User->getOpcode() == ISD::TokenFactor) {
          Worklist.push_back(User);
        }
      }
    }
  }

  return false;
}
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
const TargetMachine &TM = getTargetMachine();
@ -7015,12 +7084,19 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// require the next load to appear to be aligned, even though it
// is actually offset from the base pointer by a lesser amount.
int IncOffset = VT.getSizeInBits() / 8;
int IncValue = IncOffset - 1;
int IncValue = IncOffset;
// Walk (both up and down) the chain looking for another load at the real
// (aligned) offset (the alignment of the other load does not matter in
// this case). If found, then do not use the offset reduction trick, as
// that will prevent the loads from being later combined (as they would
// otherwise be duplicates).
if (!findConsecutiveLoad(LD, DAG))
--IncValue;
SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
// FIXME: We might have another load (with a slightly-different
// real offset) that we can reuse here.
SDValue ExtraLoad =
DAG.getLoad(VT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncOffset),

View File

@ -1,5 +1,4 @@
; RUN: llc < %s -mcpu=g5 | FileCheck %s
; RUN: llc < %s -mcpu=g5 | FileCheck %s -check-prefix=CHECK-PC
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@ -30,20 +29,22 @@ vector.body: ; preds = %vector.body, %vecto
br i1 %10, label %for.end, label %vector.body
; CHECK: @foo
; CHECK: lvx [[CNST:[0-9]+]],
; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[B2:[0-9]+]]
; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[B2]]
; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[B2]]
; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]],
; CHECK-DAG: li [[C0:[0-9]+]], 0
; CHECK-DAG: li [[C16:[0-9]+]], 16
; CHECK-DAG: li [[C31:[0-9]+]], 31
; CHECK-DAG: lvx [[CNST:[0-9]+]],
; CHECK: .LBB0_1:
; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[C0]]
; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[C0]]
; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[C0]]
; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]], [[C16]]
; CHECK-DAG: lvx [[LD3:[0-9]+]], [[B3]], [[C31]]
; CHECK-DAG: vperm [[R1:[0-9]+]], [[LD1]], [[LD2]], [[PC]]
; CHECK: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
; CHECK-DAG: vperm [[R2:[0-9]+]], [[LD2]], [[LD3]], [[PC]]
; CHECK-DAG: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
; CHECK-DAG: vaddfp {{[0-9]+}}, [[R2]], [[CNST]]
; CHECK: blr
; CHECK-PC: @foo
; CHECK-PC: lvsl
; CHECK-PC-NOT: lvsl
; CHECK-PC: blr
for.end: ; preds = %vector.body
ret void
}