[X86] Prevent non-temporal loads from folding into instructions by blocking them in X86DAGToDAGISel::IsProfitableToFold rather than with a predicate.
Remove tryFoldVecLoad since tryFoldLoad would call IsProfitableToFold and pick up the new check. This saves about 5K out of ~600K on the generated isel table.

llvm-svn: 344189
parent 7329be16c2
commit b5421c498d
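The new IsProfitableToFold check leans on the existing useNonTemporalLoad helper, whose body is not part of this diff. As a hedged sketch of the shape that helper is assumed to have (not the verbatim LLVM source), it only reports true when the load carries the non-temporal hint, is aligned to its full width, and the subtarget actually has a non-temporal vector load instruction (MOVNTDQA and its 256/512-bit forms); blocking the fold in that case leaves the load free to select to that instruction instead of disappearing into another instruction's memory operand.

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hedged sketch only: the real helper is a member of X86DAGToDAGISel and
// queries the X86Subtarget directly; the feature flags are passed in here
// just to keep the sketch self-contained.
static bool useNonTemporalLoadSketch(const LoadSDNode *Ld, bool HasSSE41,
                                     bool HasAVX2, bool HasAVX512) {
  if (!Ld->isNonTemporal())           // load lacks the !nontemporal hint
    return false;
  unsigned StoreSize = Ld->getMemoryVT().getStoreSize();
  if (Ld->getAlignment() < StoreSize) // MOVNTDQA requires full-width alignment
    return false;
  switch (StoreSize) {
  case 16: return HasSSE41;   // movntdqa xmm   (SSE4.1)
  case 32: return HasAVX2;    // vmovntdqa ymm  (AVX2)
  case 64: return HasAVX512;  // vmovntdqa zmm  (AVX-512)
  default: return false;
  }
}

With that check living in IsProfitableToFold, every path that goes through tryFoldLoad inherits it, which is why the separate tryFoldVecLoad wrapper and the vecload/alignedvecload PatFrags in the hunks below can be dropped.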
X86ISelDAGToDAG.cpp:

@@ -239,12 +239,6 @@ namespace {
       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
     }
 
-    // Try to fold a vector load. This makes sure the load isn't non-temporal.
-    bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
-                        SDValue &Base, SDValue &Scale,
-                        SDValue &Index, SDValue &Disp,
-                        SDValue &Segment);
-
     /// Implement addressing mode selection for inline asm expressions.
     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                       unsigned ConstraintID,
@@ -516,6 +510,10 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
   if (N.getOpcode() != ISD::LOAD)
     return true;
 
+  // Don't fold non-temporal loads if we have an instruction for them.
+  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
+    return false;
+
   // If N is a load, do additional profitability checks.
   if (U == Root) {
     switch (U->getOpcode()) {
@@ -2053,20 +2051,6 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
 }
 
-bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
-                                     SDValue &Base, SDValue &Scale,
-                                     SDValue &Index, SDValue &Disp,
-                                     SDValue &Segment) {
-  if (!ISD::isNON_EXTLoad(N.getNode()) ||
-      useNonTemporalLoad(cast<LoadSDNode>(N)) ||
-      !IsProfitableToFold(N, P, Root) ||
-      !IsLegalToFold(N, P, Root, OptLevel))
-    return false;
-
-  return selectAddr(N.getNode(),
-                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
-}
-
 /// Return an SDNode that returns the value of the global base register.
 /// Output instructions required to initialize the global base register,
 /// if necessary.
@@ -2595,8 +2579,8 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
   // alignment on this load.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
-      tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
-                     Tmp3, Tmp4)) {
+      tryFoldLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+                  Tmp3, Tmp4)) {
     SDValue Load = N1.getOperand(0);
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                       Load.getOperand(0) };
@@ -2632,8 +2616,8 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
   // alignment on this load.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
-      tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
-                     Tmp3, Tmp4)) {
+      tryFoldLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+                  Tmp3, Tmp4)) {
     SDValue Load = N2.getOperand(0);
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                       Load.getOperand(0), InFlag };
X86InstrFragmentsSIMD.td:

@@ -647,28 +647,22 @@ def sdmem : Operand<v2f64> {
 // SSE pattern fragments
 //===----------------------------------------------------------------------===//
 
-// Vector load wrappers to prevent folding of non-temporal aligned loads on
-// supporting targets.
-def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return !useNonTemporalLoad(cast<LoadSDNode>(N));
-}]>;
-
 // 128-bit load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
-def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>;
-def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>;
-def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>;
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
 
 // 256-bit load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>;
-def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>;
-def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>;
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
 // 512-bit load pattern fragments
-def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>;
-def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>;
-def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>;
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -682,46 +676,45 @@ def alignedstore : PatFrag<(ops node:$val, node:$ptr),
   return St->getAlignment() >= St->getMemoryVT().getStoreSize();
 }]>;
 
-// Like 'load', but always requires 128-bit vector alignment.
-def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+// Like 'load', but always requires vector size alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   auto *Ld = cast<LoadSDNode>(N);
-  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() &&
-         !useNonTemporalLoad(cast<LoadSDNode>(N));
+  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
 }]>;
 
 // 128-bit aligned load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
 def alignedloadv4f32 : PatFrag<(ops node:$ptr),
-                               (v4f32 (alignedvecload node:$ptr))>;
+                               (v4f32 (alignedload node:$ptr))>;
 def alignedloadv2f64 : PatFrag<(ops node:$ptr),
-                               (v2f64 (alignedvecload node:$ptr))>;
+                               (v2f64 (alignedload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
-                               (v2i64 (alignedvecload node:$ptr))>;
+                               (v2i64 (alignedload node:$ptr))>;
 
 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
 def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedvecload node:$ptr))>;
+                               (v8f32 (alignedload node:$ptr))>;
 def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedvecload node:$ptr))>;
+                               (v4f64 (alignedload node:$ptr))>;
 def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedvecload node:$ptr))>;
+                               (v4i64 (alignedload node:$ptr))>;
 
 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
-                                (v16f32 (alignedvecload node:$ptr))>;
+                                (v16f32 (alignedload node:$ptr))>;
 def alignedloadv8f64 : PatFrag<(ops node:$ptr),
-                                (v8f64 (alignedvecload node:$ptr))>;
+                                (v8f64 (alignedload node:$ptr))>;
 def alignedloadv8i64 : PatFrag<(ops node:$ptr),
-                                (v8i64 (alignedvecload node:$ptr))>;
+                                (v8i64 (alignedload node:$ptr))>;
 
-// Like 'vecload', but uses special alignment checks suitable for use in
+// Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
 // be naturally aligned on some targets but not on others. If the subtarget
 // allows unaligned accesses, match any load, though this may require
 // setting a feature bit in the processor (on startup, for example).
 // Opteron 10h and later implement such a feature.
-def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   auto *Ld = cast<LoadSDNode>(N);
   return Subtarget->hasSSEUnalignedMem() ||
          Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();