[x86] enhance mayFoldLoad to check alignment
As noted in D112464, a pre-AVX target may not be able to fold an under-aligned vector load into another op, so we shouldn't report that as a load folding candidate. I only found one caller where this would make a difference -- combineCommutableSHUFP() -- so that's where I added a test to show the (minor) regression.

Differential Revision: https://reviews.llvm.org/D112545
parent 6edc509719
commit 6c0a2c2804
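To make the motivation concrete, here is a minimal illustrative IR sketch (hypothetical, not taken from this patch's tests): a 128-bit load with less than 16-byte alignment feeding a shuffle. Without AVX or the SSE unaligned-memory feature, SSE shuffle instructions require a 16-byte-aligned memory operand, so such a load should no longer be reported as a load-folding candidate.

; Hypothetical reduction: the load is only 4-byte aligned, so a pre-AVX target
; cannot fold it into the shuffle's memory operand. The expectation under these
; assumptions is an explicit unaligned load (movups/movdqu) followed by the
; shuffle, rather than a shuffle with a folded memory operand.
define <4 x float> @under_aligned_shuffle(<4 x float>* %p) nounwind {
  %v = load <4 x float>, <4 x float>* %p, align 4
  %s = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %s
}

With align 16 instead, the load would remain a legitimate folding candidate even without AVX.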
llvm
@@ -5039,13 +5039,30 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
 // Other Lowering Hooks
 //===----------------------------------------------------------------------===//
 
-static bool MayFoldLoad(SDValue Op, bool AssumeSingleUse = false) {
-  return (AssumeSingleUse || Op.hasOneUse()) && ISD::isNormalLoad(Op.getNode());
+static bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+                        bool AssumeSingleUse = false) {
+  if (!AssumeSingleUse && !Op.hasOneUse())
+    return false;
+  if (!ISD::isNormalLoad(Op.getNode()))
+    return false;
+
+  // If this is an unaligned vector, make sure the target supports folding it.
+  auto *Ld = cast<LoadSDNode>(Op.getNode());
+  if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
+      Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
+    return false;
+
+  // TODO: If this is a non-temporal load and the target has an instruction
+  //       for it, it should not be folded. See "useNonTemporalLoad()".
+
+  return true;
 }
 
-static bool MayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+static bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+                                            const X86Subtarget &Subtarget,
                                             bool AssumeSingleUse = false) {
-  if (!MayFoldLoad(Op, AssumeSingleUse))
+  assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
+  if (!mayFoldLoad(Op, Subtarget, AssumeSingleUse))
     return false;
 
   // We can not replace a wide volatile load with a broadcast-from-memory,
@@ -8996,8 +9013,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
          Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
       } else {
         if (!Subtarget.hasAVX2() &&
-            !MayFoldLoadIntoBroadcastFromMem(
+            !mayFoldLoadIntoBroadcastFromMem(
                 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
+                Subtarget,
                 /*AssumeSingleUse=*/true))
           return SDValue();
         Broadcast =
@@ -12727,8 +12745,8 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
                                          &DAG](SDValue &Input,
                                                MutableArrayRef<int> InputMask) {
     unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
-    if (!Subtarget.hasAVX2() &&
-        (!Subtarget.hasAVX() || EltSizeInBits < 32 || !MayFoldLoad(Input)))
+    if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
+                                 !mayFoldLoad(Input, Subtarget)))
       return;
     if (isNoopShuffleMask(InputMask))
       return;
@@ -16413,7 +16431,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
   bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
   bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
   if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
-      MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
+      mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
     auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
     if (!Ld->isNonTemporal()) {
       MVT MemVT = VT.getHalfNumVectorElementsVT();
@@ -19413,7 +19431,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   // FIXME: relax the profitability check iff all N1 uses are insertions.
   if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
       ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
-       (Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) {
+       (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
+        mayFoldLoad(N1, Subtarget)))) {
     SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
     SmallVector<int, 8> BlendMask;
     for (unsigned i = 0; i != NumElts; ++i)
@@ -19486,7 +19505,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
     //   combine either bitwise AND or insert of float 0.0 to set these bits.
 
     bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
-    if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+    if (IdxVal == 0 && (!MinSize || !mayFoldLoad(N1, Subtarget))) {
       // If this is an insertion of 32-bits into the low 32-bits of
       // a vector, we prefer to generate a blend with immediate rather
       // than an insertps. Blends are simpler operations in hardware and so
@@ -24626,8 +24645,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   // being inserted between two CMOV's. (in i16 case too TBN)
   // https://bugs.llvm.org/show_bug.cgi?id=40974
   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
-      (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
-       !MayFoldLoad(Op2))) {
+      (Op.getValueType() == MVT::i16 && !mayFoldLoad(Op1, Subtarget) &&
+       !mayFoldLoad(Op2, Subtarget))) {
     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
     SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -36974,7 +36993,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     if (isUndefOrEqual(Mask, 0)) {
       if (V1.getValueType() == MaskVT &&
           V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-          MayFoldLoad(V1.getOperand(0))) {
+          mayFoldLoad(V1.getOperand(0), Subtarget)) {
         if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
           return SDValue(); // Nothing to do!
         Res = V1.getOperand(0);
@@ -38415,8 +38434,10 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
     SDValue N0 = V.getOperand(0);
     SDValue N1 = V.getOperand(1);
     unsigned Imm = V.getConstantOperandVal(2);
-    if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
-        MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+    const X86Subtarget &Subtarget =
+        static_cast<const X86Subtarget &>(DAG.getSubtarget());
+    if (!mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
+        mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
       return SDValue();
     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
@@ -51652,8 +51673,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
 
     // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
     if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
-        (Subtarget.hasAVX2() || MayFoldLoadIntoBroadcastFromMem(
-                                    Op0.getOperand(0), VT.getScalarType())))
+        (Subtarget.hasAVX2() ||
+         mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0), VT.getScalarType(),
+                                         Subtarget)))
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
                                      Op0.getOperand(0),
@@ -51662,7 +51684,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         (Subtarget.hasAVX2() ||
-         (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+         (EltSizeInBits >= 32 && mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
         Op0.getOperand(0).getValueType() == VT.getScalarType())
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
 
@@ -52994,7 +53016,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
   case ISD::SRL: {
     SDValue N0 = Op.getOperand(0);
     // Look out for (store (shl (load), x)).
-    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
+    if (mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
       return false;
     break;
   }
@@ -53009,11 +53031,11 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
     SDValue N0 = Op.getOperand(0);
     SDValue N1 = Op.getOperand(1);
     // Avoid disabling potential load folding opportunities.
-    if (MayFoldLoad(N1) &&
+    if (mayFoldLoad(N1, Subtarget) &&
         (!Commute || !isa<ConstantSDNode>(N0) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
       return false;
-    if (MayFoldLoad(N0) &&
+    if (mayFoldLoad(N0, Subtarget) &&
         ((Commute && !isa<ConstantSDNode>(N1)) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
       return false;

@@ -1398,40 +1398,40 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
 define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
 ; SSE2-LABEL: interleave_24i32_out:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqu 64(%rdi), %xmm9
 ; SSE2-NEXT:    movups 80(%rdi), %xmm8
-; SSE2-NEXT:    movups 64(%rdi), %xmm3
-; SSE2-NEXT:    movdqu (%rdi), %xmm1
-; SSE2-NEXT:    movups 16(%rdi), %xmm5
-; SSE2-NEXT:    movups 32(%rdi), %xmm10
-; SSE2-NEXT:    movdqu 48(%rdi), %xmm2
-; SSE2-NEXT:    movdqa %xmm1, %xmm11
-; SSE2-NEXT:    movaps %xmm10, %xmm7
-; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,1],xmm5[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm10[1,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,3],xmm5[0,2]
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    movaps %xmm8, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,3],xmm8[1,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,2]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[2,0]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,3]
-; SSE2-NEXT:    movups %xmm5, 16(%rsi)
-; SSE2-NEXT:    movups %xmm11, (%rsi)
-; SSE2-NEXT:    movups %xmm2, 16(%rdx)
-; SSE2-NEXT:    movups %xmm1, (%rdx)
-; SSE2-NEXT:    movups %xmm6, 16(%rcx)
-; SSE2-NEXT:    movups %xmm0, (%rcx)
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    movdqu 16(%rdi), %xmm10
+; SSE2-NEXT:    movups 32(%rdi), %xmm5
+; SSE2-NEXT:    movdqu 48(%rdi), %xmm3
+; SSE2-NEXT:    movaps %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,1,1,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[2,0]
+; SSE2-NEXT:    movaps %xmm8, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm9[2,3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm8[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,1],xmm10[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0]
+; SSE2-NEXT:    movups %xmm2, 16(%rsi)
+; SSE2-NEXT:    movups %xmm4, (%rsi)
+; SSE2-NEXT:    movups %xmm3, 16(%rdx)
+; SSE2-NEXT:    movups %xmm0, (%rdx)
+; SSE2-NEXT:    movups %xmm1, 16(%rcx)
+; SSE2-NEXT:    movups %xmm7, (%rcx)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: interleave_24i32_out:

@@ -97,20 +97,18 @@ define <4 x float> @t4_under_aligned(<4 x float>* %P) nounwind {
 ; X32-LABEL: t4_under_aligned:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movups (%eax), %xmm1
-; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; X32-NEXT:    movups (%eax), %xmm0
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; X32-NEXT:    retl
 ;
 ; ALIGN-LABEL: t4_under_aligned:
 ; ALIGN:       # %bb.0:
-; ALIGN-NEXT:    movups (%rdi), %xmm1
-; ALIGN-NEXT:    xorps %xmm2, %xmm2
-; ALIGN-NEXT:    xorps %xmm0, %xmm0
-; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
-; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; ALIGN-NEXT:    movups (%rdi), %xmm0
+; ALIGN-NEXT:    xorps %xmm1, %xmm1
+; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; ALIGN-NEXT:    retq
 ;
 ; UNALIGN-LABEL: t4_under_aligned: