[x86] enhance mayFoldLoad to check alignment

As noted in D112464, a pre-AVX target may not be able to fold an
under-aligned vector load into another op, so we shouldn't report
that as a load folding candidate. I only found one caller where
this would make a difference -- combineCommutableSHUFP() -- so
that's where I added a test to show the (minor) regression.

Differential Revision: https://reviews.llvm.org/D112545
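For context, a minimal IR sketch (not part of this patch; the function and value names are made up) of the case the new check guards against. On an SSE2-only target, most 128-bit memory operands must be 16-byte aligned, so an under-aligned load has to stay a separate movups and cannot be folded into the instruction that uses it:

define <4 x float> @fold_candidate(<4 x float>* %p, <4 x float> %x) {
  ; align 4 is below the 16-byte requirement for SSE memory operands
  %v = load <4 x float>, <4 x float>* %p, align 4
  ; pre-AVX this becomes movups + addps %xmm1, %xmm0 rather than addps (%rdi), %xmm0
  %r = fadd <4 x float> %x, %v
  ret <4 x float> %r
}

With align 16, or on AVX targets (and subtargets reporting hasSSEUnalignedMem()), the load could legally be folded as a memory operand, which is the distinction the updated mayFoldLoad() now makes.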
Sanjay Patel 2021-10-27 07:53:14 -04:00
parent 6edc509719
commit 6c0a2c2804
3 changed files with 84 additions and 64 deletions
llvm


@@ -5039,13 +5039,30 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
static bool MayFoldLoad(SDValue Op, bool AssumeSingleUse = false) {
return (AssumeSingleUse || Op.hasOneUse()) && ISD::isNormalLoad(Op.getNode());
static bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
bool AssumeSingleUse = false) {
if (!AssumeSingleUse && !Op.hasOneUse())
return false;
if (!ISD::isNormalLoad(Op.getNode()))
return false;
// If this is an unaligned vector, make sure the target supports folding it.
auto *Ld = cast<LoadSDNode>(Op.getNode());
if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
return false;
// TODO: If this is a non-temporal load and the target has an instruction
// for it, it should not be folded. See "useNonTemporalLoad()".
return true;
}
static bool MayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
static bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
const X86Subtarget &Subtarget,
bool AssumeSingleUse = false) {
if (!MayFoldLoad(Op, AssumeSingleUse))
assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
if (!mayFoldLoad(Op, Subtarget, AssumeSingleUse))
return false;
// We can not replace a wide volatile load with a broadcast-from-memory,
@@ -8996,8 +9013,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
} else {
if (!Subtarget.hasAVX2() &&
!MayFoldLoadIntoBroadcastFromMem(
!mayFoldLoadIntoBroadcastFromMem(
RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
Subtarget,
/*AssumeSingleUse=*/true))
return SDValue();
Broadcast =
@@ -12727,8 +12745,8 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
&DAG](SDValue &Input,
MutableArrayRef<int> InputMask) {
unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
if (!Subtarget.hasAVX2() &&
(!Subtarget.hasAVX() || EltSizeInBits < 32 || !MayFoldLoad(Input)))
if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
!mayFoldLoad(Input, Subtarget)))
return;
if (isNoopShuffleMask(InputMask))
return;
@@ -16413,7 +16431,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
if (!Ld->isNonTemporal()) {
MVT MemVT = VT.getHalfNumVectorElementsVT();
@@ -19413,7 +19431,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// FIXME: relax the profitability check iff all N1 uses are insertions.
if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
(Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) {
(Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
mayFoldLoad(N1, Subtarget)))) {
SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
@@ -19486,7 +19505,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// combine either bitwise AND or insert of float 0.0 to set these bits.
bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
if (IdxVal == 0 && (!MinSize || !mayFoldLoad(N1, Subtarget))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
@@ -24626,8 +24645,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// being inserted between two CMOV's. (in i16 case too TBN)
// https://bugs.llvm.org/show_bug.cgi?id=40974
if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
(Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
!MayFoldLoad(Op2))) {
(Op.getValueType() == MVT::i16 && !mayFoldLoad(Op1, Subtarget) &&
!mayFoldLoad(Op2, Subtarget))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -36974,7 +36993,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (isUndefOrEqual(Mask, 0)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
mayFoldLoad(V1.getOperand(0), Subtarget)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
@@ -38415,8 +38434,10 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
SDValue N0 = V.getOperand(0);
SDValue N1 = V.getOperand(1);
unsigned Imm = V.getConstantOperandVal(2);
if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
MayFoldLoad(peekThroughOneUseBitcasts(N1)))
const X86Subtarget &Subtarget =
static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (!mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
@@ -51652,8 +51673,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoadIntoBroadcastFromMem(
Op0.getOperand(0), VT.getScalarType())))
(Subtarget.hasAVX2() ||
mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0), VT.getScalarType(),
Subtarget)))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
Op0.getOperand(0),
@@ -51662,7 +51684,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
(EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
(EltSizeInBits >= 32 && mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
@@ -52994,7 +53016,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
if (mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
return false;
break;
}
@@ -53009,11 +53031,11 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// Avoid disabling potential load folding opportunities.
if (MayFoldLoad(N1) &&
if (mayFoldLoad(N1, Subtarget) &&
(!Commute || !isa<ConstantSDNode>(N0) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
if (MayFoldLoad(N0) &&
if (mayFoldLoad(N0, Subtarget) &&
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;


@@ -1398,40 +1398,40 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu 64(%rdi), %xmm9
; SSE2-NEXT: movups 80(%rdi), %xmm8
; SSE2-NEXT: movups 64(%rdi), %xmm3
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movups 16(%rdi), %xmm5
; SSE2-NEXT: movups 32(%rdi), %xmm10
; SSE2-NEXT: movdqu 48(%rdi), %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm11
; SSE2-NEXT: movaps %xmm10, %xmm7
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm5[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm10[1,1]
; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm5[0,2]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: movaps %xmm8, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm8[1,1]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[2,0]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,3]
; SSE2-NEXT: movups %xmm5, 16(%rsi)
; SSE2-NEXT: movups %xmm11, (%rsi)
; SSE2-NEXT: movups %xmm2, 16(%rdx)
; SSE2-NEXT: movups %xmm1, (%rdx)
; SSE2-NEXT: movups %xmm6, 16(%rcx)
; SSE2-NEXT: movups %xmm0, (%rcx)
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu 16(%rdi), %xmm10
; SSE2-NEXT: movups 32(%rdi), %xmm5
; SSE2-NEXT: movdqu 48(%rdi), %xmm3
; SSE2-NEXT: movaps %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,1,1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[2,0]
; SSE2-NEXT: movaps %xmm8, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm9[2,3]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm8[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[3,3]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm10[3,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0]
; SSE2-NEXT: movups %xmm2, 16(%rsi)
; SSE2-NEXT: movups %xmm4, (%rsi)
; SSE2-NEXT: movups %xmm3, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
; SSE2-NEXT: movups %xmm1, 16(%rcx)
; SSE2-NEXT: movups %xmm7, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:


@@ -97,20 +97,18 @@ define <4 x float> @t4_under_aligned(<4 x float>* %P) nounwind {
; X32-LABEL: t4_under_aligned:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups (%eax), %xmm1
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; X32-NEXT: movups (%eax), %xmm0
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; X32-NEXT: retl
;
; ALIGN-LABEL: t4_under_aligned:
; ALIGN: # %bb.0:
; ALIGN-NEXT: movups (%rdi), %xmm1
; ALIGN-NEXT: xorps %xmm2, %xmm2
; ALIGN-NEXT: xorps %xmm0, %xmm0
; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; ALIGN-NEXT: movups (%rdi), %xmm0
; ALIGN-NEXT: xorps %xmm1, %xmm1
; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; ALIGN-NEXT: retq
;
; UNALIGN-LABEL: t4_under_aligned: