[X86][CostModel] X86TTIImpl::getMemoryOpCost(): rewrite vector handling again

Instead of splitting the large vector into power-of-two sized chunks,
handle it in a streaming fashion: start at the widest legal operation size
and halve the operational vector size once it no longer fits
the elements left to process.

Notably, this improves costs for overaligned loads: loading padding is fine.
This also more directly tracks when we need to insert/extract the YMM/XMM
subvector, so some costs fluctuate because of that.
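
For intuition, here is a minimal standalone sketch of that chunking order. It is
not the LLVM implementation: it only picks the per-chunk operation sizes and
ignores the subvector insert/extract and scalarization costs charged by the real
code, and the helper name streamChunks plus the example target (a 32-byte-wide,
AVX2-like machine) are purely illustrative assumptions.

  // Sketch only: choose descending operation sizes to cover NumElts elements.
  #include <cstdio>
  #include <vector>

  // Returns the per-chunk operation sizes (in bytes) used to cover NumElts
  // elements of EltBytes bytes each, starting from the widest legal op
  // (MaxOpBytes) and halving once the remaining elements no longer fill it.
  // AlignBytes models the known alignment: a sufficiently aligned load may
  // still use a full-width op even though it reads padding.
  std::vector<int> streamChunks(int NumElts, int EltBytes, int MaxOpBytes,
                                int AlignBytes, bool IsLoad) {
    std::vector<int> Chunks;
    int Remaining = NumElts;
    int OpBytes = MaxOpBytes;
    while (Remaining > 0) {
      int EltsPerOp = OpBytes / EltBytes;
      // Not enough elements left for this width, and not an over-aligned
      // load that may read padding: try the next smaller width.
      if (Remaining < EltsPerOp && !(IsLoad && AlignBytes >= OpBytes)) {
        OpBytes /= 2;
        continue;
      }
      Chunks.push_back(OpBytes);
      Remaining -= EltsPerOp;
    }
    return Chunks;
  }

  int main() {
    // <48 x i16> (96 bytes), 2-byte aligned, 32-byte max op: prints 32 32 32.
    for (int C : streamChunks(48, 2, 32, 2, /*IsLoad=*/true))
      printf("%d ", C);
    printf("\n");
    // <3 x float> (12 bytes), 16-byte aligned load, 16-byte max op: a single
    // 16-byte chunk; the extra lane is padding.
    for (int C : streamChunks(3, 4, 16, 16, /*IsLoad=*/true))
      printf("%d ", C);
    printf("\n");
  }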

Reviewed By: RKSimon, ABataev

Differential Revision: https://reviews.llvm.org/D100684
Roman Lebedev 2021-05-11 16:02:11 +03:00
parent 49950cb1f6
commit c02476f315
4 changed files with 1884 additions and 1800 deletions

@@ -3254,50 +3254,134 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
-  // Handle non-power-of-two vectors such as <3 x float> and <48 x i16>
-  if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
-    const unsigned NumElem = VTy->getNumElements();
-    if (!isPowerOf2_32(NumElem)) {
-      // Factorize NumElem into sum of power-of-two.
-      InstructionCost Cost = 0;
-      unsigned NumElemDone = 0;
-      for (unsigned NumElemLeft = NumElem, Factor;
-           Factor = PowerOf2Floor(NumElemLeft), NumElemLeft > 0;
-           NumElemLeft -= Factor) {
-        Type *SubTy = FixedVectorType::get(VTy->getScalarType(), Factor);
-        unsigned SubTyBytes = SubTy->getPrimitiveSizeInBits() / 8;
-        Cost +=
-            getMemoryOpCost(Opcode, SubTy, Alignment, AddressSpace, CostKind);
-        std::pair<InstructionCost, MVT> LST =
-            TLI->getTypeLegalizationCost(DL, SubTy);
-        if (!LST.second.isVector()) {
-          APInt DemandedElts =
-              APInt::getBitsSet(NumElem, NumElemDone, NumElemDone + Factor);
-          Cost += getScalarizationOverhead(VTy, DemandedElts,
-                                           Opcode == Instruction::Load,
-                                           Opcode == Instruction::Store);
-        }
-        NumElemDone += Factor;
-        Alignment = commonAlignment(Alignment.valueOrOne(), SubTyBytes);
-      }
-      assert(NumElemDone == NumElem && "Processed wrong element count?");
-      return Cost;
-    }
-  }
   // Legalize the type.
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
-  // Each load/store unit costs 1.
-  InstructionCost Cost = LT.first * 1;
+  auto *VTy = dyn_cast<FixedVectorType>(Src);
-  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
-  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
-  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
-    Cost *= 2;
+  // Handle the simple case of non-vectors.
+  // NOTE: this assumes that legalization never creates vector from scalars!
+  if (!VTy || !LT.second.isVector())
+    // Each load/store unit costs 1.
+    return LT.first * 1;
+  bool IsLoad = Opcode == Instruction::Load;
+  Type *EltTy = VTy->getElementType();
+  const int EltTyBits = DL.getTypeSizeInBits(EltTy);
+  assert(((EltTyBits > 0) && (EltTyBits % 8 == 0)) &&
+         "Expected byte-size types");
+  const int EltTyBytes = EltTyBits / 8;
+  assert(EltTyBytes != 0 && "Had sub-byte-sized type?");
+  InstructionCost Cost = 0;
+  // Source of truth: how many elements were there in the original IR vector?
+  const unsigned SrcNumElt = VTy->getNumElements();
+  // How far have we gotten?
+  int NumEltRemaining = SrcNumElt;
+  // Note that we intentionally capture by-reference, NumEltRemaining changes.
+  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
+  assert(LT.second.getSizeInBits() % 8 == 0 && "Non-byte-sized legal type?");
+  const int MaxLegalOpSizeBytes = LT.second.getSizeInBits() / 8;
+  assert(MaxLegalOpSizeBytes != 0 && "Legalized to sub-byte-sized type?");
+  // With what size are we currently operating?
+  int CurrOpSizeBytes = MaxLegalOpSizeBytes;
+  // How many elements would a single op deal with at once?
+  assert(CurrOpSizeBytes % EltTyBytes == 0 &&
+         "Operation size is not a multiple of element size?");
+  int CurrNumEltPerOp = CurrOpSizeBytes / EltTyBytes;
+  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
+  const unsigned XMMBits = 128;
+  assert(XMMBits % EltTyBits == 0 && "Filling XMM with EltTy leaves padding.");
+  const int NumEltPerXMM = XMMBits / EltTyBits;
+  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
+  for (int SubVecEltsLeft = 0; NumEltRemaining > 0;
+       CurrOpSizeBytes /= 2, CurrNumEltPerOp /= 2) {
+    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
+    assert((((NumEltRemaining * EltTyBytes) < (2 * CurrOpSizeBytes)) ||
+            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
+           "Unless we haven't halved the op size yet, "
+           "we have less than two op's sized units of work left.");
+    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
+                          ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
+                          : XMMVecTy;
+    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
+           "After halving sizes, the vector elt count is no longer a multiple "
+           "of number of elements per operation?");
+    auto *CoalescedVecTy =
+        CurrNumEltPerOp == 1
+            ? CurrVecTy
+            : FixedVectorType::get(
+                  IntegerType::get(Src->getContext(),
+                                   EltTyBits * CurrNumEltPerOp),
+                  CurrVecTy->getNumElements() / CurrNumEltPerOp);
+    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
+               DL.getTypeSizeInBits(CurrVecTy) &&
+           "Coalescing elements doesn't change vector width.");
+    while (NumEltRemaining > 0) {
+      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
+      // Can we use this vector size, as per the remaining element count?
+      // Iff the vector is naturally aligned, we can do a wide load regardless.
+      if (NumEltRemaining < CurrNumEltPerOp &&
+          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes))
+        break; // Try smaller vector size.
+      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
+      // If we have fully processed the previous reg, we need to replenish it.
+      if (SubVecEltsLeft == 0) {
+        SubVecEltsLeft += CurrVecTy->getNumElements();
+        // And that's free only for the 0'th subvector of a legalized vector.
+        if (!Is0thSubVec)
+          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+                                        : TTI::ShuffleKind::SK_ExtractSubvector,
+                                 VTy, None, NumEltDone(), CurrVecTy);
+      }
+      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
+      // for smaller widths (32/16/8) we have to insert/extract them separately.
+      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
+      // but let's pretend that it is also true for 16/8 bit wide ops...)
+      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
+        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
+        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
+        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
+        APInt DemandedElts =
+            APInt::getBitsSet(CoalescedVecTy->getNumElements(),
+                              CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
+        assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
+        Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
+                                         !IsLoad);
+      }
+      // This isn't exactly right. We're using slow unaligned 32-byte accesses
+      // as a proxy for a double-pumped AVX memory interface such as on
+      // Sandybridge.
+      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+        Cost += 2;
+      else
+        Cost += 1;
+      SubVecEltsLeft -= CurrNumEltPerOp;
+      NumEltRemaining -= CurrNumEltPerOp;
+      Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
+    }
+  }
+  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
   return Cost;
 }
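
As a worked (and hedged) reading of the loop above: for a load of <3 x float>
that is known to be 16-byte aligned, the "(!IsLoad || Alignment.valueOrOne() <
CurrOpSizeBytes)" guard lets the full 16-byte XMM load proceed even though it
reads 4 bytes of padding, so the whole vector is costed as a single load. If the
same load is only 4-byte aligned, the loop instead emits an 8-byte piece plus a
4-byte piece, and the 4-byte piece additionally pays the insert into the XMM
register, so the estimate comes out around 3 depending on the subtarget's
insertelement cost. This is the "overaligned loads" improvement mentioned in
the commit message.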

@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
-; CHECK: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2

@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
-; CHECK: LV: Found an estimated cost of 34 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2
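
(For context: these "Found an estimated cost of N for VF M" lines are the per-VF
costs the loop vectorizer reports in its debug output, presumably produced by a
RUN line along the lines of "opt -loop-vectorize -debug-only=loop-vectorize";
the exact RUN lines are not shown in this excerpt. The one-unit bumps at VF 2
and VF 4 in both the load and the store test are the cost fluctuations from the
more precise subvector insert/extract tracking that the commit message
mentions.)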

File diff suppressed because it is too large.