[X86][CostModel] X86TTIImpl::getMemoryOpCost(): rewrite vector handling again

Instead of splitting the large vector into power-of-two sized chunks,
handle it in a streaming fashion: start at the widest legal operation size
and halve the operational vector size once it no longer fits
the elements left to process.

Notably, this improves costs for overaligned loads: loading padding is fine.
This also more directly tracks when we need to insert/extract the YMM/XMM
subvector, so some costs fluctuate because of that.
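
For intuition, here is a minimal standalone sketch of that chunking order. It is
not the LLVM implementation: it only picks the per-chunk operation sizes and
ignores the subvector insert/extract and scalarization costs charged by the real
code, and the helper name streamChunks plus the example target (a 32-byte-wide,
AVX2-like machine) are purely illustrative assumptions.

  // Sketch only: choose descending operation sizes to cover NumElts elements.
  #include <cstdio>
  #include <vector>

  // Returns the per-chunk operation sizes (in bytes) used to cover NumElts
  // elements of EltBytes bytes each, starting from the widest legal op
  // (MaxOpBytes) and halving once the remaining elements no longer fill it.
  // AlignBytes models the known alignment: a sufficiently aligned load may
  // still use a full-width op even though it reads padding.
  std::vector<int> streamChunks(int NumElts, int EltBytes, int MaxOpBytes,
                                int AlignBytes, bool IsLoad) {
    std::vector<int> Chunks;
    int Remaining = NumElts;
    int OpBytes = MaxOpBytes;
    while (Remaining > 0) {
      int EltsPerOp = OpBytes / EltBytes;
      // Not enough elements left for this width, and not an over-aligned
      // load that may read padding: try the next smaller width.
      if (Remaining < EltsPerOp && !(IsLoad && AlignBytes >= OpBytes)) {
        OpBytes /= 2;
        continue;
      }
      Chunks.push_back(OpBytes);
      Remaining -= EltsPerOp;
    }
    return Chunks;
  }

  int main() {
    // <48 x i16> (96 bytes), 2-byte aligned, 32-byte max op: prints 32 32 32.
    for (int C : streamChunks(48, 2, 32, 2, /*IsLoad=*/true))
      printf("%d ", C);
    printf("\n");
    // <3 x float> (12 bytes), 16-byte aligned load, 16-byte max op: a single
    // 16-byte chunk; the extra lane is padding.
    for (int C : streamChunks(3, 4, 16, 16, /*IsLoad=*/true))
      printf("%d ", C);
    printf("\n");
  }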

Reviewed By: RKSimon, ABataev

Differential Revision: https://reviews.llvm.org/D100684
Roman Lebedev 2021-05-11 16:02:11 +03:00
parent 49950cb1f6
commit c02476f315
4 changed files with 1884 additions and 1800 deletions

@@ -3254,50 +3254,134 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
-  // Handle non-power-of-two vectors such as <3 x float> and <48 x i16>
-  if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
-    const unsigned NumElem = VTy->getNumElements();
-    if (!isPowerOf2_32(NumElem)) {
-      // Factorize NumElem into sum of power-of-two.
-      InstructionCost Cost = 0;
-      unsigned NumElemDone = 0;
-      for (unsigned NumElemLeft = NumElem, Factor;
-           Factor = PowerOf2Floor(NumElemLeft), NumElemLeft > 0;
-           NumElemLeft -= Factor) {
-        Type *SubTy = FixedVectorType::get(VTy->getScalarType(), Factor);
-        unsigned SubTyBytes = SubTy->getPrimitiveSizeInBits() / 8;
-        Cost +=
-            getMemoryOpCost(Opcode, SubTy, Alignment, AddressSpace, CostKind);
-        std::pair<InstructionCost, MVT> LST =
-            TLI->getTypeLegalizationCost(DL, SubTy);
-        if (!LST.second.isVector()) {
-          APInt DemandedElts =
-              APInt::getBitsSet(NumElem, NumElemDone, NumElemDone + Factor);
-          Cost += getScalarizationOverhead(VTy, DemandedElts,
-                                           Opcode == Instruction::Load,
-                                           Opcode == Instruction::Store);
-        }
-        NumElemDone += Factor;
-        Alignment = commonAlignment(Alignment.valueOrOne(), SubTyBytes);
-      }
-      assert(NumElemDone == NumElem && "Processed wrong element count?");
-      return Cost;
-    }
-  }
   // Legalize the type.
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
-  // Each load/store unit costs 1.
-  InstructionCost Cost = LT.first * 1;
+  auto *VTy = dyn_cast<FixedVectorType>(Src);
-  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
-  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
-  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
-    Cost *= 2;
+  // Handle the simple case of non-vectors.
+  // NOTE: this assumes that legalization never creates vector from scalars!
+  if (!VTy || !LT.second.isVector())
+    // Each load/store unit costs 1.
+    return LT.first * 1;
+  bool IsLoad = Opcode == Instruction::Load;
+  Type *EltTy = VTy->getElementType();
+  const int EltTyBits = DL.getTypeSizeInBits(EltTy);
+  assert(((EltTyBits > 0) && (EltTyBits % 8 == 0)) &&
+         "Expected byte-size types");
+  const int EltTyBytes = EltTyBits / 8;
+  assert(EltTyBytes != 0 && "Had sub-byte-sized type?");
+  InstructionCost Cost = 0;
+  // Source of truth: how many elements were there in the original IR vector?
+  const unsigned SrcNumElt = VTy->getNumElements();
+  // How far have we gotten?
+  int NumEltRemaining = SrcNumElt;
+  // Note that we intentionally capture by-reference, NumEltRemaining changes.
+  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
+  assert(LT.second.getSizeInBits() % 8 == 0 && "Non-byte-sized legal type?");
+  const int MaxLegalOpSizeBytes = LT.second.getSizeInBits() / 8;
+  assert(MaxLegalOpSizeBytes != 0 && "Legalized to sub-byte-sized type?");
+  // With what size are we currently operating?
+  int CurrOpSizeBytes = MaxLegalOpSizeBytes;
+  // How many elements would a single op deal with at once?
+  assert(CurrOpSizeBytes % EltTyBytes == 0 &&
+         "Operation size is not a multiple of element size?");
+  int CurrNumEltPerOp = CurrOpSizeBytes / EltTyBytes;
+  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
+  const unsigned XMMBits = 128;
+  assert(XMMBits % EltTyBits == 0 && "Filling XMM with EltTy leaves padding.");
+  const int NumEltPerXMM = XMMBits / EltTyBits;
+  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
+  for (int SubVecEltsLeft = 0; NumEltRemaining > 0;
+       CurrOpSizeBytes /= 2, CurrNumEltPerOp /= 2) {
+    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
+    assert((((NumEltRemaining * EltTyBytes) < (2 * CurrOpSizeBytes)) ||
+            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
+           "Unless we haven't halved the op size yet, "
+           "we have less than two op's sized units of work left.");
+    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
+                          ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
+                          : XMMVecTy;
+    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
+           "After halving sizes, the vector elt count is no longer a multiple "
+           "of number of elements per operation?");
+    auto *CoalescedVecTy =
+        CurrNumEltPerOp == 1
+            ? CurrVecTy
+            : FixedVectorType::get(
+                  IntegerType::get(Src->getContext(),
+                                   EltTyBits * CurrNumEltPerOp),
+                  CurrVecTy->getNumElements() / CurrNumEltPerOp);
+    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
+               DL.getTypeSizeInBits(CurrVecTy) &&
+           "Coalescing elements doesn't change vector width.");
+    while (NumEltRemaining > 0) {
+      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
+      // Can we use this vector size, as per the remaining element count?
+      // Iff the vector is naturally aligned, we can do a wide load regardless.
+      if (NumEltRemaining < CurrNumEltPerOp &&
+          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes))
+        break; // Try smaller vector size.
+      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
+      // If we have fully processed the previous reg, we need to replenish it.
+      if (SubVecEltsLeft == 0) {
+        SubVecEltsLeft += CurrVecTy->getNumElements();
+        // And that's free only for the 0'th subvector of a legalized vector.
+        if (!Is0thSubVec)
+          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+                                        : TTI::ShuffleKind::SK_ExtractSubvector,
+                                 VTy, None, NumEltDone(), CurrVecTy);
+      }
+      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
+      // for smaller widths (32/16/8) we have to insert/extract them separately.
+      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
+      // but let's pretend that it is also true for 16/8 bit wide ops...)
+      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
+        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
+        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
+        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
+        APInt DemandedElts =
+            APInt::getBitsSet(CoalescedVecTy->getNumElements(),
+                              CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
+        assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
+        Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
+                                         !IsLoad);
+      }
+      // This isn't exactly right. We're using slow unaligned 32-byte accesses
+      // as a proxy for a double-pumped AVX memory interface such as on
+      // Sandybridge.
+      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+        Cost += 2;
+      else
+        Cost += 1;
+      SubVecEltsLeft -= CurrNumEltPerOp;
+      NumEltRemaining -= CurrNumEltPerOp;
+      Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
+    }
+  }
+  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
   return Cost;
 }
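
As a worked (and hedged) reading of the loop above: for a load of <3 x float>
that is known to be 16-byte aligned, the "(!IsLoad || Alignment.valueOrOne() <
CurrOpSizeBytes)" guard lets the full 16-byte XMM load proceed even though it
reads 4 bytes of padding, so the whole vector is costed as a single load. If the
same load is only 4-byte aligned, the loop instead emits an 8-byte piece plus a
4-byte piece, and the 4-byte piece additionally pays the insert into the XMM
register, so the estimate comes out around 3 depending on the subtarget's
insertelement cost. This is the "overaligned loads" improvement mentioned in
the commit message.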

@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
-; CHECK: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2

@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
-; CHECK: LV: Found an estimated cost of 34 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2
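
(For context: these "Found an estimated cost of N for VF M" lines are the per-VF
costs the loop vectorizer reports in its debug output, presumably produced by a
RUN line along the lines of "opt -loop-vectorize -debug-only=loop-vectorize";
the exact RUN lines are not shown in this excerpt. The one-unit bumps at VF 2
and VF 4 in both the load and the store test are the cost fluctuations from the
more precise subvector insert/extract tracking that the commit message
mentions.)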

File diff suppressed because it is too large.