From d7eb61929929fcb5dae77f63a5d9d9be026eaeb8 Mon Sep 17 00:00:00 2001 From: Tobias Grosser Date: Thu, 24 Aug 2017 09:46:25 +0000 Subject: [PATCH] Model cache size and associativity in TargetTransformInfo Summary: We add the precise cache sizes and associativity for the following Intel architectures: - Penryn - Nehalem - Westmere - Sandy Bridge - Ivy Bridge - Haswell - Broadwell - Skylake - Kabylake For several months, Polly has used a performance model for BLAS computations that derives optimal cache and register tile sizes from cache and latency information (based on ideas from "Analytical Modeling Is Enough for High-Performance BLIS", by Tze Meng Low, published in TOMS 2016). While bootstrapping this model, these target values have been kept in Polly. However, as our implementation is now rather mature, it seems time to teach LLVM itself about cache sizes. Interestingly, L1 and L2 cache sizes are pretty constant across micro-architectures, hence a set of architecture-specific default values seems like a good start. They can be expanded to more target-specific values, in case certain newer architectures require different values. For now, values for a set of Intel architectures are provided. Just as a little teaser, for a simple gemm kernel this model allows us to improve performance from 1.2s to 0.27s. For gemm kernels with less optimal memory layouts even larger speedups can be reported. 
Reviewers: Meinersbur, bollu, singam-sanjay, hfinkel, gareevroman, fhahn, sebpop, efriedma, asb Reviewed By: fhahn, asb Subscribers: lsaba, asb, pollydev, llvm-commits Differential Revision: https://reviews.llvm.org/D37051 llvm-svn: 311647 --- .../llvm/Analysis/TargetTransformInfo.h | 24 +++++++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 23 +++++++++ llvm/lib/Analysis/TargetTransformInfo.cpp | 10 ++++ .../lib/Target/X86/X86TargetTransformInfo.cpp | 51 +++++++++++++++++++ llvm/lib/Target/X86/X86TargetTransformInfo.h | 8 +++ 5 files changed, 116 insertions(+) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 8696f080fd7a..6830e3d9141b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -603,6 +603,22 @@ public: /// \return The size of a cache line in bytes. unsigned getCacheLineSize() const; + /// The possible cache levels + enum class CacheLevel { + L1D, // The L1 data cache + L2D, // The L2 data cache + + // We currently do not model L3 caches, as their sizes differ widely between + // microarchitectures. Also, we currently do not have a use for L3 cache + // size modeling yet. + }; + + /// \return The size of the cache level in bytes, if available. + llvm::Optional getCacheSize(CacheLevel Level) const; + + /// \return The associativity of the cache level, if available. + llvm::Optional getCacheAssociativity(CacheLevel Level) const; + /// \return How much before a load we should place the prefetch instruction. /// This is currently measured in number of instructions. 
unsigned getPrefetchDistance() const; @@ -937,6 +953,8 @@ public: virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; virtual unsigned getCacheLineSize() = 0; + virtual llvm::Optional getCacheSize(CacheLevel Level) = 0; + virtual llvm::Optional getCacheAssociativity(CacheLevel Level) = 0; virtual unsigned getPrefetchDistance() = 0; virtual unsigned getMinPrefetchStride() = 0; virtual unsigned getMaxPrefetchIterationsAhead() = 0; @@ -1209,6 +1227,12 @@ public: unsigned getCacheLineSize() override { return Impl.getCacheLineSize(); } + llvm::Optional getCacheSize(CacheLevel Level) override { + return Impl.getCacheSize(Level); + } + llvm::Optional getCacheAssociativity(CacheLevel Level) override { + return Impl.getCacheAssociativity(Level); + } unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); } unsigned getMinPrefetchStride() override { return Impl.getMinPrefetchStride(); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index aac659039b66..24989941ac83 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -340,6 +340,29 @@ public: unsigned getCacheLineSize() { return 0; } + llvm::Optional getCacheSize(TargetTransformInfo::CacheLevel Level) { + switch (Level) { + case TargetTransformInfo::CacheLevel::L1D: + LLVM_FALLTHROUGH; + case TargetTransformInfo::CacheLevel::L2D: + return llvm::Optional(); + } + + llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); + } + + llvm::Optional getCacheAssociativity( + TargetTransformInfo::CacheLevel Level) { + switch (Level) { + case TargetTransformInfo::CacheLevel::L1D: + LLVM_FALLTHROUGH; + case TargetTransformInfo::CacheLevel::L2D: + return llvm::Optional(); + } + + llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); + } + unsigned getPrefetchDistance() { return 0; } unsigned 
getMinPrefetchStride() { return 1; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 6cb7952d7967..e09138168c98 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -321,6 +321,16 @@ unsigned TargetTransformInfo::getCacheLineSize() const { return TTIImpl->getCacheLineSize(); } +llvm::Optional TargetTransformInfo::getCacheSize(CacheLevel Level) + const { + return TTIImpl->getCacheSize(Level); +} + +llvm::Optional TargetTransformInfo::getCacheAssociativity( + CacheLevel Level) const { + return TTIImpl->getCacheAssociativity(Level); +} + unsigned TargetTransformInfo::getPrefetchDistance() const { return TTIImpl->getPrefetchDistance(); } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 6c73a6875c84..eeced4086879 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -66,6 +66,57 @@ X86TTIImpl::getPopcntSupport(unsigned TyWidth) { return ST->hasPOPCNT() ? 
TTI::PSK_FastHardware : TTI::PSK_Software; } +llvm::Optional X86TTIImpl::getCacheSize( + TargetTransformInfo::CacheLevel Level) const { + switch (Level) { + case TargetTransformInfo::CacheLevel::L1D: + // - Penry + // - Nehalem + // - Westmere + // - Sandy Bridge + // - Ivy Bridge + // - Haswell + // - Broadwell + // - Skylake + // - Kabylake + return 32 * 1024; // 32 KByte + case TargetTransformInfo::CacheLevel::L2D: + // - Penry + // - Nehalem + // - Westmere + // - Sandy Bridge + // - Ivy Bridge + // - Haswell + // - Broadwell + // - Skylake + // - Kabylake + return 256 * 1024; // 256 KByte + } + + llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); +} + +llvm::Optional X86TTIImpl::getCacheAssociativity( + TargetTransformInfo::CacheLevel Level) const { + // - Penry + // - Nehalem + // - Westmere + // - Sandy Bridge + // - Ivy Bridge + // - Haswell + // - Broadwell + // - Skylake + // - Kabylake + switch (Level) { + case TargetTransformInfo::CacheLevel::L1D: + LLVM_FALLTHROUGH; + case TargetTransformInfo::CacheLevel::L2D: + return 8; + } + + llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); +} + unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasSSE1()) return 0; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index dc184030dee6..a8edc46ed575 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -47,6 +47,14 @@ public: /// @} + /// \name Cache TTI Implementation + /// @{ + llvm::Optional getCacheSize( + TargetTransformInfo::CacheLevel Level) const; + llvm::Optional getCacheAssociativity( + TargetTransformInfo::CacheLevel Level) const; + /// @} + /// \name Vector TTI Implementations /// @{