diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ed04cdc4e407..437f63d3280b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -17,8 +17,11 @@
 #define DEBUG_TYPE "x86tti"
 #include "X86.h"
 #include "X86TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
@@ -31,6 +34,17 @@ namespace llvm {
 void initializeX86TTIPass(PassRegistry &);
 }
 
+static cl::opt<bool>
+UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
+  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
+  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
+  cl::desc("Threshold for taken branches in X86 partial unrolling"),
+  cl::Hidden);
+
 namespace {
 
 class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -73,6 +87,8 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
   PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+  void getUnrollingPreferences(Loop *L,
+                               UnrollingPreferences &UP) const override;
 
   /// @}
 
@@ -137,6 +153,93 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
+void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+  if (!UsePartialUnrolling)
+    return;
+  // According to the Intel 64 and IA-32 Architectures Optimization Reference
+  // Manual, Intel Core models and later have a loop stream detector
+  // (and associated uop queue) that can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have no more than 4 (8 for Nehalem and later) taken
+  //    branches, and none of them may be calls.
+  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+  // According to the Software Optimization Guide for AMD Family 15h Processors,
+  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+  // buffer which can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have fewer than 16 branches.
+  //  - The loop must have fewer than 40 uops in all executed loop branches.
+
+  unsigned MaxBranches, MaxOps;
+  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
+    MaxBranches = PartialUnrollingMaxBranches;
+    MaxOps = PartialUnrollingThreshold;
+  } else if (ST->isAtom()) {
+    // On the Atom, the throughput for taken branches is 2 cycles. For small
+    // simple loops, expand by a small factor to hide the backedge cost.
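+    // (Illustrative note, not from the cited manuals: with the 10-uop budget
+    // below, a roughly 5-uop body can be doubled, halving the number of taken
+    // backedge branches per original iteration.)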
+    MaxBranches = 2;
+    MaxOps = 10;
+  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
+    MaxBranches = 16;
+    MaxOps = 40;
+  } else if (ST->hasFMA4() /* Any other recent AMD */) {
+    return;
+  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
+    MaxBranches = 8;
+    MaxOps = 28;
+  } else if (ST->hasSSSE3() /* Intel Core */) {
+    MaxBranches = 4;
+    MaxOps = 18;
+  } else {
+    return;
+  }
+
+  // Scan the loop: don't unroll loops with calls, and count the potential
+  // number of taken branches (this is somewhat conservative because we're
+  // counting all block transitions as potential branches while in reality some
+  // of these will become implicit via block placement).
+  unsigned MaxDepth = 0;
+  for (df_iterator<BasicBlock *> DI = df_begin(L->getHeader()),
+       DE = df_end(L->getHeader()); DI != DE;) {
+    if (!L->contains(*DI)) {
+      DI.skipChildren();
+      continue;
+    }
+
+    MaxDepth = std::max(MaxDepth, DI.getPathLength());
+    if (MaxDepth > MaxBranches)
+      return;
+
+    for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
+      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+        ImmutableCallSite CS(I);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (!isLoweredToCall(F))
+            continue;
+        }
+
+        return;
+      }
+
+    ++DI;
+  }
+
+  // Enable runtime and partial unrolling up to the specified size.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+
+  // Set the maximum count based on the loop depth. The maximum number of
+  // branches taken in a loop (including the backedge) is equal to the maximum
+  // loop depth (the DFS path length from the loop header to any block in the
+  // loop). When the loop is unrolled, this depth (except for the backedge
+  // itself) is multiplied by the unrolling factor. This new unrolled depth
+  // must be less than the target-specific maximum branch count (which limits
+  // the number of taken branches in the uop buffer).
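+  //
+  // Worked example (editor's illustration): with MaxBranches == 8 (Nehalem
+  // and later), a loop whose deepest DFS path from the header covers three
+  // blocks (MaxDepth == 3) gets UP.MaxCount = (8 - 1) / (3 - 1) == 3, so at
+  // most three copies of the body are allowed; a single-block loop
+  // (MaxDepth == 1) is capped only by the uop threshold above.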
+  if (MaxDepth > 1)
+    UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
+}
+
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasSSE1())
     return 0;
diff --git a/llvm/test/Transforms/LoopUnroll/X86/lit.local.cfg b/llvm/test/Transforms/LoopUnroll/X86/lit.local.cfg
new file mode 100644
index 000000000000..ba763cf03ffc
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/X86/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+    config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopUnroll/X86/partial.ll b/llvm/test/Transforms/LoopUnroll/X86/partial.ll
new file mode 100644
index 000000000000..15867cbea0ac
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/X86/partial.ll
@@ -0,0 +1,80 @@
+; RUN: opt < %s -S -loop-unroll -mcpu=nehalem -x86-use-partial-unrolling=1 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mcpu=core -x86-use-partial-unrolling=1 | FileCheck -check-prefix=CHECK-NOUNRL %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i32* noalias nocapture readnone %ip, double %alpha, double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds double* %b, i64 %index
+  %1 = bitcast double* %0 to <2 x double>*
+  %wide.load = load <2 x double>* %1, align 8
+  %.sum9 = or i64 %index, 2
+  %2 = getelementptr double* %b, i64 %.sum9
+  %3 = bitcast double* %2 to <2 x double>*
+  %wide.load8 = load <2 x double>* %3, align 8
+  %4 = fadd <2 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00>
+  %5 = fadd <2 x double> %wide.load8, <double 1.000000e+00, double 1.000000e+00>
+  %6 = getelementptr inbounds double* %a, i64 %index
+  %7 = bitcast double* %6 to <2 x double>*
+  store <2 x double> %4, <2 x double>* %7, align 8
+  %.sum10 = or i64 %index, 2
+  %8 = getelementptr double* %a, i64 %.sum10
+  %9 = bitcast double* %8 to <2 x double>*
+  store <2 x double> %5, <2 x double>* %9, align 8
+  %index.next = add i64 %index, 4
+  %10 = icmp eq i64 %index.next, 1600
+  br i1 %10, label %for.end, label %vector.body
+
+; FIXME: We should probably unroll this loop by a factor of 2, but the cost
+; model needs to be fixed to account for instructions likely to be folded
+; as part of an addressing mode.
+; CHECK-LABEL: @foo
+; CHECK-NOUNRL-LABEL: @foo
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+define void @bar(i32* noalias nocapture readnone %ip, double %alpha, double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %v0 = getelementptr inbounds double* %b, i64 %index
+  %v1 = bitcast double* %v0 to <2 x double>*
+  %wide.load = load <2 x double>* %v1, align 8
+  %v4 = fadd <2 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00>
+  %v5 = fmul <2 x double> %v4, <double 8.000000e+00, double 8.000000e+00>
+  %v6 = getelementptr inbounds double* %a, i64 %index
+  %v7 = bitcast double* %v6 to <2 x double>*
+  store <2 x double> %v5, <2 x double>* %v7, align 8
+  %index.next = add i64 %index, 2
+  %v10 = icmp eq i64 %index.next, 1600
+  br i1 %v10, label %for.end, label %vector.body
+
+; FIXME: We should probably unroll this loop by a factor of 2, but the cost
+; model needs to be fixed first to account for instructions likely to be folded
+; as part of an addressing mode.
+
+; CHECK-LABEL: @bar
+; CHECK: fadd
+; CHECK-NEXT: fmul
+; CHECK: fadd
+; CHECK-NEXT: fmul
+
+; CHECK-NOUNRL-LABEL: @bar
+; CHECK-NOUNRL: fadd
+; CHECK-NOUNRL-NEXT: fmul
+; CHECK-NOUNRL-NOT: fadd
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index e98a4acddea5..224823b8ed5d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1,13 +1,13 @@
-; RUN: opt < %s -mcpu=corei7 -O1 -S | FileCheck %s --check-prefix=O1
-; RUN: opt < %s -mcpu=corei7 -O2 -S | FileCheck %s --check-prefix=O2
-; RUN: opt < %s -mcpu=corei7 -O3 -S | FileCheck %s --check-prefix=O3
-; RUN: opt < %s -mcpu=corei7 -Os -S | FileCheck %s --check-prefix=Os
-; RUN: opt < %s -mcpu=corei7 -Oz -S | FileCheck %s --check-prefix=Oz
-; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S | FileCheck %s --check-prefix=O1VEC
-; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S | FileCheck %s --check-prefix=OzVEC
-; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S | FileCheck %s --check-prefix=O1VEC2
-; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S | FileCheck %s --check-prefix=OzVEC2
-; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S | FileCheck %s --check-prefix=O3DIS
+; RUN: opt < %s -mcpu=corei7 -O1 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -Os -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3DIS
 
 ; This file tests the llvm.vectorizer.pragma forcing vectorization even when
 ; optimization levels are too low, or when vectorization is disabled.
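
Editor's note (not part of the patch): the unroll-count cap computed in getUnrollingPreferences reduces to a small integer formula. The standalone sketch below, using the hypothetical helper name computeMaxCount, simply tabulates UP.MaxCount for the per-target MaxBranches values selected in the patch; it is illustrative only and assumes nothing beyond the arithmetic shown above.

#include <cstdio>

// Mirrors UP.MaxCount = (MaxBranches - 1) / (MaxDepth - 1) from the patch;
// returns 0 when MaxDepth <= 1, i.e. when the patch leaves UP.MaxCount unset.
static unsigned computeMaxCount(unsigned MaxBranches, unsigned MaxDepth) {
  if (MaxDepth <= 1)
    return 0;
  return (MaxBranches - 1) / (MaxDepth - 1);
}

int main() {
  // MaxBranches as selected in getUnrollingPreferences: Atom = 2,
  // Intel Core = 4, Nehalem and later = 8, Steamroller and later = 16.
  const unsigned BranchLimits[] = {2, 4, 8, 16};
  for (unsigned MaxBranches : BranchLimits)
    for (unsigned MaxDepth = 2; MaxDepth <= 4; ++MaxDepth)
      std::printf("MaxBranches=%2u MaxDepth=%u -> MaxCount=%u\n", MaxBranches,
                  MaxDepth, computeMaxCount(MaxBranches, MaxDepth));
  return 0;
}

For single-block loops (MaxDepth == 1) the cap is left unset and only UP.PartialThreshold (MaxOps) limits the unroll count, which is roughly why @bar in partial.ll is expected to be partially unrolled under -mcpu=nehalem but not under -mcpu=core, as its CHECK and CHECK-NOUNRL lines indicate.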