[AArch64] Add target hook for preferPredicateOverEpilogue
This patch adds the AArch64 hook for preferPredicateOverEpilogue, which currently returns true if SVE is enabled and one of the following conditions (non-exhaustive) is met:

1. The "sve-tail-folding" option is set to "all", or
2. The "sve-tail-folding" option is set to "all+noreductions" and the loop does not contain reductions, or
3. The "sve-tail-folding" option is set to "all+norecurrences" and the loop has no first-order recurrences.

Currently the default option is "disabled", but this will be changed in a later patch.

I've added new tests to show the options behave as expected here:

  Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll

Differential Revision: https://reviews.llvm.org/D129560
parent b5871dfaf3
commit f15b6b2907
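For readers skimming the diff, here is a minimal standalone sketch of the decision the new AArch64 hook makes. The bit values and the final mask comparison mirror the TailFoldingKind class and AArch64TTIImpl::preferPredicateOverEpilogue added further down; the parameters HasSVE, LoopHasReductions and LoopHasRecurrences are hypothetical stand-ins for the ST->hasSVE(), LVL->getReductionVars() and LVL->getFirstOrderRecurrences() queries used in the real patch.

// Minimal sketch (not part of the patch) of the tail-folding decision the
// new hook implements. The enum values and the mask comparison come from the
// diff below; the bool parameters are hypothetical stand-ins for the
// subtarget and LoopVectorizationLegality queries.
#include <cstdint>

enum TailFoldingOpts : uint8_t {
  TFDisabled = 0x0,
  TFReductions = 0x01,
  TFRecurrences = 0x02,
  TFSimple = 0x80,
  TFAll = TFReductions | TFRecurrences | TFSimple
};

bool preferPredicateOverEpilogueSketch(uint8_t EnabledBits, bool HasSVE,
                                       bool LoopHasReductions,
                                       bool LoopHasRecurrences) {
  // Tail-folding is only considered with SVE and a non-disabled option.
  if (!HasSVE || EnabledBits == TFDisabled)
    return false;

  // Collect the features this loop needs; a loop with neither reductions nor
  // first-order recurrences only needs the "simple" bit.
  uint8_t Required = 0;
  if (LoopHasReductions)
    Required |= TFReductions;
  if (LoopHasRecurrences)
    Required |= TFRecurrences;
  if (!Required)
    Required = TFSimple;

  // Every required bit must be enabled, e.g. "all+noreductions" clears
  // TFReductions, so a loop containing a reduction keeps its scalar epilogue.
  return (EnabledBits & Required) == Required;
}

Under the "all" setting EnabledBits is 0x83, so every Required mask is satisfied; under "all+norecurrences" it is 0x81, so only loops without first-order recurrences pass the check.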
@@ -52,6 +52,7 @@ class LoadInst;
 class LoopAccessInfo;
 class Loop;
 class LoopInfo;
+class LoopVectorizationLegality;
 class ProfileSummaryInfo;
 class RecurrenceDescriptor;
 class SCEV;
@@ -530,7 +531,7 @@ public:
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   const LoopAccessInfo *LAI) const;
+                                   LoopVectorizationLegality *LVL) const;

   /// Query the target whether lowering of the llvm.get.active.lane.mask
   /// intrinsic is supported and how the mask should be used. A return value
@@ -1555,10 +1556,12 @@ public:
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) = 0;
-  virtual bool
-  preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
-                              AssumptionCache &AC, TargetLibraryInfo *TLI,
-                              DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
+  virtual bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+                                           ScalarEvolution &SE,
+                                           AssumptionCache &AC,
+                                           TargetLibraryInfo *TLI,
+                                           DominatorTree *DT,
+                                           LoopVectorizationLegality *LVL) = 0;
   virtual PredicationStyle emitGetActiveLaneMask() = 0;
   virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                        IntrinsicInst &II) = 0;
@@ -1935,8 +1938,8 @@ public:
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   const LoopAccessInfo *LAI) override {
-    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+                                   LoopVectorizationLegality *LVL) override {
+    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
   }
   PredicationStyle emitGetActiveLaneMask() override {
     return Impl.emitGetActiveLaneMask();
@@ -163,7 +163,7 @@ public:
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   const LoopAccessInfo *LAI) const {
+                                   LoopVectorizationLegality *LVL) const {
     return false;
   }

@@ -603,8 +603,8 @@ public:
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   const LoopAccessInfo *LAI) {
-    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+                                   LoopVectorizationLegality *LVL) {
+    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
   }

   PredicationStyle emitGetActiveLaneMask() {
@@ -294,8 +294,8 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
 bool TargetTransformInfo::preferPredicateOverEpilogue(
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
     TargetLibraryInfo *TLI, DominatorTree *DT,
-    const LoopAccessInfo *LAI) const {
-  return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+    LoopVectorizationLegality *LVL) const {
+  return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
 }

 PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const {
@@ -22,6 +22,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -37,6 +38,74 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
 static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                             cl::init(10), cl::Hidden);

+class TailFoldingKind {
+private:
+  uint8_t Bits = 0; // Currently defaults to disabled.
+
+public:
+  enum TailFoldingOpts {
+    TFDisabled = 0x0,
+    TFReductions = 0x01,
+    TFRecurrences = 0x02,
+    TFSimple = 0x80,
+    TFAll = TFReductions | TFRecurrences | TFSimple
+  };
+
+  void operator=(const std::string &Val) {
+    if (Val.empty())
+      return;
+    SmallVector<StringRef, 6> TailFoldTypes;
+    StringRef(Val).split(TailFoldTypes, '+', -1, false);
+    for (auto TailFoldType : TailFoldTypes) {
+      if (TailFoldType == "disabled")
+        Bits = 0;
+      else if (TailFoldType == "all")
+        Bits = TFAll;
+      else if (TailFoldType == "default")
+        Bits = 0; // Currently defaults to never tail-folding.
+      else if (TailFoldType == "simple")
+        add(TFSimple);
+      else if (TailFoldType == "reductions")
+        add(TFReductions);
+      else if (TailFoldType == "recurrences")
+        add(TFRecurrences);
+      else if (TailFoldType == "noreductions")
+        remove(TFReductions);
+      else if (TailFoldType == "norecurrences")
+        remove(TFRecurrences);
+      else {
+        errs()
+            << "invalid argument " << TailFoldType.str()
+            << " to -sve-tail-folding=; each element must be one of: disabled, "
+               "all, default, simple, reductions, noreductions, recurrences, "
+               "norecurrences\n";
+      }
+    }
+  }
+
+  operator uint8_t() const { return Bits; }
+
+  void add(uint8_t Flag) { Bits |= Flag; }
+  void remove(uint8_t Flag) { Bits &= ~Flag; }
+};
+
+TailFoldingKind TailFoldingKindLoc;
+
+cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
+    "sve-tail-folding",
+    cl::desc(
+        "Control the use of vectorisation using tail-folding for SVE:"
+        "\ndisabled    No loop types will vectorize using tail-folding"
+        "\ndefault     Uses the default tail-folding settings for the target "
+        "CPU"
+        "\nall         All legal loop types will vectorize using tail-folding"
+        "\nsimple      Use tail-folding for simple loops (not reductions or "
+        "recurrences)"
+        "\nreductions  Use tail-folding for loops containing reductions"
+        "\nrecurrences Use tail-folding for loops containing first order "
+        "recurrences"),
+    cl::location(TailFoldingKindLoc));
+
 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                          const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -2955,3 +3024,20 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

   return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
 }
+
+bool AArch64TTIImpl::preferPredicateOverEpilogue(
+    Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
+  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+    return false;
+
+  TailFoldingKind Required; // Defaults to 0.
+  if (LVL->getReductionVars().size())
+    Required.add(TailFoldingKind::TFReductions);
+  if (LVL->getFirstOrderRecurrences().size())
+    Required.add(TailFoldingKind::TFRecurrences);
+  if (!Required)
+    Required.add(TailFoldingKind::TFSimple);
+
+  return (TailFoldingKindLoc & Required) == Required;
+}
@@ -340,6 +340,11 @@ public:
     return PredicationStyle::None;
   }

+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT,
+                                   LoopVectorizationLegality *LVL);
+
   bool supportsScalableVectors() const { return ST->hasSVE(); }

   bool enableScalableVectorization() const { return ST->hasSVE(); }
@@ -20,8 +20,8 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
@@ -33,6 +33,7 @@
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -2197,12 +2198,9 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
   return true;
 }

-bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
-                                             ScalarEvolution &SE,
-                                             AssumptionCache &AC,
-                                             TargetLibraryInfo *TLI,
-                                             DominatorTree *DT,
-                                             const LoopAccessInfo *LAI) {
+bool ARMTTIImpl::preferPredicateOverEpilogue(
+    Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
   if (!EnableTailPredication) {
     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
     return false;
@@ -2244,7 +2242,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
     return false;
   }

-  return canTailPredicateLoop(L, LI, SE, DL, LAI);
+  return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
 }

 PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
@@ -288,12 +288,10 @@ public:
                             AssumptionCache &AC,
                             TargetLibraryInfo *LibInfo,
                             HardwareLoopInfo &HWLoopInfo);
-  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
-                                   ScalarEvolution &SE,
-                                   AssumptionCache &AC,
-                                   TargetLibraryInfo *TLI,
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   const LoopAccessInfo *LAI);
+                                   LoopVectorizationLegality *LVL);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE);
@@ -9745,8 +9745,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
   };

   // 4) if the TTI hook indicates this is profitable, request predication.
-  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
-                                       LVL.getLAI()))
+  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
     return CM_ScalarEpilogueNotNeededUsePredicate;

   return CM_ScalarEpilogueAllowed;
@@ -0,0 +1,222 @@
; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
; RUN: opt < %s -loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
; RUN: opt < %s -loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED

target triple = "aarch64-unknown-linux-gnu"

define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
; CHECK-NOTF-LABEL: @simple_memset(
; CHECK-NOTF: vector.ph:
; CHECK-NOTF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
; CHECK-NOTF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NOTF: vector.body:
; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NOTF: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*

; CHECK-TF-NORED-LABEL: @simple_memset(
; CHECK-TF-NORED: vector.ph:
; CHECK-TF-NORED: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
; CHECK-TF-NORED: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-TF-NORED: vector.body:
; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]

; CHECK-TF-NOREC-LABEL: @simple_memset(
; CHECK-TF-NOREC: vector.ph:
; CHECK-TF-NOREC: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
; CHECK-TF-NOREC: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-TF-NOREC: vector.body:
; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NOREC: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]

; CHECK-TF-LABEL: @simple_memset(
; CHECK-TF: vector.ph:
; CHECK-TF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
; CHECK-TF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-TF: vector.body:
; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]

; CHECK-TF-ONLYRED-LABEL: @simple_memset(
; CHECK-TF-ONLYRED: vector.ph:
; CHECK-TF-ONLYRED: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
; CHECK-TF-ONLYRED: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-TF-ONLYRED: vector.body:
; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*

entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep = getelementptr i32, i32* %ptr, i64 %index
  store i32 %val, i32* %gep
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}

define float @fadd_red_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-NOTF-LABEL: @fadd_red_fast
; CHECK-NOTF: vector.body:
; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 4 x float>
; CHECK-NOTF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-NOTF: middle.block:
; CHECK-NOTF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])

; CHECK-TF-NORED-LABEL: @fadd_red_fast
; CHECK-TF-NORED: vector.body:
; CHECK-TF-NORED-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-NORED: %[[LOAD:.*]] = load <vscale x 4 x float>
; CHECK-TF-NORED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF-NORED: middle.block:
; CHECK-TF-NORED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])

; CHECK-TF-NOREC-LABEL: @fadd_red_fast
; CHECK-TF-NOREC: vector.body:
; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NOREC: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
; CHECK-TF-NOREC: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
; CHECK-TF-NOREC: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF-NOREC: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
; CHECK-TF-NOREC: middle.block:
; CHECK-TF-NOREC-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])

; CHECK-TF-LABEL: @fadd_red_fast
; CHECK-TF: vector.body:
; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
; CHECK-TF: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
; CHECK-TF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
; CHECK-TF: middle.block:
; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])

; CHECK-TF-ONLYRED-LABEL: @fadd_red_fast
; CHECK-TF-ONLYRED: vector.body:
; CHECK-TF-ONLYRED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-ONLYRED: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
; CHECK-TF-ONLYRED: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
; CHECK-TF-ONLYRED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
; CHECK-TF-ONLYRED: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
; CHECK-TF-ONLYRED: middle.block:
; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
  %0 = load float, float* %arrayidx, align 4
  %add = fadd fast float %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %add
}

define void @add_recur(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
; CHECK-NOTF-LABEL: @add_recur
; CHECK-NOTF: entry:
; CHECK-NOTF: %[[PRE:.*]] = load i32, i32* %src, align 4
; CHECK-NOTF: vector.ph:
; CHECK-NOTF: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
; CHECK-NOTF: vector.body:
; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NOTF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-NOTF: %[[LOAD]] = load <vscale x 4 x i32>
; CHECK-NOTF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-NOTF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-NOTF: store <vscale x 4 x i32> %[[ADD]]

; CHECK-TF-NORED-LABEL: @add_recur
; CHECK-TF-NORED: entry:
; CHECK-TF-NORED: %[[PRE:.*]] = load i32, i32* %src, align 4
; CHECK-TF-NORED: vector.ph:
; CHECK-TF-NORED: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
; CHECK-TF-NORED: vector.body:
; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NORED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-NORED: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>{{.*}} %[[ACTIVE_LANE_MASK]]
; CHECK-TF-NORED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-NORED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])

; CHECK-TF-NOREC-LABEL: @add_recur
; CHECK-TF-NOREC: entry:
; CHECK-TF-NOREC: %[[PRE:.*]] = load i32, i32* %src, align 4
; CHECK-TF-NOREC: vector.ph:
; CHECK-TF-NOREC: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
; CHECK-TF-NOREC: vector.body:
; CHECK-TF-NOREC-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-NOREC: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-NOREC: %[[LOAD]] = load <vscale x 4 x i32>
; CHECK-TF-NOREC: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-NOREC: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NOREC: store <vscale x 4 x i32> %[[ADD]]

; CHECK-TF-LABEL: @add_recur
; CHECK-TF: entry:
; CHECK-TF: %[[PRE:.*]] = load i32, i32* %src, align 4
; CHECK-TF: vector.ph:
; CHECK-TF: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
; CHECK-TF: vector.body:
; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>{{.*}} %[[ACTIVE_LANE_MASK]]
; CHECK-TF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])

; CHECK-TF-ONLYRED-LABEL: @add_recur
; CHECK-TF-ONLYRED: entry:
; CHECK-TF-ONLYRED: %[[PRE:.*]] = load i32, i32* %src, align 4
; CHECK-TF-ONLYRED: vector.ph:
; CHECK-TF-ONLYRED: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
; CHECK-TF-ONLYRED: vector.body:
; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-ONLYRED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-ONLYRED: %[[LOAD]] = load <vscale x 4 x i32>
; CHECK-TF-ONLYRED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-ONLYRED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[ADD]]

entry:
  %.pre = load i32, i32* %src, align 4
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %0 = phi i32 [ %1, %for.body ], [ %.pre, %entry ]
  %i.010 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %add = add nuw nsw i64 %i.010, 1
  %arrayidx1 = getelementptr inbounds i32, i32* %src, i64 %add
  %1 = load i32, i32* %arrayidx1, align 4
  %add2 = add nsw i32 %1, %0
  %arrayidx3 = getelementptr inbounds i32, i32* %dst, i64 %i.010
  store i32 %add2, i32* %arrayidx3, align 4
  %exitcond.not = icmp eq i64 %add, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body
  ret void
}

attributes #0 = { "target-features"="+sve" }

!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.vectorize.width", i32 4}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = !{!"llvm.loop.interleave.count", i32 1}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}