forked from OSchip/llvm-project
[SLP] Improve cost of the gather nodes.
There is no need to count the final shuffle cost for constants: gathering constants is just a constant vector plus extra inserts, if required. Differential Revision: https://reviews.llvm.org/D113770
This commit is contained in:
parent
2f43a656f3
commit
900cc1a226
|
@ -1607,10 +1607,12 @@ private:
|
|||
Value *vectorizeTree(ArrayRef<Value *> VL);
|
||||
|
||||
/// \returns the scalarization cost for this type. Scalarization in this
|
||||
/// context means the creation of vectors from a group of scalars.
|
||||
InstructionCost
|
||||
getGatherCost(FixedVectorType *Ty,
|
||||
const DenseSet<unsigned> &ShuffledIndices) const;
|
||||
/// context means the creation of vectors from a group of scalars. If \p
|
||||
/// NeedToShuffle is true, need to add a cost of reshuffling some of the
|
||||
/// vector elements.
|
||||
InstructionCost getGatherCost(FixedVectorType *Ty,
|
||||
const DenseSet<unsigned> &ShuffledIndices,
|
||||
bool NeedToShuffle) const;
|
||||
|
||||
/// Checks if the gathered \p VL can be represented as shuffle(s) of previous
|
||||
/// tree entries.
|
||||
|
@ -5580,7 +5582,8 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
|
|||
|
||||
InstructionCost
|
||||
BoUpSLP::getGatherCost(FixedVectorType *Ty,
|
||||
const DenseSet<unsigned> &ShuffledIndices) const {
|
||||
const DenseSet<unsigned> &ShuffledIndices,
|
||||
bool NeedToShuffle) const {
|
||||
unsigned NumElts = Ty->getNumElements();
|
||||
APInt DemandedElts = APInt::getZero(NumElts);
|
||||
for (unsigned I = 0; I < NumElts; ++I)
|
||||
|
@ -5589,7 +5592,7 @@ BoUpSLP::getGatherCost(FixedVectorType *Ty,
|
|||
InstructionCost Cost =
|
||||
TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
|
||||
/*Extract*/ false);
|
||||
if (!ShuffledIndices.empty())
|
||||
if (NeedToShuffle)
|
||||
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
|
||||
return Cost;
|
||||
}
|
||||
|
@ -5600,6 +5603,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
|
|||
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
|
||||
ScalarTy = SI->getValueOperand()->getType();
|
||||
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
|
||||
bool DuplicateNonConst = false;
|
||||
// Find the cost of inserting/extracting values from the vector.
|
||||
// Check if the same elements are inserted several times and count them as
|
||||
// shuffle candidates.
|
||||
|
@ -5608,12 +5612,17 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
|
|||
// Iterate in reverse order to consider insert elements with the high cost.
|
||||
for (unsigned I = VL.size(); I > 0; --I) {
|
||||
unsigned Idx = I - 1;
|
||||
if (isConstant(VL[Idx]))
|
||||
continue;
|
||||
if (!UniqueElements.insert(VL[Idx]).second)
|
||||
// No need to shuffle duplicates for constants.
|
||||
if (isConstant(VL[Idx])) {
|
||||
ShuffledElements.insert(Idx);
|
||||
continue;
|
||||
}
|
||||
if (!UniqueElements.insert(VL[Idx]).second) {
|
||||
DuplicateNonConst = true;
|
||||
ShuffledElements.insert(Idx);
|
||||
}
|
||||
}
|
||||
return getGatherCost(VecTy, ShuffledElements);
|
||||
return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
|
||||
}
|
||||
|
||||
// Perform operand reordering on the instructions in VL and return the reordered
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: opt -slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: cat %t | FileCheck -check-prefix=YAML %s
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 | FileCheck %s --check-prefix=CHECK
|
||||
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 | FileCheck %s --check-prefix=CHECK
|
||||
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefix=FORCE_REDUCTION
|
||||
|
||||
define void @Test(i32) {
|
||||
|
|
|
@ -343,22 +343,19 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
|
|||
; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
|
||||
; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
|
||||
; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], <i32 42, i32 42, i32 42, i32 42>
|
||||
; CHECK-NEXT: [[D0:%.*]] = icmp slt i32 [[X0]], [[Y0]]
|
||||
; CHECK-NEXT: [[D1:%.*]] = icmp slt i32 [[X1]], [[Y1]]
|
||||
; CHECK-NEXT: [[D2:%.*]] = icmp slt i32 [[X2]], [[Y2]]
|
||||
; CHECK-NEXT: [[D3:%.*]] = icmp slt i32 [[X3]], [[Y3]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
|
||||
; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false
|
||||
; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
|
||||
; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
|
||||
; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
|
||||
; CHECK-NEXT: ret i1 [[S7]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3
|
||||
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[Y0]], i32 4
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[Y3]], i32 7
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], [[TMP8]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = freeze <8 x i1> [[TMP9]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP10]])
|
||||
; CHECK-NEXT: ret i1 [[TMP11]]
|
||||
;
|
||||
%x0 = extractelement <8 x i32> %x, i32 0
|
||||
%x1 = extractelement <8 x i32> %x, i32 1
|
||||
|
|
Loading…
Reference in New Issue