[TTI] Remove IsPairwiseForm from getArithmeticReductionCost

This patch removes the IsPairwiseForm flag from the Reduction Cost TTI
hooks, along with some accompanying code for pattern matching reductions
from trees starting at extract elements. IsPairWise is now assumed to be
false, which was the predominant way that the value was used from both
the Loop and SLP vectorizers. Since the adjustments such as D93860, the
SLP vectorizer has not relied upon this distinction between paiwise and
non-pairwise reductions.

This also removes some code that was detecting reductions trees starting
from extract elements inside the costmodel. This case was
double-counting costs though, adding the individual costs on the
individual instruction _and_ the total cost of the reduction. Removing
it changes the costs in llvm/test/Analysis/CostModel/X86/reduction.ll to
not double count. The cost of reduction intrinsics is still tested
through the various tests in
llvm/test/Analysis/CostModel/X86/reduce-xyz.ll.

Differential Revision: https://reviews.llvm.org/D105484
This commit is contained in:
David Green 2021-07-09 11:51:16 +01:00
parent aa9f58cc2c
commit 38c9a4068d
15 changed files with 161 additions and 603 deletions

View File

@ -865,40 +865,6 @@ public:
///< a vector of the same type as the input vectors. ///< a vector of the same type as the input vectors.
}; };
/// Kind of the reduction data.
enum ReductionKind {
RK_None, /// Not a reduction.
RK_Arithmetic, /// Binary reduction data.
RK_MinMax, /// Min/max reduction data.
RK_UnsignedMinMax, /// Unsigned min/max reduction data.
};
/// Contains opcode + LHS/RHS parts of the reduction operations.
struct ReductionData {
ReductionData() = delete;
ReductionData(ReductionKind Kind, unsigned Opcode, Value *LHS, Value *RHS)
: Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) {
assert(Kind != RK_None && "expected binary or min/max reduction only.");
}
unsigned Opcode = 0;
Value *LHS = nullptr;
Value *RHS = nullptr;
ReductionKind Kind = RK_None;
bool hasSameData(ReductionData &RD) const {
return Kind == RD.Kind && Opcode == RD.Opcode;
}
};
static ReductionKind matchPairwiseReduction(
const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty);
static ReductionKind matchVectorSplittingReduction(
const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty);
static ReductionKind matchVectorReduction(const ExtractElementInst *ReduxRoot,
unsigned &Opcode, VectorType *&Ty,
bool &IsPairwise);
/// Additional information about an operand's possible values. /// Additional information about an operand's possible values.
enum OperandValueKind { enum OperandValueKind {
OK_AnyValue, // Operand can have any value. OK_AnyValue, // Operand can have any value.
@ -1180,25 +1146,16 @@ public:
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
bool UseMaskForCond = false, bool UseMaskForGaps = false) const; bool UseMaskForCond = false, bool UseMaskForGaps = false) const;
/// Calculate the cost of performing a vector reduction. /// Calculate the cost of vector reduction intrinsics.
/// ///
/// This is the cost of reducing the vector value of type \p Ty to a scalar /// This is the cost of reducing the vector value of type \p Ty to a scalar
/// value using the operation denoted by \p Opcode. The form of the reduction /// value using the operation denoted by \p Opcode.
/// can either be a pairwise reduction or a reduction that splits the vector
/// at every reduction level.
///
/// Pairwise:
/// (v0, v1, v2, v3)
/// ((v0+v1), (v2+v3), undef, undef)
/// Split:
/// (v0, v1, v2, v3)
/// ((v0+v2), (v1+v3), undef, undef)
InstructionCost getArithmeticReductionCost( InstructionCost getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, unsigned Opcode, VectorType *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
InstructionCost getMinMaxReductionCost( InstructionCost getMinMaxReductionCost(
VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
/// Calculate the cost of an extended reduction pattern, similar to /// Calculate the cost of an extended reduction pattern, similar to
@ -1661,11 +1618,9 @@ public:
bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0; bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0;
virtual InstructionCost virtual InstructionCost
getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind) = 0; TTI::TargetCostKind CostKind) = 0;
virtual InstructionCost virtual InstructionCost
getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
bool IsPairwiseForm, bool IsUnsigned,
TTI::TargetCostKind CostKind) = 0; TTI::TargetCostKind CostKind) = 0;
virtual InstructionCost getExtendedAddReductionCost( virtual InstructionCost getExtendedAddReductionCost(
bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
@ -2164,17 +2119,13 @@ public:
} }
InstructionCost InstructionCost
getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind) override { TTI::TargetCostKind CostKind) override {
return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm, return Impl.getArithmeticReductionCost(Opcode, Ty, CostKind);
CostKind);
} }
InstructionCost InstructionCost
getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
bool IsPairwiseForm, bool IsUnsigned,
TTI::TargetCostKind CostKind) override { TTI::TargetCostKind CostKind) override {
return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned, return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
CostKind);
} }
InstructionCost getExtendedAddReductionCost( InstructionCost getExtendedAddReductionCost(
bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,

View File

@ -621,12 +621,12 @@ public:
return 0; return 0;
} }
InstructionCost getArithmeticReductionCost(unsigned, VectorType *, bool, InstructionCost getArithmeticReductionCost(unsigned, VectorType *,
TTI::TargetCostKind) const { TTI::TargetCostKind) const {
return 1; return 1;
} }
InstructionCost getMinMaxReductionCost(VectorType *, VectorType *, bool, bool, InstructionCost getMinMaxReductionCost(VectorType *, VectorType *, bool,
TTI::TargetCostKind) const { TTI::TargetCostKind) const {
return 1; return 1;
} }
@ -1112,26 +1112,6 @@ public:
if (CI) if (CI)
Idx = CI->getZExtValue(); Idx = CI->getZExtValue();
// Try to match a reduction (a series of shufflevector and vector ops
// followed by an extractelement).
unsigned RdxOpcode;
VectorType *RdxType;
bool IsPairwise;
switch (TTI::matchVectorReduction(EEI, RdxOpcode, RdxType, IsPairwise)) {
case TTI::RK_Arithmetic:
return TargetTTI->getArithmeticReductionCost(RdxOpcode, RdxType,
IsPairwise, CostKind);
case TTI::RK_MinMax:
return TargetTTI->getMinMaxReductionCost(
RdxType, cast<VectorType>(CmpInst::makeCmpResultType(RdxType)),
IsPairwise, /*IsUnsigned=*/false, CostKind);
case TTI::RK_UnsignedMinMax:
return TargetTTI->getMinMaxReductionCost(
RdxType, cast<VectorType>(CmpInst::makeCmpResultType(RdxType)),
IsPairwise, /*IsUnsigned=*/true, CostKind);
case TTI::RK_None:
break;
}
return TargetTTI->getVectorInstrCost(Opcode, U->getOperand(0)->getType(), return TargetTTI->getVectorInstrCost(Opcode, U->getOperand(0)->getType(),
Idx); Idx);
} }

View File

@ -1654,33 +1654,26 @@ public:
} }
case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_add:
return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
/*IsPairwiseForm=*/false,
CostKind); CostKind);
case Intrinsic::vector_reduce_mul: case Intrinsic::vector_reduce_mul:
return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
/*IsPairwiseForm=*/false,
CostKind); CostKind);
case Intrinsic::vector_reduce_and: case Intrinsic::vector_reduce_and:
return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
/*IsPairwiseForm=*/false,
CostKind); CostKind);
case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_or:
return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
/*IsPairwiseForm=*/false,
CostKind); CostKind);
case Intrinsic::vector_reduce_xor: case Intrinsic::vector_reduce_xor:
return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
/*IsPairwiseForm=*/false,
CostKind); CostKind);
case Intrinsic::vector_reduce_fadd: case Intrinsic::vector_reduce_fadd:
// FIXME: Add new flag for cost of strict reductions. // FIXME: Add new flag for cost of strict reductions.
return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
/*IsPairwiseForm=*/false,
CostKind); CostKind);
case Intrinsic::vector_reduce_fmul: case Intrinsic::vector_reduce_fmul:
// FIXME: Add new flag for cost of strict reductions. // FIXME: Add new flag for cost of strict reductions.
return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
/*IsPairwiseForm=*/false,
CostKind); CostKind);
case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smax:
case Intrinsic::vector_reduce_smin: case Intrinsic::vector_reduce_smin:
@ -1688,13 +1681,11 @@ public:
case Intrinsic::vector_reduce_fmin: case Intrinsic::vector_reduce_fmin:
return thisT()->getMinMaxReductionCost( return thisT()->getMinMaxReductionCost(
VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
/*IsPairwiseForm=*/false,
/*IsUnsigned=*/false, CostKind); /*IsUnsigned=*/false, CostKind);
case Intrinsic::vector_reduce_umax: case Intrinsic::vector_reduce_umax:
case Intrinsic::vector_reduce_umin: case Intrinsic::vector_reduce_umin:
return thisT()->getMinMaxReductionCost( return thisT()->getMinMaxReductionCost(
VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
/*IsPairwiseForm=*/false,
/*IsUnsigned=*/true, CostKind); /*IsUnsigned=*/true, CostKind);
case Intrinsic::abs: case Intrinsic::abs:
case Intrinsic::smax: case Intrinsic::smax:
@ -1998,9 +1989,9 @@ public:
return 0; return 0;
} }
/// Try to calculate arithmetic and shuffle op costs for reduction operations. /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
/// We're assuming that reduction operation are performing the following way: /// We're assuming that reduction operation are performing the following way:
/// 1. Non-pairwise reduction ///
/// %val1 = shufflevector<n x t> %val, <n x t> %undef, /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
/// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
/// \----------------v-------------/ \----------v------------/ /// \----------------v-------------/ \----------v------------/
@ -2016,25 +2007,10 @@ public:
/// n/4 elements 3*n/4 elements /// n/4 elements 3*n/4 elements
/// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
/// length n/2, the resulting vector has length n/4 etc. /// length n/2, the resulting vector has length n/4 etc.
/// 2. Pairwise reduction:
/// Everything is the same except for an additional shuffle operation which
/// is used to produce operands for pairwise kind of reductions.
/// %val1 = shufflevector<n x t> %val, <n x t> %undef,
/// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
/// \-------------v----------/ \----------v------------/
/// n/2 elements n/2 elements
/// %val2 = shufflevector<n x t> %val, <n x t> %undef,
/// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
/// \-------------v----------/ \----------v------------/
/// n/2 elements n/2 elements
/// %red1 = op <n x t> %val1, <n x t> val2
/// Again, the operation is performed on <n x t> vector, but the resulting
/// vector %red1 is <n/2 x t> vector.
/// ///
/// The cost model should take into account that the actual length of the /// The cost model should take into account that the actual length of the
/// vector is reduced on each iteration. /// vector is reduced on each iteration.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
bool IsPairwise,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
Type *ScalarTy = Ty->getElementType(); Type *ScalarTy = Ty->getElementType();
unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
@ -2065,9 +2041,7 @@ public:
while (NumVecElts > MVTLen) { while (NumVecElts > MVTLen) {
NumVecElts /= 2; NumVecElts /= 2;
VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
// Assume the pairwise shuffles add a cost. ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
ShuffleCost += (IsPairwise + 1) *
thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
NumVecElts, SubTy); NumVecElts, SubTy);
ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
Ty = SubTy; Ty = SubTy;
@ -2081,13 +2055,8 @@ public:
// reduction operations are performed on the vectors with the same // reduction operations are performed on the vectors with the same
// architecture-dependent length. // architecture-dependent length.
// Non pairwise reductions need one shuffle per reduction level. Pairwise // By default reductions need one shuffle per reduction level.
// reductions need two shuffles on every level, but the last one. On that ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
// level one of the shuffles is <0, u, u, ...> which is identity.
unsigned NumShuffles = NumReduxLevels;
if (IsPairwise && NumReduxLevels >= 1)
NumShuffles += NumReduxLevels - 1;
ShuffleCost += NumShuffles * thisT()->getShuffleCost(
TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty); ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
return ShuffleCost + ArithCost + return ShuffleCost + ArithCost +
@ -2097,7 +2066,7 @@ public:
/// Try to calculate op costs for min/max reduction operations. /// Try to calculate op costs for min/max reduction operations.
/// \param CondTy Conditional type for the Select instruction. /// \param CondTy Conditional type for the Select instruction.
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsPairwise, bool IsUnsigned, bool IsUnsigned,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
Type *ScalarTy = Ty->getElementType(); Type *ScalarTy = Ty->getElementType();
Type *ScalarCondTy = CondTy->getElementType(); Type *ScalarCondTy = CondTy->getElementType();
@ -2123,9 +2092,7 @@ public:
auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
// Assume the pairwise shuffles add a cost. ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
ShuffleCost += (IsPairwise + 1) *
thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
NumVecElts, SubTy); NumVecElts, SubTy);
MinMaxCost += MinMaxCost +=
thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
@ -2142,14 +2109,7 @@ public:
// operations performed on the current platform. That's why several final // operations performed on the current platform. That's why several final
// reduction opertions are perfomed on the vectors with the same // reduction opertions are perfomed on the vectors with the same
// architecture-dependent length. // architecture-dependent length.
ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
// Non pairwise reductions need one shuffle per reduction level. Pairwise
// reductions need two shuffles on every level, but the last one. On that
// level one of the shuffles is <0, u, u, ...> which is identity.
unsigned NumShuffles = NumReduxLevels;
if (IsPairwise && NumReduxLevels >= 1)
NumShuffles += NumReduxLevels - 1;
ShuffleCost += NumShuffles * thisT()->getShuffleCost(
TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
MinMaxCost += MinMaxCost +=
NumReduxLevels * NumReduxLevels *
@ -2169,8 +2129,8 @@ public:
// Without any native support, this is equivalent to the cost of // Without any native support, this is equivalent to the cost of
// vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
VectorType *ExtTy = VectorType::get(ResTy, Ty); VectorType *ExtTy = VectorType::get(ResTy, Ty);
InstructionCost RedCost = thisT()->getArithmeticReductionCost( InstructionCost RedCost =
Instruction::Add, ExtTy, false, CostKind); thisT()->getArithmeticReductionCost(Instruction::Add, ExtTy, CostKind);
InstructionCost MulCost = 0; InstructionCost MulCost = 0;
InstructionCost ExtCost = thisT()->getCastInstrCost( InstructionCost ExtCost = thisT()->getCastInstrCost(
IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,

View File

@ -894,19 +894,18 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
} }
InstructionCost TargetTransformInfo::getArithmeticReductionCost( InstructionCost TargetTransformInfo::getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const {
TTI::TargetCostKind CostKind) const {
InstructionCost Cost = InstructionCost Cost =
TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm, CostKind); TTIImpl->getArithmeticReductionCost(Opcode, Ty, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!"); assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost; return Cost;
} }
InstructionCost TargetTransformInfo::getMinMaxReductionCost( InstructionCost TargetTransformInfo::getMinMaxReductionCost(
VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
TTI::TargetCostKind CostKind) const { TTI::TargetCostKind CostKind) const {
InstructionCost Cost = TTIImpl->getMinMaxReductionCost( InstructionCost Cost =
Ty, CondTy, IsPairwiseForm, IsUnsigned, CostKind); TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!"); assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost; return Cost;
} }
@ -1057,291 +1056,6 @@ TargetTransformInfo::getInstructionLatency(const Instruction *I) const {
return TTIImpl->getInstructionLatency(I); return TTIImpl->getInstructionLatency(I);
} }
static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
unsigned Level) {
// We don't need a shuffle if we just want to have element 0 in position 0 of
// the vector.
if (!SI && Level == 0 && IsLeft)
return true;
else if (!SI)
return false;
SmallVector<int, 32> Mask(
cast<FixedVectorType>(SI->getType())->getNumElements(), -1);
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether
// we look at the left or right side.
for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
Mask[i] = val;
ArrayRef<int> ActualMask = SI->getShuffleMask();
return Mask == ActualMask;
}
static Optional<TTI::ReductionData> getReductionData(Instruction *I) {
Value *L, *R;
if (m_BinOp(m_Value(L), m_Value(R)).match(I))
return TTI::ReductionData(TTI::RK_Arithmetic, I->getOpcode(), L, R);
if (auto *SI = dyn_cast<SelectInst>(I)) {
if (m_SMin(m_Value(L), m_Value(R)).match(SI) ||
m_SMax(m_Value(L), m_Value(R)).match(SI) ||
m_OrdFMin(m_Value(L), m_Value(R)).match(SI) ||
m_OrdFMax(m_Value(L), m_Value(R)).match(SI) ||
m_UnordFMin(m_Value(L), m_Value(R)).match(SI) ||
m_UnordFMax(m_Value(L), m_Value(R)).match(SI)) {
auto *CI = cast<CmpInst>(SI->getCondition());
return TTI::ReductionData(TTI::RK_MinMax, CI->getOpcode(), L, R);
}
if (m_UMin(m_Value(L), m_Value(R)).match(SI) ||
m_UMax(m_Value(L), m_Value(R)).match(SI)) {
auto *CI = cast<CmpInst>(SI->getCondition());
return TTI::ReductionData(TTI::RK_UnsignedMinMax, CI->getOpcode(), L, R);
}
}
return llvm::None;
}
static TTI::ReductionKind matchPairwiseReductionAtLevel(Instruction *I,
unsigned Level,
unsigned NumLevels) {
// Match one level of pairwise operations.
// %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
// %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
// %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
if (!I)
return TTI::RK_None;
assert(I->getType()->isVectorTy() && "Expecting a vector type");
Optional<TTI::ReductionData> RD = getReductionData(I);
if (!RD)
return TTI::RK_None;
ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(RD->LHS);
if (!LS && Level)
return TTI::RK_None;
ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(RD->RHS);
if (!RS && Level)
return TTI::RK_None;
// On level 0 we can omit one shufflevector instruction.
if (!Level && !RS && !LS)
return TTI::RK_None;
// Shuffle inputs must match.
Value *NextLevelOpL = LS ? LS->getOperand(0) : nullptr;
Value *NextLevelOpR = RS ? RS->getOperand(0) : nullptr;
Value *NextLevelOp = nullptr;
if (NextLevelOpR && NextLevelOpL) {
// If we have two shuffles their operands must match.
if (NextLevelOpL != NextLevelOpR)
return TTI::RK_None;
NextLevelOp = NextLevelOpL;
} else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) {
// On the first level we can omit the shufflevector <0, undef,...>. So the
// input to the other shufflevector <1, undef> must match with one of the
// inputs to the current binary operation.
// Example:
// %NextLevelOpL = shufflevector %R, <1, undef ...>
// %BinOp = fadd %NextLevelOpL, %R
if (NextLevelOpL && NextLevelOpL != RD->RHS)
return TTI::RK_None;
else if (NextLevelOpR && NextLevelOpR != RD->LHS)
return TTI::RK_None;
NextLevelOp = NextLevelOpL ? RD->RHS : RD->LHS;
} else
return TTI::RK_None;
// Check that the next levels binary operation exists and matches with the
// current one.
if (Level + 1 != NumLevels) {
if (!isa<Instruction>(NextLevelOp))
return TTI::RK_None;
Optional<TTI::ReductionData> NextLevelRD =
getReductionData(cast<Instruction>(NextLevelOp));
if (!NextLevelRD || !RD->hasSameData(*NextLevelRD))
return TTI::RK_None;
}
// Shuffle mask for pairwise operation must match.
if (matchPairwiseShuffleMask(LS, /*IsLeft=*/true, Level)) {
if (!matchPairwiseShuffleMask(RS, /*IsLeft=*/false, Level))
return TTI::RK_None;
} else if (matchPairwiseShuffleMask(RS, /*IsLeft=*/true, Level)) {
if (!matchPairwiseShuffleMask(LS, /*IsLeft=*/false, Level))
return TTI::RK_None;
} else {
return TTI::RK_None;
}
if (++Level == NumLevels)
return RD->Kind;
// Match next level.
return matchPairwiseReductionAtLevel(dyn_cast<Instruction>(NextLevelOp), Level,
NumLevels);
}
TTI::ReductionKind TTI::matchPairwiseReduction(
const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty) {
if (!EnableReduxCost)
return TTI::RK_None;
// Need to extract the first element.
ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
unsigned Idx = ~0u;
if (CI)
Idx = CI->getZExtValue();
if (Idx != 0)
return TTI::RK_None;
auto *RdxStart = dyn_cast<Instruction>(ReduxRoot->getOperand(0));
if (!RdxStart)
return TTI::RK_None;
Optional<TTI::ReductionData> RD = getReductionData(RdxStart);
if (!RD)
return TTI::RK_None;
auto *VecTy = cast<FixedVectorType>(RdxStart->getType());
unsigned NumVecElems = VecTy->getNumElements();
if (!isPowerOf2_32(NumVecElems))
return TTI::RK_None;
// We look for a sequence of shuffle,shuffle,add triples like the following
// that builds a pairwise reduction tree.
//
// (X0, X1, X2, X3)
// (X0 + X1, X2 + X3, undef, undef)
// ((X0 + X1) + (X2 + X3), undef, undef, undef)
//
// %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
// %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
// %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
// %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
// <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
// %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
// <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
// %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
// %r = extractelement <4 x float> %bin.rdx8, i32 0
if (matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)) ==
TTI::RK_None)
return TTI::RK_None;
Opcode = RD->Opcode;
Ty = VecTy;
return RD->Kind;
}
static std::pair<Value *, ShuffleVectorInst *>
getShuffleAndOtherOprd(Value *L, Value *R) {
ShuffleVectorInst *S = nullptr;
if ((S = dyn_cast<ShuffleVectorInst>(L)))
return std::make_pair(R, S);
S = dyn_cast<ShuffleVectorInst>(R);
return std::make_pair(L, S);
}
TTI::ReductionKind TTI::matchVectorSplittingReduction(
const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty) {
if (!EnableReduxCost)
return TTI::RK_None;
// Need to extract the first element.
ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
unsigned Idx = ~0u;
if (CI)
Idx = CI->getZExtValue();
if (Idx != 0)
return TTI::RK_None;
auto *RdxStart = dyn_cast<Instruction>(ReduxRoot->getOperand(0));
if (!RdxStart)
return TTI::RK_None;
Optional<TTI::ReductionData> RD = getReductionData(RdxStart);
if (!RD)
return TTI::RK_None;
auto *VecTy = cast<FixedVectorType>(ReduxRoot->getOperand(0)->getType());
unsigned NumVecElems = VecTy->getNumElements();
if (!isPowerOf2_32(NumVecElems))
return TTI::RK_None;
// We look for a sequence of shuffles and adds like the following matching one
// fadd, shuffle vector pair at a time.
//
// %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
// %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
// %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
// <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
// %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
// %r = extractelement <4 x float> %bin.rdx8, i32 0
unsigned MaskStart = 1;
Instruction *RdxOp = RdxStart;
SmallVector<int, 32> ShuffleMask(NumVecElems, 0);
unsigned NumVecElemsRemain = NumVecElems;
while (NumVecElemsRemain - 1) {
// Check for the right reduction operation.
if (!RdxOp)
return TTI::RK_None;
Optional<TTI::ReductionData> RDLevel = getReductionData(RdxOp);
if (!RDLevel || !RDLevel->hasSameData(*RD))
return TTI::RK_None;
Value *NextRdxOp;
ShuffleVectorInst *Shuffle;
std::tie(NextRdxOp, Shuffle) =
getShuffleAndOtherOprd(RDLevel->LHS, RDLevel->RHS);
// Check the current reduction operation and the shuffle use the same value.
if (Shuffle == nullptr)
return TTI::RK_None;
if (Shuffle->getOperand(0) != NextRdxOp)
return TTI::RK_None;
// Check that shuffle masks matches.
for (unsigned j = 0; j != MaskStart; ++j)
ShuffleMask[j] = MaskStart + j;
// Fill the rest of the mask with -1 for undef.
std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1);
ArrayRef<int> Mask = Shuffle->getShuffleMask();
if (ShuffleMask != Mask)
return TTI::RK_None;
RdxOp = dyn_cast<Instruction>(NextRdxOp);
NumVecElemsRemain /= 2;
MaskStart *= 2;
}
Opcode = RD->Opcode;
Ty = VecTy;
return RD->Kind;
}
TTI::ReductionKind
TTI::matchVectorReduction(const ExtractElementInst *Root, unsigned &Opcode,
VectorType *&Ty, bool &IsPairwise) {
TTI::ReductionKind RdxKind = matchVectorSplittingReduction(Root, Opcode, Ty);
if (RdxKind != TTI::ReductionKind::RK_None) {
IsPairwise = false;
return RdxKind;
}
IsPairwise = true;
return matchPairwiseReduction(Root, Opcode, Ty);
}
InstructionCost InstructionCost
TargetTransformInfo::getInstructionThroughput(const Instruction *I) const { TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

View File

@ -1759,11 +1759,10 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
InstructionCost InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsPairwise, bool IsUnsigned, bool IsUnsigned,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
if (!isa<ScalableVectorType>(Ty)) if (!isa<ScalableVectorType>(Ty))
return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
CostKind);
assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) && assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
"Both vector needs to be scalable"); "Both vector needs to be scalable");
@ -1785,10 +1784,7 @@ AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
} }
InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
unsigned Opcode, VectorType *ValTy, bool IsPairwise, unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
TTI::TargetCostKind CostKind) {
assert(!IsPairwise && "Cannot be pair wise to continue");
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
InstructionCost LegalizationCost = 0; InstructionCost LegalizationCost = 0;
if (LT.first > 1) { if (LT.first > 1) {
@ -1814,15 +1810,9 @@ InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
InstructionCost InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
if (isa<ScalableVectorType>(ValTy)) if (isa<ScalableVectorType>(ValTy))
return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm, return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
CostKind);
if (IsPairwiseForm)
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
CostKind);
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second; MVT MTy = LT.second;
@ -1894,8 +1884,7 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
} }
break; break;
} }
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
CostKind);
} }
InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {

View File

@ -158,12 +158,11 @@ public:
unsigned Index); unsigned Index);
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsPairwise, bool IsUnsigned, bool IsUnsigned,
TTI::TargetCostKind CostKind); TTI::TargetCostKind CostKind);
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, InstructionCost getArithmeticReductionCostSVE(unsigned Opcode,
VectorType *ValTy, VectorType *ValTy,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind); TTI::TargetCostKind CostKind);
InstructionCost getSpliceCost(VectorType *Tp, int Index); InstructionCost getSpliceCost(VectorType *Tp, int Index);
@ -306,7 +305,7 @@ public:
ElementCount VF) const; ElementCount VF) const;
InstructionCost getArithmeticReductionCost( InstructionCost getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, unsigned Opcode, VectorType *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,

View File

@ -843,16 +843,13 @@ InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
InstructionCost InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
bool IsPairwise,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty); EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions(which support // Computes cost on targets that have packed math instructions(which support
// 16-bit types only). // 16-bit types only).
if (IsPairwise || if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
!ST->hasVOP3PInsts() || return BaseT::getArithmeticReductionCost(Opcode, Ty, CostKind);
OrigTy.getScalarSizeInBits() != 16)
return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getFullRateInstrCost(); return LT.first * getFullRateInstrCost();
@ -860,17 +857,14 @@ GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
InstructionCost InstructionCost
GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsPairwise, bool IsUnsigned, bool IsUnsigned,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty); EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions(which support // Computes cost on targets that have packed math instructions(which support
// 16-bit types only). // 16-bit types only).
if (IsPairwise || if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
!ST->hasVOP3PInsts() || return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
OrigTy.getScalarSizeInBits() != 16)
return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
CostKind);
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getHalfRateInstrCost(CostKind); return LT.first * getHalfRateInstrCost(CostKind);

View File

@ -212,13 +212,13 @@ public:
int getInlinerVectorBonusPercent() { return 0; } int getInlinerVectorBonusPercent() { return 0; }
InstructionCost getArithmeticReductionCost( InstructionCost getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, bool IsPairwise, unsigned Opcode, VectorType *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind); TTI::TargetCostKind CostKind);
InstructionCost getMinMaxReductionCost( InstructionCost getMinMaxReductionCost(
VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
}; };

View File

@ -1594,13 +1594,11 @@ InstructionCost ARMTTIImpl::getGatherScatterOpCost(
InstructionCost InstructionCost
ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
EVT ValVT = TLI->getValueType(DL, ValTy); EVT ValVT = TLI->getValueType(DL, ValTy);
int ISD = TLI->InstructionOpcodeToISD(Opcode); int ISD = TLI->InstructionOpcodeToISD(Opcode);
if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
CostKind);
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@ -1612,8 +1610,7 @@ ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
CostKind);
} }
InstructionCost InstructionCost

View File

@ -257,7 +257,6 @@ public:
const Instruction *I = nullptr); const Instruction *I = nullptr);
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind); TTI::TargetCostKind CostKind);
InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
Type *ResTy, VectorType *ValTy, Type *ResTy, VectorType *ValTy,

View File

@ -3737,12 +3737,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
InstructionCost InstructionCost
X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
bool IsPairwise,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
// Just use the default implementation for pair reductions.
if (IsPairwise)
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
// We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
// and make it as the cost. // and make it as the cost.
@ -3813,7 +3808,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
TargetTransformInfo::CastContextHint::None, TargetTransformInfo::CastContextHint::None,
CostKind) + CostKind) +
getArithmeticReductionCost(Opcode, WideVecTy, IsPairwise, CostKind); getArithmeticReductionCost(Opcode, WideVecTy, CostKind);
} }
InstructionCost ArithmeticCost = 0; InstructionCost ArithmeticCost = 0;
@ -3909,8 +3904,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
return ArithmeticCost + Entry->Cost; return ArithmeticCost + Entry->Cost;
return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind);
CostKind);
} }
unsigned NumVecElts = ValVTy->getNumElements(); unsigned NumVecElts = ValVTy->getNumElements();
@ -3919,8 +3913,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
// Special case power of 2 reductions where the scalar type isn't changed // Special case power of 2 reductions where the scalar type isn't changed
// by type legalization. // by type legalization.
if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind);
CostKind);
InstructionCost ReductionCost = 0; InstructionCost ReductionCost = 0;
@ -4118,13 +4111,8 @@ InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
InstructionCost InstructionCost
X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
bool IsPairwise, bool IsUnsigned, bool IsUnsigned,
TTI::TargetCostKind CostKind) { TTI::TargetCostKind CostKind) {
// Just use the default implementation for pair reductions.
if (IsPairwise)
return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
CostKind);
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second; MVT MTy = LT.second;
@ -4240,8 +4228,7 @@ X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
// by type legalization. // by type legalization.
if (!isPowerOf2_32(ValVTy->getNumElements()) || if (!isPowerOf2_32(ValVTy->getNumElements()) ||
ScalarSize != MTy.getScalarSizeInBits()) ScalarSize != MTy.getScalarSizeInBits())
return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned, return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);
CostKind);
// Now handle reduction with the legal type, taking into account size changes // Now handle reduction with the legal type, taking into account size changes
// at each level. // at each level.

View File

@ -181,13 +181,13 @@ public:
TTI::TargetCostKind CostKind); TTI::TargetCostKind CostKind);
InstructionCost getArithmeticReductionCost( InstructionCost getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, unsigned Opcode, VectorType *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned); InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsPairwiseForm, bool IsUnsigned, bool IsUnsigned,
TTI::TargetCostKind CostKind); TTI::TargetCostKind CostKind);
InstructionCost getInterleavedMemoryOpCost( InstructionCost getInterleavedMemoryOpCost(

View File

@ -7176,8 +7176,8 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
const RecurrenceDescriptor &RdxDesc = const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
InstructionCost BaseCost = TTI.getArithmeticReductionCost( InstructionCost BaseCost =
RdxDesc.getOpcode(), VectorTy, false, CostKind); TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy, CostKind);
// Get the operand that was not the reduction chain and match it to one of the // Get the operand that was not the reduction chain and match it to one of the
// patterns, returning the better cost if it is found. // patterns, returning the better cost if it is found.

View File

@ -7903,17 +7903,15 @@ private:
case RecurKind::FAdd: case RecurKind::FAdd:
case RecurKind::FMul: { case RecurKind::FMul: {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy);
/*IsPairwiseForm=*/false);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
break; break;
} }
case RecurKind::FMax: case RecurKind::FMax:
case RecurKind::FMin: { case RecurKind::FMin: {
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
VectorCost = VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
TTI->getMinMaxReductionCost(VectorTy, VecCondTy, /*unsigned=*/false);
/*pairwise=*/false, /*unsigned=*/false);
ScalarCost = ScalarCost =
TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
@ -7927,9 +7925,7 @@ private:
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
bool IsUnsigned = bool IsUnsigned =
RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
VectorCost = VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned);
TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
/*IsPairwiseForm=*/false, IsUnsigned);
ScalarCost = ScalarCost =
TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,

View File

@ -7,9 +7,7 @@
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM
; Check that we recognize the tree starting at the extractelement as a ; These are old tests for matching reduction costs from extract elements - something that has now been removed.
; reduction.
; NOTE: We're only really interested in the extractelement cost, which represents the entire reduction.
define fastcc float @reduction_cost_float(<4 x float> %rdx) { define fastcc float @reduction_cost_float(<4 x float> %rdx) {
; SSE2-LABEL: 'reduction_cost_float' ; SSE2-LABEL: 'reduction_cost_float'
@ -17,7 +15,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSSE3-LABEL: 'reduction_cost_float' ; SSSE3-LABEL: 'reduction_cost_float'
@ -25,7 +23,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSE42-LABEL: 'reduction_cost_float' ; SSE42-LABEL: 'reduction_cost_float'
@ -33,7 +31,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; AVX-LABEL: 'reduction_cost_float' ; AVX-LABEL: 'reduction_cost_float'
@ -41,7 +39,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SLM-LABEL: 'reduction_cost_float' ; SLM-LABEL: 'reduction_cost_float'
@ -49,7 +47,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
%rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@ -69,7 +67,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; AVX1-LABEL: 'reduction_cost_int' ; AVX1-LABEL: 'reduction_cost_int'
@ -79,7 +77,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; AVX2-LABEL: 'reduction_cost_int' ; AVX2-LABEL: 'reduction_cost_int'
@ -89,7 +87,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; SLM-LABEL: 'reduction_cost_int' ; SLM-LABEL: 'reduction_cost_int'
@ -99,7 +97,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
%rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
@ -127,7 +125,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -138,7 +136,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -149,7 +147,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -160,7 +158,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -171,7 +169,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -199,7 +197,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -210,7 +208,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -221,7 +219,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -232,7 +230,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -243,7 +241,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -270,7 +268,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -280,7 +278,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -290,7 +288,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -300,7 +298,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -310,7 +308,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
; ;
@ -332,31 +330,31 @@ define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1
; SSE2-LABEL: 'no_pairwise_reduction2double' ; SSE2-LABEL: 'no_pairwise_reduction2double'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SSSE3-LABEL: 'no_pairwise_reduction2double' ; SSSE3-LABEL: 'no_pairwise_reduction2double'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SSE42-LABEL: 'no_pairwise_reduction2double' ; SSE42-LABEL: 'no_pairwise_reduction2double'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; AVX-LABEL: 'no_pairwise_reduction2double' ; AVX-LABEL: 'no_pairwise_reduction2double'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SLM-LABEL: 'no_pairwise_reduction2double' ; SLM-LABEL: 'no_pairwise_reduction2double'
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
%rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@ -372,7 +370,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSSE3-LABEL: 'no_pairwise_reduction4float' ; SSSE3-LABEL: 'no_pairwise_reduction4float'
@ -380,7 +378,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSE42-LABEL: 'no_pairwise_reduction4float' ; SSE42-LABEL: 'no_pairwise_reduction4float'
@ -388,7 +386,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; AVX-LABEL: 'no_pairwise_reduction4float' ; AVX-LABEL: 'no_pairwise_reduction4float'
@ -396,7 +394,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SLM-LABEL: 'no_pairwise_reduction4float' ; SLM-LABEL: 'no_pairwise_reduction4float'
@ -404,7 +402,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
%rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@ -422,7 +420,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SSSE3-LABEL: 'no_pairwise_reduction4double' ; SSSE3-LABEL: 'no_pairwise_reduction4double'
@ -430,7 +428,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SSE42-LABEL: 'no_pairwise_reduction4double' ; SSE42-LABEL: 'no_pairwise_reduction4double'
@ -438,7 +436,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; AVX1-LABEL: 'no_pairwise_reduction4double' ; AVX1-LABEL: 'no_pairwise_reduction4double'
@ -446,7 +444,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; AVX2-LABEL: 'no_pairwise_reduction4double' ; AVX2-LABEL: 'no_pairwise_reduction4double'
@ -454,7 +452,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SLM-LABEL: 'no_pairwise_reduction4double' ; SLM-LABEL: 'no_pairwise_reduction4double'
@ -462,7 +460,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
%rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@ -482,7 +480,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSSE3-LABEL: 'no_pairwise_reduction8float' ; SSSE3-LABEL: 'no_pairwise_reduction8float'
@ -492,7 +490,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSE42-LABEL: 'no_pairwise_reduction8float' ; SSE42-LABEL: 'no_pairwise_reduction8float'
@ -502,7 +500,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; AVX1-LABEL: 'no_pairwise_reduction8float' ; AVX1-LABEL: 'no_pairwise_reduction8float'
@ -512,7 +510,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; AVX2-LABEL: 'no_pairwise_reduction8float' ; AVX2-LABEL: 'no_pairwise_reduction8float'
@ -522,7 +520,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SLM-LABEL: 'no_pairwise_reduction8float' ; SLM-LABEL: 'no_pairwise_reduction8float'
@ -532,7 +530,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
%rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef> %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
@ -547,22 +545,16 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
} }
define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
; SSE-LABEL: 'no_pairwise_reduction2i64' ; CHECK-LABEL: 'no_pairwise_reduction2i64'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
;
; AVX-LABEL: 'no_pairwise_reduction2i64'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
; SLM-LABEL: 'no_pairwise_reduction2i64' ; SLM-LABEL: 'no_pairwise_reduction2i64'
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
%rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
@ -578,7 +570,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; SLM-LABEL: 'no_pairwise_reduction4i32' ; SLM-LABEL: 'no_pairwise_reduction4i32'
@ -586,7 +578,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
%rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@ -604,7 +596,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
; AVX1-LABEL: 'no_pairwise_reduction4i64' ; AVX1-LABEL: 'no_pairwise_reduction4i64'
@ -612,7 +604,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
; AVX2-LABEL: 'no_pairwise_reduction4i64' ; AVX2-LABEL: 'no_pairwise_reduction4i64'
@ -620,7 +612,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
; SLM-LABEL: 'no_pairwise_reduction4i64' ; SLM-LABEL: 'no_pairwise_reduction4i64'
@ -628,7 +620,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
%rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@ -648,7 +640,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
; SSSE3-LABEL: 'no_pairwise_reduction8i16' ; SSSE3-LABEL: 'no_pairwise_reduction8i16'
@ -658,7 +650,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
; SSE42-LABEL: 'no_pairwise_reduction8i16' ; SSE42-LABEL: 'no_pairwise_reduction8i16'
@ -668,7 +660,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
; AVX-LABEL: 'no_pairwise_reduction8i16' ; AVX-LABEL: 'no_pairwise_reduction8i16'
@ -678,7 +670,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
; SLM-LABEL: 'no_pairwise_reduction8i16' ; SLM-LABEL: 'no_pairwise_reduction8i16'
@ -688,7 +680,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
%rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef> %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
@ -710,7 +702,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; AVX1-LABEL: 'no_pairwise_reduction8i32' ; AVX1-LABEL: 'no_pairwise_reduction8i32'
@ -720,7 +712,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; AVX2-LABEL: 'no_pairwise_reduction8i32' ; AVX2-LABEL: 'no_pairwise_reduction8i32'
@ -730,7 +722,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; SLM-LABEL: 'no_pairwise_reduction8i32' ; SLM-LABEL: 'no_pairwise_reduction8i32'
@ -740,7 +732,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
%rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef> %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
@ -759,35 +751,35 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SSSE3-LABEL: 'pairwise_reduction2double' ; SSSE3-LABEL: 'pairwise_reduction2double'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SSE42-LABEL: 'pairwise_reduction2double' ; SSE42-LABEL: 'pairwise_reduction2double'
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; AVX-LABEL: 'pairwise_reduction2double' ; AVX-LABEL: 'pairwise_reduction2double'
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SLM-LABEL: 'pairwise_reduction2double' ; SLM-LABEL: 'pairwise_reduction2double'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
%rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef> %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
@ -806,7 +798,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSSE3-LABEL: 'pairwise_reduction4float' ; SSSE3-LABEL: 'pairwise_reduction4float'
@ -816,7 +808,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSE42-LABEL: 'pairwise_reduction4float' ; SSE42-LABEL: 'pairwise_reduction4float'
@ -826,7 +818,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; AVX-LABEL: 'pairwise_reduction4float' ; AVX-LABEL: 'pairwise_reduction4float'
@ -836,7 +828,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SLM-LABEL: 'pairwise_reduction4float' ; SLM-LABEL: 'pairwise_reduction4float'
@ -846,7 +838,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
%rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@ -868,7 +860,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SSSE3-LABEL: 'pairwise_reduction4double' ; SSSE3-LABEL: 'pairwise_reduction4double'
@ -878,7 +870,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SSE42-LABEL: 'pairwise_reduction4double' ; SSE42-LABEL: 'pairwise_reduction4double'
@ -888,7 +880,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; AVX1-LABEL: 'pairwise_reduction4double' ; AVX1-LABEL: 'pairwise_reduction4double'
@ -898,7 +890,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; AVX2-LABEL: 'pairwise_reduction4double' ; AVX2-LABEL: 'pairwise_reduction4double'
@ -908,7 +900,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
; SLM-LABEL: 'pairwise_reduction4double' ; SLM-LABEL: 'pairwise_reduction4double'
@ -918,7 +910,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r
; ;
%rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@ -943,7 +935,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSSE3-LABEL: 'pairwise_reduction8float' ; SSSE3-LABEL: 'pairwise_reduction8float'
@ -956,7 +948,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SSE42-LABEL: 'pairwise_reduction8float' ; SSE42-LABEL: 'pairwise_reduction8float'
@ -969,7 +961,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; AVX1-LABEL: 'pairwise_reduction8float' ; AVX1-LABEL: 'pairwise_reduction8float'
@ -982,7 +974,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; AVX2-LABEL: 'pairwise_reduction8float' ; AVX2-LABEL: 'pairwise_reduction8float'
@ -995,7 +987,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
; SLM-LABEL: 'pairwise_reduction8float' ; SLM-LABEL: 'pairwise_reduction8float'
@ -1008,7 +1000,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r
; ;
%rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef> %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
@ -1030,14 +1022,14 @@ define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
; SLM-LABEL: 'pairwise_reduction2i64' ; SLM-LABEL: 'pairwise_reduction2i64'
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
%rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef> %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
@ -1056,7 +1048,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; SLM-LABEL: 'pairwise_reduction4i32' ; SLM-LABEL: 'pairwise_reduction4i32'
@ -1066,7 +1058,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
%rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@ -1088,7 +1080,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
; AVX1-LABEL: 'pairwise_reduction4i64' ; AVX1-LABEL: 'pairwise_reduction4i64'
@ -1098,7 +1090,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
; AVX2-LABEL: 'pairwise_reduction4i64' ; AVX2-LABEL: 'pairwise_reduction4i64'
@ -1108,7 +1100,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
; SLM-LABEL: 'pairwise_reduction4i64' ; SLM-LABEL: 'pairwise_reduction4i64'
@ -1118,7 +1110,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
; ;
%rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@ -1143,7 +1135,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
; SSSE3-LABEL: 'pairwise_reduction8i16' ; SSSE3-LABEL: 'pairwise_reduction8i16'
@ -1156,7 +1148,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
; SSE42-LABEL: 'pairwise_reduction8i16' ; SSE42-LABEL: 'pairwise_reduction8i16'
@ -1169,7 +1161,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
; AVX-LABEL: 'pairwise_reduction8i16' ; AVX-LABEL: 'pairwise_reduction8i16'
@ -1182,7 +1174,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
; SLM-LABEL: 'pairwise_reduction8i16' ; SLM-LABEL: 'pairwise_reduction8i16'
@ -1195,7 +1187,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
; ;
%rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef> %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
@ -1223,7 +1215,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; AVX1-LABEL: 'pairwise_reduction8i32' ; AVX1-LABEL: 'pairwise_reduction8i32'
@ -1236,7 +1228,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; AVX2-LABEL: 'pairwise_reduction8i32' ; AVX2-LABEL: 'pairwise_reduction8i32'
@ -1249,7 +1241,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
; SLM-LABEL: 'pairwise_reduction8i32' ; SLM-LABEL: 'pairwise_reduction8i32'
@ -1262,7 +1254,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
; ;
%rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef> %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>