From 38c9a4068df5c44433da3fc15789cef8ce71367d Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Fri, 9 Jul 2021 11:51:16 +0100
Subject: [PATCH] [TTI] Remove IsPairwiseForm from getArithmeticReductionCost

This patch removes the IsPairwiseForm flag from the Reduction Cost TTI
hooks, along with some accompanying code for pattern matching reductions
from trees starting at extract elements. IsPairwiseForm is now assumed
to be false, which was the predominant way that the value was used from
both the Loop and SLP vectorizers. Since adjustments such as D93860, the
SLP vectorizer has not relied upon this distinction between pairwise and
non-pairwise reductions.

This also removes some code that was detecting reduction trees starting
from extract elements inside the cost model. This case was
double-counting costs though, adding the individual costs of the
individual instructions _and_ the total cost of the reduction. Removing
it changes the costs in llvm/test/Analysis/CostModel/X86/reduction.ll to
no longer double count. The cost of reduction intrinsics is still tested
through the various tests in
llvm/test/Analysis/CostModel/X86/reduce-xyz.ll.

Differential Revision: https://reviews.llvm.org/D105484
---
 .../llvm/Analysis/TargetTransformInfo.h       |  65 +---
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  24 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  60 +---
 llvm/lib/Analysis/TargetTransformInfo.cpp     | 296 +-----------------
 .../AArch64/AArch64TargetTransformInfo.cpp    |  21 +-
 .../AArch64/AArch64TargetTransformInfo.h      |   5 +-
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  16 +-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |   4 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |   7 +-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h  |   1 -
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  23 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.h  |   4 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  12 +-
 llvm/test/Analysis/CostModel/X86/reduction.ll | 222 +++++++------
 15 files changed, 161 insertions(+), 603 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a9b103b54576..b4b3736c36d5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -865,40 +865,6 @@ public:
                          ///< a vector of the same type as the input vectors.
   };
 
-  /// Kind of the reduction data.
-  enum ReductionKind {
-    RK_None,           /// Not a reduction.
-    RK_Arithmetic,     /// Binary reduction data.
-    RK_MinMax,         /// Min/max reduction data.
-    RK_UnsignedMinMax, /// Unsigned min/max reduction data.
-  };
-
-  /// Contains opcode + LHS/RHS parts of the reduction operations.
-  struct ReductionData {
-    ReductionData() = delete;
-    ReductionData(ReductionKind Kind, unsigned Opcode, Value *LHS, Value *RHS)
-        : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) {
-      assert(Kind != RK_None && "expected binary or min/max reduction only.");
-    }
-    unsigned Opcode = 0;
-    Value *LHS = nullptr;
-    Value *RHS = nullptr;
-    ReductionKind Kind = RK_None;
-    bool hasSameData(ReductionData &RD) const {
-      return Kind == RD.Kind && Opcode == RD.Opcode;
-    }
-  };
-
-  static ReductionKind matchPairwiseReduction(
-    const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty);
-
-  static ReductionKind matchVectorSplittingReduction(
-    const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty);
-
-  static ReductionKind matchVectorReduction(const ExtractElementInst *ReduxRoot,
-                                            unsigned &Opcode, VectorType *&Ty,
-                                            bool &IsPairwise);
-
   /// Additional information about an operand's possible values.
   enum OperandValueKind {
     OK_AnyValue,               // Operand can have any value.
@@ -1180,25 +1146,16 @@ public:
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
       bool UseMaskForCond = false, bool UseMaskForGaps = false) const;
 
-  /// Calculate the cost of performing a vector reduction.
+  /// Calculate the cost of vector reduction intrinsics.
   ///
   /// This is the cost of reducing the vector value of type \p Ty to a scalar
-  /// value using the operation denoted by \p Opcode. The form of the reduction
-  /// can either be a pairwise reduction or a reduction that splits the vector
-  /// at every reduction level.
-  ///
-  /// Pairwise:
-  ///  (v0, v1, v2, v3)
-  ///  ((v0+v1), (v2+v3), undef, undef)
-  /// Split:
-  ///  (v0, v1, v2, v3)
-  ///  ((v0+v2), (v1+v3), undef, undef)
+  /// value using the operation denoted by \p Opcode.
   InstructionCost getArithmeticReductionCost(
-      unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
+      unsigned Opcode, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
   InstructionCost getMinMaxReductionCost(
-      VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
+      VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
   /// Calculate the cost of an extended reduction pattern, similar to
@@ -1661,11 +1618,9 @@ public:
       bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0;
   virtual InstructionCost
   getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
-                             bool IsPairwiseForm,
                              TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost
-  getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
-                         bool IsPairwiseForm, bool IsUnsigned,
+  getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
                          TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost getExtendedAddReductionCost(
       bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
@@ -2164,17 +2119,13 @@ public:
   }
   InstructionCost
   getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
-                             bool IsPairwiseForm,
                              TTI::TargetCostKind CostKind) override {
-    return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm,
-                                           CostKind);
+    return Impl.getArithmeticReductionCost(Opcode, Ty, CostKind);
   }
   InstructionCost
-  getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
-                         bool IsPairwiseForm, bool IsUnsigned,
+  getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
                          TTI::TargetCostKind CostKind) override {
-    return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned,
-                                       CostKind);
+    return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
   }
   InstructionCost getExtendedAddReductionCost(
       bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index eff4a7cdb85b..6c7e59d08853 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -621,12 +621,12 @@ public:
     return 0;
   }
 
-  InstructionCost getArithmeticReductionCost(unsigned, VectorType *, bool,
+  InstructionCost getArithmeticReductionCost(unsigned, VectorType *,
                                              TTI::TargetCostKind) const {
     return 1;
   }
 
-  InstructionCost getMinMaxReductionCost(VectorType *, VectorType *, bool, bool,
+  InstructionCost getMinMaxReductionCost(VectorType *, VectorType *, bool,
                                          TTI::TargetCostKind) const {
     return 1;
   }
@@ -1112,26 +1112,6 @@ public:
       if (CI)
         Idx = CI->getZExtValue();
 
-      // Try to match a reduction (a series of shufflevector and vector ops
-      // followed by an extractelement).
-      unsigned RdxOpcode;
-      VectorType *RdxType;
-      bool IsPairwise;
-      switch (TTI::matchVectorReduction(EEI, RdxOpcode, RdxType, IsPairwise)) {
-      case TTI::RK_Arithmetic:
-        return TargetTTI->getArithmeticReductionCost(RdxOpcode, RdxType,
-                                                     IsPairwise, CostKind);
-      case TTI::RK_MinMax:
-        return TargetTTI->getMinMaxReductionCost(
-            RdxType, cast<VectorType>(CmpInst::makeCmpResultType(RdxType)),
-            IsPairwise, /*IsUnsigned=*/false, CostKind);
-      case TTI::RK_UnsignedMinMax:
-        return TargetTTI->getMinMaxReductionCost(
-            RdxType, cast<VectorType>(CmpInst::makeCmpResultType(RdxType)),
-            IsPairwise, /*IsUnsigned=*/true, CostKind);
-      case TTI::RK_None:
-        break;
-      }
       return TargetTTI->getVectorInstrCost(Opcode, U->getOperand(0)->getType(),
                                            Idx);
     }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 35c439c12b20..66e0f5f2e7a9 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1654,33 +1654,26 @@ public:
     }
     case Intrinsic::vector_reduce_add:
       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
-                                                 /*IsPairwiseForm=*/false,
                                                  CostKind);
     case Intrinsic::vector_reduce_mul:
       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
-                                                 /*IsPairwiseForm=*/false,
                                                  CostKind);
     case Intrinsic::vector_reduce_and:
       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
-                                                 /*IsPairwiseForm=*/false,
                                                  CostKind);
     case Intrinsic::vector_reduce_or:
       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
-                                                 /*IsPairwiseForm=*/false,
                                                  CostKind);
     case Intrinsic::vector_reduce_xor:
       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
-                                                 /*IsPairwiseForm=*/false,
                                                  CostKind);
     case Intrinsic::vector_reduce_fadd:
       // FIXME: Add new flag for cost of strict reductions.
       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
-                                                 /*IsPairwiseForm=*/false,
                                                  CostKind);
     case Intrinsic::vector_reduce_fmul:
       // FIXME: Add new flag for cost of strict reductions.
       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
-                                                 /*IsPairwiseForm=*/false,
                                                  CostKind);
     case Intrinsic::vector_reduce_smax:
     case Intrinsic::vector_reduce_smin:
@@ -1688,13 +1681,11 @@ public:
     case Intrinsic::vector_reduce_fmin:
       return thisT()->getMinMaxReductionCost(
           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
-          /*IsPairwiseForm=*/false,
           /*IsUnsigned=*/false, CostKind);
     case Intrinsic::vector_reduce_umax:
     case Intrinsic::vector_reduce_umin:
       return thisT()->getMinMaxReductionCost(
           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
-          /*IsPairwiseForm=*/false,
           /*IsUnsigned=*/true, CostKind);
     case Intrinsic::abs:
     case Intrinsic::smax:
@@ -1998,9 +1989,9 @@ public:
     return 0;
   }
 
-  /// Try to calculate arithmetic and shuffle op costs for reduction operations.
+  /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
   /// We're assuming that reduction operation are performing the following way:
-  /// 1. Non-pairwise reduction
+  ///
   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
   ///            \----------------v-------------/  \----------v------------/
@@ -2016,25 +2007,10 @@ public:
   ///                            n/4 elements               3*n/4 elements
   /// %red2 = op <n x t> %red1, <n x t> val2  - working with the vector of
   /// length n/2, the resulting vector has length n/4 etc.
-  /// 2. Pairwise reduction:
-  /// Everything is the same except for an additional shuffle operation which
-  /// is used to produce operands for pairwise kind of reductions.
-  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
-  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
-  ///            \-------------v----------/  \----------v------------/
-  ///                   n/2 elements               n/2 elements
-  /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
-  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
-  ///            \-------------v----------/  \----------v------------/
-  ///                   n/2 elements               n/2 elements
-  /// %red1 = op <n x t> %val1, <n x t> val2
-  /// Again, the operation is performed on <n x t> vector, but the resulting
-  /// vector %red1 is <n/2 x t> vector.
   ///
   /// The cost model should take into account that the actual length of the
   /// vector is reduced on each iteration.
   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
-                                             bool IsPairwise,
                                              TTI::TargetCostKind CostKind) {
     Type *ScalarTy = Ty->getElementType();
     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
@@ -2065,9 +2041,7 @@ public:
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
-      // Assume the pairwise shuffles add a cost.
-      ShuffleCost += (IsPairwise + 1) *
-                     thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
+      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
                                              NumVecElts, SubTy);
       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
       Ty = SubTy;
@@ -2081,13 +2055,8 @@ public:
     // reduction operations are performed on the vectors with the same
     // architecture-dependent length.
 
-    // Non pairwise reductions need one shuffle per reduction level. Pairwise
-    // reductions need two shuffles on every level, but the last one. On that
-    // level one of the shuffles is <0, u, u, ...> which is identity.
-    unsigned NumShuffles = NumReduxLevels;
-    if (IsPairwise && NumReduxLevels >= 1)
-      NumShuffles += NumReduxLevels - 1;
-    ShuffleCost += NumShuffles * thisT()->getShuffleCost(
+    // By default reductions need one shuffle per reduction level.
+    ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
     ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
     return ShuffleCost + ArithCost +
@@ -2097,7 +2066,7 @@ public:
   /// Try to calculate op costs for min/max reduction operations.
   /// \param CondTy Conditional type for the Select instruction.
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
-                                         bool IsPairwise, bool IsUnsigned,
+                                         bool IsUnsigned,
                                          TTI::TargetCostKind CostKind) {
     Type *ScalarTy = Ty->getElementType();
     Type *ScalarCondTy = CondTy->getElementType();
@@ -2123,9 +2092,7 @@ public:
       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
 
-      // Assume the pairwise shuffles add a cost.
-      ShuffleCost += (IsPairwise + 1) *
-                     thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
+      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
                                              NumVecElts, SubTy);
       MinMaxCost +=
           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
@@ -2142,14 +2109,7 @@ public:
     // operations performed on the current platform. That's why several final
     // reduction opertions are perfomed on the vectors with the same
     // architecture-dependent length.
-
-    // Non pairwise reductions need one shuffle per reduction level. Pairwise
-    // reductions need two shuffles on every level, but the last one. On that
-    // level one of the shuffles is <0, u, u, ...> which is identity.
-    unsigned NumShuffles = NumReduxLevels;
-    if (IsPairwise && NumReduxLevels >= 1)
-      NumShuffles += NumReduxLevels - 1;
-    ShuffleCost += NumShuffles * thisT()->getShuffleCost(
+    ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
     MinMaxCost +=
         NumReduxLevels *
@@ -2169,8 +2129,8 @@ public:
     // Without any native support, this is equivalent to the cost of
     // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
     VectorType *ExtTy = VectorType::get(ResTy, Ty);
-    InstructionCost RedCost = thisT()->getArithmeticReductionCost(
-        Instruction::Add, ExtTy, false, CostKind);
+    InstructionCost RedCost =
+        thisT()->getArithmeticReductionCost(Instruction::Add, ExtTy, CostKind);
     InstructionCost MulCost = 0;
     InstructionCost ExtCost = thisT()->getCastInstrCost(
         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3b89a9807863..7e68b8f754ba 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -894,19 +894,18 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
 }
 
 InstructionCost TargetTransformInfo::getArithmeticReductionCost(
-    unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
-    TTI::TargetCostKind CostKind) const {
+    unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const {
   InstructionCost Cost =
-      TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm, CostKind);
+      TTIImpl->getArithmeticReductionCost(Opcode, Ty, CostKind);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
 
 InstructionCost TargetTransformInfo::getMinMaxReductionCost(
-    VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
+    VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
     TTI::TargetCostKind CostKind) const {
-  InstructionCost Cost = TTIImpl->getMinMaxReductionCost(
-      Ty, CondTy, IsPairwiseForm, IsUnsigned, CostKind);
+  InstructionCost Cost =
+      TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -1057,291 +1056,6 @@ TargetTransformInfo::getInstructionLatency(const Instruction *I) const {
   return TTIImpl->getInstructionLatency(I);
 }
 
-static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
-                                     unsigned Level) {
-  // We don't need a shuffle if we just want to have element 0 in position 0 of
-  // the vector.
-  if (!SI && Level == 0 && IsLeft)
-    return true;
-  else if (!SI)
-    return false;
-
-  SmallVector<int, 32> Mask(
-      cast<FixedVectorType>(SI->getType())->getNumElements(), -1);
-
-  // Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether
-  // we look at the left or right side.
-  for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
-    Mask[i] = val;
-
-  ArrayRef<int> ActualMask = SI->getShuffleMask();
-  return Mask == ActualMask;
-}
-
-static Optional<TTI::ReductionData> getReductionData(Instruction *I) {
-  Value *L, *R;
-  if (m_BinOp(m_Value(L), m_Value(R)).match(I))
-    return TTI::ReductionData(TTI::RK_Arithmetic, I->getOpcode(), L, R);
-  if (auto *SI = dyn_cast<SelectInst>(I)) {
-    if (m_SMin(m_Value(L), m_Value(R)).match(SI) ||
-        m_SMax(m_Value(L), m_Value(R)).match(SI) ||
-        m_OrdFMin(m_Value(L), m_Value(R)).match(SI) ||
-        m_OrdFMax(m_Value(L), m_Value(R)).match(SI) ||
-        m_UnordFMin(m_Value(L), m_Value(R)).match(SI) ||
-        m_UnordFMax(m_Value(L), m_Value(R)).match(SI)) {
-      auto *CI = cast<CmpInst>(SI->getCondition());
-      return TTI::ReductionData(TTI::RK_MinMax, CI->getOpcode(), L, R);
-    }
-    if (m_UMin(m_Value(L), m_Value(R)).match(SI) ||
-        m_UMax(m_Value(L), m_Value(R)).match(SI)) {
-      auto *CI = cast<CmpInst>(SI->getCondition());
-      return TTI::ReductionData(TTI::RK_UnsignedMinMax, CI->getOpcode(), L, R);
-    }
-  }
-  return llvm::None;
-}
-
-static TTI::ReductionKind matchPairwiseReductionAtLevel(Instruction *I,
-                                                        unsigned Level,
-                                                        unsigned NumLevels) {
-  // Match one level of pairwise operations.
-  // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
-  //       <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
-  // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
-  //       <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
-  // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-  if (!I)
-    return TTI::RK_None;
-
-  assert(I->getType()->isVectorTy() && "Expecting a vector type");
-
-  Optional<TTI::ReductionData> RD = getReductionData(I);
-  if (!RD)
-    return TTI::RK_None;
-
-  ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(RD->LHS);
-  if (!LS && Level)
-    return TTI::RK_None;
-  ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(RD->RHS);
-  if (!RS && Level)
-    return TTI::RK_None;
-
-  // On level 0 we can omit one shufflevector instruction.
-  if (!Level && !RS && !LS)
-    return TTI::RK_None;
-
-  // Shuffle inputs must match.
-  Value *NextLevelOpL = LS ? LS->getOperand(0) : nullptr;
-  Value *NextLevelOpR = RS ? RS->getOperand(0) : nullptr;
-  Value *NextLevelOp = nullptr;
-  if (NextLevelOpR && NextLevelOpL) {
-    // If we have two shuffles their operands must match.
-    if (NextLevelOpL != NextLevelOpR)
-      return TTI::RK_None;
-
-    NextLevelOp = NextLevelOpL;
-  } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) {
-    // On the first level we can omit the shufflevector <0, undef,...>. So the
-    // input to the other shufflevector <1, undef> must match with one of the
-    // inputs to the current binary operation.
-    // Example:
-    //  %NextLevelOpL = shufflevector %R, <1, undef ...>
-    //  %BinOp        = fadd          %NextLevelOpL, %R
-    if (NextLevelOpL && NextLevelOpL != RD->RHS)
-      return TTI::RK_None;
-    else if (NextLevelOpR && NextLevelOpR != RD->LHS)
-      return TTI::RK_None;
-
-    NextLevelOp = NextLevelOpL ? RD->RHS : RD->LHS;
-  } else
-    return TTI::RK_None;
-
-  // Check that the next levels binary operation exists and matches with the
-  // current one.
-  if (Level + 1 != NumLevels) {
-    if (!isa<Instruction>(NextLevelOp))
-      return TTI::RK_None;
-    Optional<TTI::ReductionData> NextLevelRD =
-        getReductionData(cast<Instruction>(NextLevelOp));
-    if (!NextLevelRD || !RD->hasSameData(*NextLevelRD))
-      return TTI::RK_None;
-  }
-
-  // Shuffle mask for pairwise operation must match.
-  if (matchPairwiseShuffleMask(LS, /*IsLeft=*/true, Level)) {
-    if (!matchPairwiseShuffleMask(RS, /*IsLeft=*/false, Level))
-      return TTI::RK_None;
-  } else if (matchPairwiseShuffleMask(RS, /*IsLeft=*/true, Level)) {
-    if (!matchPairwiseShuffleMask(LS, /*IsLeft=*/false, Level))
-      return TTI::RK_None;
-  } else {
-    return TTI::RK_None;
-  }
-
-  if (++Level == NumLevels)
-    return RD->Kind;
-
-  // Match next level.
-  return matchPairwiseReductionAtLevel(dyn_cast<Instruction>(NextLevelOp), Level,
-                                       NumLevels);
-}
-
-TTI::ReductionKind TTI::matchPairwiseReduction(
-  const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty) {
-  if (!EnableReduxCost)
-    return TTI::RK_None;
-
-  // Need to extract the first element.
-  ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
-  unsigned Idx = ~0u;
-  if (CI)
-    Idx = CI->getZExtValue();
-  if (Idx != 0)
-    return TTI::RK_None;
-
-  auto *RdxStart = dyn_cast<Instruction>(ReduxRoot->getOperand(0));
-  if (!RdxStart)
-    return TTI::RK_None;
-  Optional<TTI::ReductionData> RD = getReductionData(RdxStart);
-  if (!RD)
-    return TTI::RK_None;
-
-  auto *VecTy = cast<FixedVectorType>(RdxStart->getType());
-  unsigned NumVecElems = VecTy->getNumElements();
-  if (!isPowerOf2_32(NumVecElems))
-    return TTI::RK_None;
-
-  // We look for a sequence of shuffle,shuffle,add triples like the following
-  // that builds a pairwise reduction tree.
-  //
-  //  (X0, X1, X2, X3)
-  //   (X0 + X1, X2 + X3, undef, undef)
-  //    ((X0 + X1) + (X2 + X3), undef, undef, undef)
-  //
-  // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
-  //       <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
-  // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
-  //       <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
-  // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
-  // %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
-  //       <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-  // %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
-  //       <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-  // %r = extractelement <4 x float> %bin.rdx8, i32 0
-  if (matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)) ==
-      TTI::RK_None)
-    return TTI::RK_None;
-
-  Opcode = RD->Opcode;
-  Ty = VecTy;
-
-  return RD->Kind;
-}
-
-static std::pair<Value *, ShuffleVectorInst *>
-getShuffleAndOtherOprd(Value *L, Value *R) {
-  ShuffleVectorInst *S = nullptr;
-
-  if ((S = dyn_cast<ShuffleVectorInst>(L)))
-    return std::make_pair(R, S);
-
-  S = dyn_cast<ShuffleVectorInst>(R);
-  return std::make_pair(L, S);
-}
-
-TTI::ReductionKind TTI::matchVectorSplittingReduction(
-  const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty) {
-
-  if (!EnableReduxCost)
-    return TTI::RK_None;
-
-  // Need to extract the first element.
-  ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
-  unsigned Idx = ~0u;
-  if (CI)
-    Idx = CI->getZExtValue();
-  if (Idx != 0)
-    return TTI::RK_None;
-
-  auto *RdxStart = dyn_cast<Instruction>(ReduxRoot->getOperand(0));
-  if (!RdxStart)
-    return TTI::RK_None;
-  Optional<TTI::ReductionData> RD = getReductionData(RdxStart);
-  if (!RD)
-    return TTI::RK_None;
-
-  auto *VecTy = cast<FixedVectorType>(ReduxRoot->getOperand(0)->getType());
-  unsigned NumVecElems = VecTy->getNumElements();
-  if (!isPowerOf2_32(NumVecElems))
-    return TTI::RK_None;
-
-  // We look for a sequence of shuffles and adds like the following matching one
-  // fadd, shuffle vector pair at a time.
-  //
-  // %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,
-  //                           <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-  // %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
-  // %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
-  //                          <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  // %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-  // %r = extractelement <4 x float> %bin.rdx8, i32 0
-
-  unsigned MaskStart = 1;
-  Instruction *RdxOp = RdxStart;
-  SmallVector<int, 32> ShuffleMask(NumVecElems, 0);
-  unsigned NumVecElemsRemain = NumVecElems;
-  while (NumVecElemsRemain - 1) {
-    // Check for the right reduction operation.
-    if (!RdxOp)
-      return TTI::RK_None;
-    Optional<TTI::ReductionData> RDLevel = getReductionData(RdxOp);
-    if (!RDLevel || !RDLevel->hasSameData(*RD))
-      return TTI::RK_None;
-
-    Value *NextRdxOp;
-    ShuffleVectorInst *Shuffle;
-    std::tie(NextRdxOp, Shuffle) =
-        getShuffleAndOtherOprd(RDLevel->LHS, RDLevel->RHS);
-
-    // Check the current reduction operation and the shuffle use the same value.
-    if (Shuffle == nullptr)
-      return TTI::RK_None;
-    if (Shuffle->getOperand(0) != NextRdxOp)
-      return TTI::RK_None;
-
-    // Check that shuffle masks matches.
-    for (unsigned j = 0; j != MaskStart; ++j)
-      ShuffleMask[j] = MaskStart + j;
-    // Fill the rest of the mask with -1 for undef.
-    std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1);
-
-    ArrayRef<int> Mask = Shuffle->getShuffleMask();
-    if (ShuffleMask != Mask)
-      return TTI::RK_None;
-
-    RdxOp = dyn_cast<Instruction>(NextRdxOp);
-    NumVecElemsRemain /= 2;
-    MaskStart *= 2;
-  }
-
-  Opcode = RD->Opcode;
-  Ty = VecTy;
-  return RD->Kind;
-}
-
-TTI::ReductionKind
-TTI::matchVectorReduction(const ExtractElementInst *Root, unsigned &Opcode,
-                          VectorType *&Ty, bool &IsPairwise) {
-  TTI::ReductionKind RdxKind = matchVectorSplittingReduction(Root, Opcode, Ty);
-  if (RdxKind != TTI::ReductionKind::RK_None) {
-    IsPairwise = false;
-    return RdxKind;
-  }
-  IsPairwise = true;
-  return matchPairwiseReduction(Root, Opcode, Ty);
-}
-
 InstructionCost
 TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6005852f1710..af34a5ea9486 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1759,11 +1759,10 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction(
 
 InstructionCost
 AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
-                                       bool IsPairwise, bool IsUnsigned,
+                                       bool IsUnsigned,
                                        TTI::TargetCostKind CostKind) {
   if (!isa<ScalableVectorType>(Ty))
-    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
-                                         CostKind);
+    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
   assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) &&
          "Both vector needs to be scalable");
 
@@ -1785,10 +1784,7 @@ AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
 }
 
 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
-    unsigned Opcode, VectorType *ValTy, bool IsPairwise,
-    TTI::TargetCostKind CostKind) {
-  assert(!IsPairwise && "Cannot be pair wise to continue");
-
+    unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
   InstructionCost LegalizationCost = 0;
   if (LT.first > 1) {
@@ -1814,15 +1810,9 @@ InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
 
 InstructionCost
 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
-                                           bool IsPairwiseForm,
                                            TTI::TargetCostKind CostKind) {
-
   if (isa<ScalableVectorType>(ValTy))
-    return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm,
-                                         CostKind);
-  if (IsPairwiseForm)
-    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
-                                             CostKind);
+    return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
 
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
   MVT MTy = LT.second;
@@ -1894,8 +1884,7 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
     }
     break;
   }
-  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
-                                           CostKind);
+  return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
 }
 
 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 83956fbbe05b..1de650bd3220 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -158,12 +158,11 @@ public:
                                      unsigned Index);
 
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
-                                         bool IsPairwise, bool IsUnsigned,
+                                         bool IsUnsigned,
                                          TTI::TargetCostKind CostKind);
 
   InstructionCost getArithmeticReductionCostSVE(unsigned Opcode,
                                                 VectorType *ValTy,
-                                                bool IsPairwiseForm,
                                                 TTI::TargetCostKind CostKind);
 
   InstructionCost getSpliceCost(VectorType *Tp, int Index);
@@ -306,7 +305,7 @@ public:
                                    ElementCount VF) const;
 
   InstructionCost getArithmeticReductionCost(
-      unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
+      unsigned Opcode, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
 
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 67886b9681ae..f5a34ec94421 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -843,16 +843,13 @@ InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
 
 InstructionCost
 GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
-                                       bool IsPairwise,
                                        TTI::TargetCostKind CostKind) {
   EVT OrigTy = TLI->getValueType(DL, Ty);
 
   // Computes cost on targets that have packed math instructions(which support
   // 16-bit types only).
-  if (IsPairwise ||
-      !ST->hasVOP3PInsts() ||
-      OrigTy.getScalarSizeInBits() != 16)
-    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
+  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
+    return BaseT::getArithmeticReductionCost(Opcode, Ty, CostKind);
 
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
   return LT.first * getFullRateInstrCost();
@@ -860,17 +857,14 @@ GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
 
 InstructionCost
 GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
-                                   bool IsPairwise, bool IsUnsigned,
+                                   bool IsUnsigned,
                                    TTI::TargetCostKind CostKind) {
   EVT OrigTy = TLI->getValueType(DL, Ty);
 
   // Computes cost on targets that have packed math instructions(which support
   // 16-bit types only).
-  if (IsPairwise ||
-      !ST->hasVOP3PInsts() ||
-      OrigTy.getScalarSizeInBits() != 16)
-    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
-                                         CostKind);
+  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
+    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
 
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
   return LT.first * getHalfRateInstrCost(CostKind);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index e4e816a7a74e..c6cf31b35cee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -212,13 +212,13 @@ public:
   int getInlinerVectorBonusPercent() { return 0; }
 
   InstructionCost getArithmeticReductionCost(
-      unsigned Opcode, VectorType *Ty, bool IsPairwise,
+      unsigned Opcode, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
 
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind);
   InstructionCost getMinMaxReductionCost(
-      VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
+      VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
 };
 
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index f8951434ed8c..762ed8e6666e 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1594,13 +1594,11 @@ InstructionCost ARMTTIImpl::getGatherScatterOpCost(
 
 InstructionCost
 ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
-                                       bool IsPairwiseForm,
                                        TTI::TargetCostKind CostKind) {
   EVT ValVT = TLI->getValueType(DL, ValTy);
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD)
-    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
-                                             CostKind);
+    return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
 
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 
@@ -1612,8 +1610,7 @@ ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
     return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
 
-  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
-                                           CostKind);
+  return BaseT::getArithmeticReductionCost(Opcode, ValTy, CostKind);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index ae79832ed316..989f01006cfe 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -257,7 +257,6 @@ public:
                                          const Instruction *I = nullptr);
 
   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
-                                             bool IsPairwiseForm,
                                              TTI::TargetCostKind CostKind);
   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                               Type *ResTy, VectorType *ValTy,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ff8d2d57e8f1..1d802e04198a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3737,12 +3737,7 @@ InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
 
 InstructionCost
 X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
-                                       bool IsPairwise,
                                        TTI::TargetCostKind CostKind) {
-  // Just use the default implementation for pair reductions.
-  if (IsPairwise)
-    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
-
   // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
   // and make it as the cost.
 
@@ -3813,7 +3808,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
     return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
                             TargetTransformInfo::CastContextHint::None,
                             CostKind) +
-           getArithmeticReductionCost(Opcode, WideVecTy, IsPairwise, CostKind);
+           getArithmeticReductionCost(Opcode, WideVecTy, CostKind);
   }
 
   InstructionCost ArithmeticCost = 0;
@@ -3909,8 +3904,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
       if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
         return ArithmeticCost + Entry->Cost;
 
-    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
-                                             CostKind);
+    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind);
   }
 
   unsigned NumVecElts = ValVTy->getNumElements();
@@ -3919,8 +3913,7 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   // Special case power of 2 reductions where the scalar type isn't changed
   // by type legalization.
   if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
-    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
-                                             CostKind);
+    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, CostKind);
 
   InstructionCost ReductionCost = 0;
 
@@ -4118,13 +4111,8 @@ InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
 
 InstructionCost
 X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
-                                   bool IsPairwise, bool IsUnsigned,
+                                   bool IsUnsigned,
                                    TTI::TargetCostKind CostKind) {
-  // Just use the default implementation for pair reductions.
-  if (IsPairwise)
-    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
-                                         CostKind);
-
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 
   MVT MTy = LT.second;
@@ -4240,8 +4228,7 @@ X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
   // by type legalization.
   if (!isPowerOf2_32(ValVTy->getNumElements()) ||
       ScalarSize != MTy.getScalarSizeInBits())
-    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
-                                         CostKind);
+    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);
 
   // Now handle reduction with the legal type, taking into account size changes
   // at each level.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 6d95119992b6..ae6c4ad21140 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -181,13 +181,13 @@ public:
                                         TTI::TargetCostKind CostKind);
 
   InstructionCost getArithmeticReductionCost(
-      unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
+      unsigned Opcode, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
 
   InstructionCost getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
 
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
-                                         bool IsPairwiseForm, bool IsUnsigned,
+                                         bool IsUnsigned,
                                          TTI::TargetCostKind CostKind);
 
   InstructionCost getInterleavedMemoryOpCost(
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e4fb3055f041..13243f77dd45 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7176,8 +7176,8 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
 
   const RecurrenceDescriptor &RdxDesc =
       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
-  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
-      RdxDesc.getOpcode(), VectorTy, false, CostKind);
+  InstructionCost BaseCost =
+      TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), VectorTy, CostKind);
 
   // Get the operand that was not the reduction chain and match it to one of the
   // patterns, returning the better cost if it is found.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 01f6873388cd..2ec32156a15f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7903,17 +7903,15 @@ private:
     case RecurKind::FAdd:
     case RecurKind::FMul: {
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
-      VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
-                                                   /*IsPairwiseForm=*/false);
+      VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy);
       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
       break;
     }
     case RecurKind::FMax:
     case RecurKind::FMin: {
       auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
-      VectorCost =
-          TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
-                                      /*pairwise=*/false, /*unsigned=*/false);
+      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+                                               /*unsigned=*/false);
       ScalarCost =
           TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
           TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
@@ -7927,9 +7925,7 @@ private:
       auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
       bool IsUnsigned =
           RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
-      VectorCost =
-          TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
-                                      /*IsPairwiseForm=*/false, IsUnsigned);
+      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned);
       ScalarCost =
           TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
           TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index 1812a074f86b..f52a6a384a48 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -7,9 +7,7 @@
 
 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM
 
-; Check that we recognize the tree starting at the extractelement as a
-; reduction.
-; NOTE: We're only really interested in the extractelement cost, which represents the entire reduction.
+; These tests were written to check that the cost model matched a reduction tree rooted at an extractelement; that matching has been removed, so each instruction is now costed on its own.
 
 define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; SSE2-LABEL: 'reduction_cost_float'
@@ -17,7 +15,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'reduction_cost_float'
@@ -25,7 +23,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'reduction_cost_float'
@@ -33,7 +31,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; AVX-LABEL: 'reduction_cost_float'
@@ -41,7 +39,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SLM-LABEL: 'reduction_cost_float'
@@ -49,7 +47,7 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
   %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -69,7 +67,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; AVX1-LABEL: 'reduction_cost_int'
@@ -79,7 +77,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; AVX2-LABEL: 'reduction_cost_int'
@@ -89,7 +87,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SLM-LABEL: 'reduction_cost_int'
@@ -99,7 +97,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
   %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
@@ -127,7 +125,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -138,7 +136,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -149,7 +147,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -160,7 +158,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -171,7 +169,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -199,7 +197,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -210,7 +208,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -221,7 +219,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -232,7 +230,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -243,7 +241,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -270,7 +268,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -280,7 +278,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -290,7 +288,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -300,7 +298,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -310,7 +308,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r2
 ;
@@ -332,31 +330,31 @@ define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1
 ; SSE2-LABEL: 'no_pairwise_reduction2double'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction2double'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction2double'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; AVX-LABEL: 'no_pairwise_reduction2double'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction2double'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
   %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -372,7 +370,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction4float'
@@ -380,7 +378,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction4float'
@@ -388,7 +386,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; AVX-LABEL: 'no_pairwise_reduction4float'
@@ -396,7 +394,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction4float'
@@ -404,7 +402,7 @@ define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
   %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -422,7 +420,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction4double'
@@ -430,7 +428,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction4double'
@@ -438,7 +436,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction4double'
@@ -446,7 +444,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; AVX2-LABEL: 'no_pairwise_reduction4double'
@@ -454,7 +452,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction4double'
@@ -462,7 +460,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
   %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -482,7 +480,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction8float'
@@ -492,7 +490,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction8float'
@@ -502,7 +500,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction8float'
@@ -512,7 +510,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; AVX2-LABEL: 'no_pairwise_reduction8float'
@@ -522,7 +520,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction8float'
@@ -532,7 +530,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
   %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
@@ -547,22 +545,16 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 }
 
 define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
-; SSE-LABEL: 'no_pairwise_reduction2i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
-; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
-;
-; AVX-LABEL: 'no_pairwise_reduction2i64'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
+; CHECK-LABEL: 'no_pairwise_reduction2i64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction2i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
   %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
@@ -578,7 +570,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction4i32'
@@ -586,7 +578,7 @@ define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
   %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -604,7 +596,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction4i64'
@@ -612,7 +604,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; AVX2-LABEL: 'no_pairwise_reduction4i64'
@@ -620,7 +612,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction4i64'
@@ -628,7 +620,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
   %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -648,7 +640,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction8i16'
@@ -658,7 +650,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSE42-LABEL: 'no_pairwise_reduction8i16'
@@ -668,7 +660,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; AVX-LABEL: 'no_pairwise_reduction8i16'
@@ -678,7 +670,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction8i16'
@@ -688,7 +680,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
@@ -710,7 +702,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; AVX1-LABEL: 'no_pairwise_reduction8i32'
@@ -720,7 +712,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; AVX2-LABEL: 'no_pairwise_reduction8i32'
@@ -730,7 +722,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SLM-LABEL: 'no_pairwise_reduction8i32'
@@ -740,7 +732,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
   %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
@@ -759,35 +751,35 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction2double'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction2double'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; AVX-LABEL: 'pairwise_reduction2double'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SLM-LABEL: 'pairwise_reduction2double'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
   %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
@@ -806,7 +798,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction4float'
@@ -816,7 +808,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction4float'
@@ -826,7 +818,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; AVX-LABEL: 'pairwise_reduction4float'
@@ -836,7 +828,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SLM-LABEL: 'pairwise_reduction4float'
@@ -846,7 +838,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
   %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -868,7 +860,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction4double'
@@ -878,7 +870,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction4double'
@@ -888,7 +880,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction4double'
@@ -898,7 +890,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; AVX2-LABEL: 'pairwise_reduction4double'
@@ -908,7 +900,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
 ; SLM-LABEL: 'pairwise_reduction4double'
@@ -918,7 +910,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %r
 ;
   %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -943,7 +935,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8float'
@@ -956,7 +948,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8float'
@@ -969,7 +961,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction8float'
@@ -982,7 +974,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; AVX2-LABEL: 'pairwise_reduction8float'
@@ -995,7 +987,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
 ; SLM-LABEL: 'pairwise_reduction8float'
@@ -1008,7 +1000,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
 ;
   %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1030,14 +1022,14 @@ define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction2i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
   %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
@@ -1056,7 +1048,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction4i32'
@@ -1066,7 +1058,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
   %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -1088,7 +1080,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction4i64'
@@ -1098,7 +1090,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; AVX2-LABEL: 'pairwise_reduction4i64'
@@ -1108,7 +1100,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction4i64'
@@ -1118,7 +1110,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r
 ;
   %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -1143,7 +1135,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8i16'
@@ -1156,7 +1148,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSE42-LABEL: 'pairwise_reduction8i16'
@@ -1169,7 +1161,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; AVX-LABEL: 'pairwise_reduction8i16'
@@ -1182,7 +1174,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction8i16'
@@ -1195,7 +1187,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
   %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1223,7 +1215,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; AVX1-LABEL: 'pairwise_reduction8i32'
@@ -1236,7 +1228,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; AVX2-LABEL: 'pairwise_reduction8i32'
@@ -1249,7 +1241,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
 ; SLM-LABEL: 'pairwise_reduction8i32'
@@ -1262,7 +1254,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
 ;
   %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>