[TTI] Remove default condition type and predicate arguments from getCmpSelInstrCost

We need to be better at exposing the comparison predicate to getCmpSelInstrCost calls as some targets (e.g. X86 SSE) have very different costs for different comparisons (PR48337), and we can't always rely on the optional Instruction argument. This initial commit requires explicit condition type and predicate arguments. The next step will be to review a lot of the existing getCmpSelInstrCost calls which have used BAD_ICMP_PREDICATE even when the predicate is known. Differential Revision: https://reviews.llvm.org/D111024
2021-10-06 15:37:05 +01:00 · 2021-10-06 15:37:05 +01:00 · 0dcd2b40e6
parent 49dbde9c9e
commit 0dcd2b40e6
9 changed files with 172 additions and 115 deletions
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@ -1103,8 +1103,8 @@ public:
  /// is using a compare with the specified predicate as condition. When vector
  /// types are passed, \p VecPred must be used for all lanes.
  InstructionCost
-  getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy = nullptr,
-                     CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE,
+  getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                     CmpInst::Predicate VecPred,
                     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
                     const Instruction *I = nullptr) const;

--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@ -348,6 +348,9 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                        SinkAndHoistLICMFlags *LICMFlags = nullptr,
                        OptimizationRemarkEmitter *ORE = nullptr);

+/// Returns the comparison predicate used when expanding a min/max reduction.
+CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
+
 /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
 /// The Builder's fast-math-flags must be set to propagate the expected values.
 Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@ -889,32 +889,28 @@ bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
  return true;
 }

-Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
-                            Value *Right) {
-  CmpInst::Predicate Pred;
+CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
  switch (RK) {
  default:
    llvm_unreachable("Unknown min/max recurrence kind");
  case RecurKind::UMin:
-    Pred = CmpInst::ICMP_ULT;
-    break;
+    return CmpInst::ICMP_ULT;
  case RecurKind::UMax:
-    Pred = CmpInst::ICMP_UGT;
-    break;
+    return CmpInst::ICMP_UGT;
  case RecurKind::SMin:
-    Pred = CmpInst::ICMP_SLT;
-    break;
+    return CmpInst::ICMP_SLT;
  case RecurKind::SMax:
-    Pred = CmpInst::ICMP_SGT;
-    break;
+    return CmpInst::ICMP_SGT;
  case RecurKind::FMin:
-    Pred = CmpInst::FCMP_OLT;
-    break;
+    return CmpInst::FCMP_OLT;
  case RecurKind::FMax:
-    Pred = CmpInst::FCMP_OGT;
-    break;
+    return CmpInst::FCMP_OGT;
  }
+}

+Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
+                            Value *Right) {
+  CmpInst::Predicate Pred = getMinMaxReductionPredicate(RK);
  Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp");
  Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
  return Select;
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -8579,28 +8579,32 @@ private:
    }
    case RecurKind::FMax:
    case RecurKind::FMin: {
+      auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
                                               /*unsigned=*/false, CostKind);
-      ScalarCost =
-          TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
-          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                  CmpInst::makeCmpResultType(ScalarTy));
+      CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+      ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
+                                           SclCondTy, RdxPred, CostKind) +
+                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                           SclCondTy, RdxPred, CostKind);
      break;
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
+      auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
      bool IsUnsigned =
          RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
                                               CostKind);
-      ScalarCost =
-          TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
-          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                  CmpInst::makeCmpResultType(ScalarTy));
+      CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+      ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+                                           SclCondTy, RdxPred, CostKind) +
+                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                           SclCondTy, RdxPred, CostKind);
      break;
    }
    default:
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@ -82,7 +82,7 @@ private:
                                        ExtractElementInst *Ext1,
                                        unsigned PreferredExtractIndex) const;
  bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
-                             unsigned Opcode,
+                             const Instruction &I,
                             ExtractElementInst *&ConvertToShuffle,
                             unsigned PreferredExtractIndex);
  void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
@ -299,12 +299,13 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
 /// \p ConvertToShuffle to that extract instruction.
 bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
-                                          unsigned Opcode,
+                                          const Instruction &I,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) {
  assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
         isa<ConstantInt>(Ext1->getOperand(1)) &&
         "Expected constant extract indexes");
+  unsigned Opcode = I.getOpcode();
  Type *ScalarTy = Ext0->getType();
  auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
  InstructionCost ScalarOpCost, VectorOpCost;
@ -317,10 +318,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
  } else {
    assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
           "Expected a compare");
-    ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
-                                          CmpInst::makeCmpResultType(ScalarTy));
-    VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
-                                          CmpInst::makeCmpResultType(VecTy));
+    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
+    ScalarOpCost = TTI.getCmpSelInstrCost(
+        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+    VectorOpCost = TTI.getCmpSelInstrCost(
+        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
  }

  // Get cost estimates for the extract elements. These costs will factor into
@ -495,8 +497,7 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
          m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));

  ExtractElementInst *ExtractToChange;
-  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
-                            InsertIndex))
+  if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
    return false;

  if (ExtractToChange) {
@ -640,8 +641,11 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
  unsigned Opcode = I.getOpcode();
  InstructionCost ScalarOpCost, VectorOpCost;
  if (IsCmp) {
-    ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
-    VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
+    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
+    ScalarOpCost = TTI.getCmpSelInstrCost(
+        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+    VectorOpCost = TTI.getCmpSelInstrCost(
+        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
  } else {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
@ -741,7 +745,10 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
  InstructionCost OldCost =
      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
  OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
-  OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
+  OldCost +=
+      TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
+                             CmpInst::makeCmpResultType(I0->getType()), Pred) *
+      2;
  OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());

  // The proposed vector pattern is:
@ -750,7 +757,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
  int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
  int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
-  InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
+  InstructionCost NewCost = TTI.getCmpSelInstrCost(
+      CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
  ShufMask[CheapIndex] = ExpensiveIndex;
  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
@ -1,15 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX

 define i1 @fcmp_and_v2f64(<2 x double> %a) {
-; CHECK-LABEL: @fcmp_and_v2f64(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @fcmp_and_v2f64(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SSE-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
+; SSE-NEXT:    [[R:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @fcmp_and_v2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A:%.*]], <double 4.200000e+01, double -8.000000e+00>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    ret i1 [[R]]
 ;
  %e1 = extractelement <2 x double> %a, i32 0
  %e2 = extractelement <2 x double> %a, i32 1
@ -20,13 +27,20 @@ define i1 @fcmp_and_v2f64(<2 x double> %a) {
 }

 define i1 @fcmp_or_v4f64(<4 x double> %a) {
-; CHECK-LABEL: @fcmp_or_v4f64(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
-; CHECK-NEXT:    [[R:%.*]] = or i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @fcmp_or_v4f64(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SSE-NEXT:    [[E2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
+; SSE-NEXT:    [[R:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @fcmp_or_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <4 x double> [[A:%.*]], <double 4.200000e+01, double undef, double -8.000000e+00, double undef>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    ret i1 [[R]]
 ;
  %e1 = extractelement <4 x double> %a, i32 0
  %e2 = extractelement <4 x double> %a, i64 2
@ -38,11 +52,10 @@ define i1 @fcmp_or_v4f64(<4 x double> %a) {

 define i1 @icmp_xor_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: @icmp_xor_v4i32(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 3
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[E1]], 42
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[E2]], -8
-; CHECK-NEXT:    [[R:%.*]] = xor i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[A:%.*]], <i32 undef, i32 -8, i32 undef, i32 42>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], [[SHIFT]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
  %e1 = extractelement <4 x i32> %a, i32 3
@ -56,13 +69,20 @@ define i1 @icmp_xor_v4i32(<4 x i32> %a) {
 ; add is not canonical (should be xor), but that is ok.

 define i1 @icmp_add_v8i32(<8 x i32> %a) {
-; CHECK-LABEL: @icmp_add_v8i32(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 7
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[E1]], 42
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[E2]], -8
-; CHECK-NEXT:    [[R:%.*]] = add i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @icmp_add_v8i32(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 7
+; SSE-NEXT:    [[E2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[E1]], 42
+; SSE-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[E2]], -8
+; SSE-NEXT:    [[R:%.*]] = add i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @icmp_add_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i32> [[A:%.*]], <i32 undef, i32 undef, i32 -8, i32 undef, i32 undef, i32 undef, i32 undef, i32 42>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <8 x i32> <i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = add <8 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2
+; AVX-NEXT:    ret i1 [[R]]
 ;
  %e1 = extractelement <8 x i32> %a, i32 7
  %e2 = extractelement <8 x i32> %a, i32 2
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
@ -1,30 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX

 define i1 @cmp_v4i32(<4 x float> %arg, <4 x float> %arg1) {
 ; CHECK-LABEL: @cmp_v4i32(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[T:%.*]] = bitcast <4 x float> [[ARG:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[T2:%.*]] = extractelement <4 x i32> [[T]], i32 0
 ; CHECK-NEXT:    [[T3:%.*]] = bitcast <4 x float> [[ARG1:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[T4:%.*]] = extractelement <4 x i32> [[T3]], i32 0
-; CHECK-NEXT:    [[T5:%.*]] = icmp eq i32 [[T2]], [[T4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
+; CHECK-NEXT:    [[T5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT:    br i1 [[T5]], label [[BB6:%.*]], label [[BB18:%.*]]
 ; CHECK:       bb6:
-; CHECK-NEXT:    [[T7:%.*]] = extractelement <4 x i32> [[T]], i32 1
-; CHECK-NEXT:    [[T8:%.*]] = extractelement <4 x i32> [[T3]], i32 1
-; CHECK-NEXT:    [[T9:%.*]] = icmp eq i32 [[T7]], [[T8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
+; CHECK-NEXT:    [[T9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[T9]], label [[BB10:%.*]], label [[BB18]]
 ; CHECK:       bb10:
-; CHECK-NEXT:    [[T11:%.*]] = extractelement <4 x i32> [[T]], i32 2
-; CHECK-NEXT:    [[T12:%.*]] = extractelement <4 x i32> [[T3]], i32 2
-; CHECK-NEXT:    [[T13:%.*]] = icmp eq i32 [[T11]], [[T12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
+; CHECK-NEXT:    [[T13:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
 ; CHECK-NEXT:    br i1 [[T13]], label [[BB14:%.*]], label [[BB18]]
 ; CHECK:       bb14:
-; CHECK-NEXT:    [[T15:%.*]] = extractelement <4 x i32> [[T]], i32 3
-; CHECK-NEXT:    [[T16:%.*]] = extractelement <4 x i32> [[T3]], i32 3
-; CHECK-NEXT:    [[T17:%.*]] = icmp eq i32 [[T15]], [[T16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
+; CHECK-NEXT:    [[T17:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
 ; CHECK-NEXT:    br label [[BB18]]
 ; CHECK:       bb18:
 ; CHECK-NEXT:    [[T19:%.*]] = phi i1 [ false, [[BB10]] ], [ false, [[BB6]] ], [ false, [[BB:%.*]] ], [ [[T17]], [[BB14]] ]
@ -62,19 +58,32 @@ bb18:
 }

 define i32 @cmp_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) {
-; CHECK-LABEL: @cmp_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]]
-; CHECK:       t:
-; CHECK-NEXT:    [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]]
-; CHECK-NEXT:    [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99
-; CHECK-NEXT:    ret i32 [[E]]
-; CHECK:       f:
-; CHECK-NEXT:    ret i32 0
+; SSE-LABEL: @cmp_v2f64(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; SSE-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]]
+; SSE-NEXT:    br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]]
+; SSE:       t:
+; SSE-NEXT:    [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]]
+; SSE-NEXT:    [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99
+; SSE-NEXT:    ret i32 [[E]]
+; SSE:       f:
+; SSE-NEXT:    ret i32 0
+;
+; AVX-LABEL: @cmp_v2f64(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]]
+; AVX-NEXT:    [[CMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; AVX-NEXT:    br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]]
+; AVX:       t:
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]]
+; AVX-NEXT:    [[CMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; AVX-NEXT:    [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99
+; AVX-NEXT:    ret i32 [[E]]
+; AVX:       f:
+; AVX-NEXT:    ret i32 0
 ;
 entry:
  %x1 = extractelement <2 x double> %x, i32 1
@ -93,11 +102,17 @@ f:
 }

 define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
-; CHECK-LABEL: @cmp01_v2f64(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
-; CHECK-NEXT:    ret i1 [[CMP]]
+; SSE-LABEL: @cmp01_v2f64(
+; SSE-NEXT:    [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
+; SSE-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
+; SSE-NEXT:    [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
+; SSE-NEXT:    ret i1 [[CMP]]
+;
+; AVX-LABEL: @cmp01_v2f64(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp oge <2 x double> [[X:%.*]], [[SHIFT]]
+; AVX-NEXT:    [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; AVX-NEXT:    ret i1 [[CMP]]
 ;
  %x0 = extractelement <2 x double> %x, i32 0
  %y1 = extractelement <2 x double> %y, i32 1
@ -106,11 +121,17 @@ define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
 }

 define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {
-; CHECK-LABEL: @cmp10_v2f64(
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
-; CHECK-NEXT:    ret i1 [[CMP]]
+; SSE-LABEL: @cmp10_v2f64(
+; SSE-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; SSE-NEXT:    [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
+; SSE-NEXT:    [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
+; SSE-NEXT:    ret i1 [[CMP]]
+;
+; AVX-LABEL: @cmp10_v2f64(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp ule <2 x double> [[SHIFT]], [[Y:%.*]]
+; AVX-NEXT:    [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i64 0
+; AVX-NEXT:    ret i1 [[CMP]]
 ;
  %x1 = extractelement <2 x double> %x, i32 1
  %y0 = extractelement <2 x double> %y, i32 0
@ -120,9 +141,9 @@ define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {

 define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @cmp12_v4i32(
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X1]], [[Y2]]
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT:    [[CMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
  %x1 = extractelement <4 x i32> %x, i32 1
@ -132,12 +153,19 @@ define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) {
 }

 define <4 x i1> @ins_fcmp_ext_ext(<4 x float> %a, <4 x i1> %b) {
-; CHECK-LABEL: @ins_fcmp_ext_ext(
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A21:%.*]] = fcmp ugt float [[A2]], [[A1]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2
-; CHECK-NEXT:    ret <4 x i1> [[R]]
+; SSE-LABEL: @ins_fcmp_ext_ext(
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; SSE-NEXT:    [[A21:%.*]] = fcmp ugt float [[A2]], [[A1]]
+; SSE-NEXT:    [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2
+; SSE-NEXT:    ret <4 x i1> [[R]]
+;
+; AVX-LABEL: @ins_fcmp_ext_ext(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp ugt <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[A21:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; AVX-NEXT:    [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2
+; AVX-NEXT:    ret <4 x i1> [[R]]
 ;
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
@ -132,8 +132,7 @@ define <4 x i1> @ins2_ins2_f32_uses(float %x, float %y) {
 ; CHECK-NEXT:    call void @usef(<4 x float> [[I0]])
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> poison, float [[Y:%.*]], i32 2
 ; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp oeq float [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> poison, i1 [[R_SCALAR]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = fcmp oeq <4 x float> [[I0]], [[I1]]
 ; CHECK-NEXT:    ret <4 x i1> [[R]]
 ;
  %i0 = insertelement <4 x float> poison, float %x, i32 2
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
@ -132,8 +132,7 @@ define <4 x i1> @ins2_ins2_f32_uses(float %x, float %y) {
 ; CHECK-NEXT:    call void @usef(<4 x float> [[I0]])
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 2
 ; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp oeq float [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = fcmp oeq <4 x float> [[I0]], [[I1]]
 ; CHECK-NEXT:    ret <4 x i1> [[R]]
 ;
  %i0 = insertelement <4 x float> undef, float %x, i32 2