[SLPVectorizer][X86][AMDGPU] Remove fcmp+select to fmin/fmax reduction support.

Previously we could match fcmp+select to a reduction if the fcmp had the nonans fast math flag. But if the select had the nonans fast math flag, InstCombine would turn it into a fminnum/fmaxnum intrinsic before SLP gets to it. Seems fairly likely that if one of the fcmp+select pair have the fast math flag, they both would. My plan is to start vectorizing the fmaxnum/fminnum version soon, but I wanted to get this code out as it had some of the strangest fast math flag behaviors.
2020-09-09 13:45:36 -07:00 · 2020-09-09 13:45:36 -07:00 · c195ae2f00
parent 00460ae520
commit c195ae2f00
5 changed files with 481 additions and 160 deletions
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -6256,9 +6256,9 @@ class HorizontalReduction {
  enum ReductionKind {
    RK_None,       /// Not a reduction.
    RK_Arithmetic, /// Binary reduction data.
-    RK_Min,        /// Minimum reduction data.
+    RK_SMin,       /// Signed minimum reduction data.
    RK_UMin,       /// Unsigned minimum reduction data.
-    RK_Max,        /// Maximum reduction data.
+    RK_SMax,       /// Signed maximum reduction data.
    RK_UMax,       /// Unsigned maximum reduction data.
  };

@ -6276,9 +6276,6 @@ class HorizontalReduction {
    /// Kind of the reduction operation.
    ReductionKind Kind = RK_None;

-    /// True if float point min/max reduction has no NaNs.
-    bool NoNaN = false;
-
    /// Checks if the reduction operation can be vectorized.
    bool isVectorizable() const {
      return LHS && RHS &&
@ -6288,10 +6285,9 @@ class HorizontalReduction {
                Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
                Opcode == Instruction::And || Opcode == Instruction::Or ||
                Opcode == Instruction::Xor)) ||
-              ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
-               (Kind == RK_Min || Kind == RK_Max)) ||
              (Opcode == Instruction::ICmp &&
-               (Kind == RK_UMin || Kind == RK_UMax)));
+               (Kind == RK_SMin || Kind == RK_SMax ||
+                Kind == RK_UMin || Kind == RK_UMax)));
    }

    /// Creates reduction operation with the current opcode.
@ -6303,13 +6299,13 @@ class HorizontalReduction {
      case RK_Arithmetic:
        return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
                                   Name);
-      case RK_Min:
-        Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
-                                          : Builder.CreateFCmpOLT(LHS, RHS);
+      case RK_SMin:
+        assert(Opcode == Instruction::ICmp && "Expected integer types.");
+        Cmp = Builder.CreateICmpSLT(LHS, RHS);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
-      case RK_Max:
-        Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
-                                          : Builder.CreateFCmpOGT(LHS, RHS);
+      case RK_SMax:
+        assert(Opcode == Instruction::ICmp && "Expected integer types.");
+        Cmp = Builder.CreateICmpSGT(LHS, RHS);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      case RK_UMin:
        assert(Opcode == Instruction::ICmp && "Expected integer types.");
@ -6337,9 +6333,8 @@ class HorizontalReduction {

    /// Constructor for reduction operations with opcode and its left and
    /// right operands.
-    OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
-                  bool NoNaN = false)
-        : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
+    OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind)
+        : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) {
      assert(Kind != RK_None && "One of the reduction operations is expected.");
    }

@ -6350,8 +6345,8 @@ class HorizontalReduction {
      switch (Kind) {
      case RK_Arithmetic:
        return false;
-      case RK_Min:
-      case RK_Max:
+      case RK_SMin:
+      case RK_SMax:
      case RK_UMin:
      case RK_UMax:
        return true;
@ -6433,10 +6428,8 @@ class HorizontalReduction {
      switch (Kind) {
      case RK_Arithmetic:
        return I->isAssociative();
-      case RK_Min:
-      case RK_Max:
-        return Opcode == Instruction::ICmp ||
-               cast<Instruction>(I->getOperand(0))->isFast();
+      case RK_SMin:
+      case RK_SMax:
      case RK_UMin:
      case RK_UMax:
        assert(Opcode == Instruction::ICmp &&
@ -6466,7 +6459,6 @@ class HorizontalReduction {
      LHS = nullptr;
      RHS = nullptr;
      Kind = RK_None;
-      NoNaN = false;
    }

    /// Get the opcode of the reduction operation.
@ -6494,8 +6486,8 @@ class HorizontalReduction {
      case RK_Arithmetic:
        propagateIRFlags(Op, ReductionOps[0]);
        return Op;
-      case RK_Min:
-      case RK_Max:
+      case RK_SMin:
+      case RK_SMax:
      case RK_UMin:
      case RK_UMax:
        if (auto *SI = dyn_cast<SelectInst>(Op))
@ -6518,8 +6510,8 @@ class HorizontalReduction {
      case RK_Arithmetic:
        propagateIRFlags(Op, I);
        return Op;
-      case RK_Min:
-      case RK_Max:
+      case RK_SMin:
+      case RK_SMax:
      case RK_UMin:
      case RK_UMax:
        if (auto *SI = dyn_cast<SelectInst>(Op)) {
@ -6536,16 +6528,15 @@ class HorizontalReduction {

    TargetTransformInfo::ReductionFlags getFlags() const {
      TargetTransformInfo::ReductionFlags Flags;
-      Flags.NoNaN = NoNaN;
      switch (Kind) {
      case RK_Arithmetic:
        break;
-      case RK_Min:
-        Flags.IsSigned = Opcode == Instruction::ICmp;
+      case RK_SMin:
+        Flags.IsSigned = true;
        Flags.IsMaxOp = false;
        break;
-      case RK_Max:
-        Flags.IsSigned = Opcode == Instruction::ICmp;
+      case RK_SMax:
+        Flags.IsSigned = true;
        Flags.IsMaxOp = true;
        break;
      case RK_UMin:
@ -6610,21 +6601,11 @@ class HorizontalReduction {
      if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
        return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
      } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
-      } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
-                 m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(
-            Instruction::FCmp, LHS, RHS, RK_Min,
-            cast<Instruction>(Select->getCondition())->hasNoNaNs());
+        return OperationData(Instruction::ICmp, LHS, RHS, RK_SMin);
      } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
        return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
      } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
-      } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
-                 m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
-        return OperationData(
-            Instruction::FCmp, LHS, RHS, RK_Max,
-            cast<Instruction>(Select->getCondition())->hasNoNaNs());
+        return OperationData(Instruction::ICmp, LHS, RHS, RK_SMax);
      } else {
        // Try harder: look for min/max pattern based on instructions producing
        // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
@ -6672,14 +6653,7 @@ class HorizontalReduction {

        case CmpInst::ICMP_SLT:
        case CmpInst::ICMP_SLE:
-          return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
-
-        case CmpInst::FCMP_OLT:
-        case CmpInst::FCMP_OLE:
-        case CmpInst::FCMP_ULT:
-        case CmpInst::FCMP_ULE:
-          return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
-                               cast<Instruction>(Cond)->hasNoNaNs());
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_SMin);

        case CmpInst::ICMP_UGT:
        case CmpInst::ICMP_UGE:
@ -6687,14 +6661,7 @@ class HorizontalReduction {

        case CmpInst::ICMP_SGT:
        case CmpInst::ICMP_SGE:
-          return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
-
-        case CmpInst::FCMP_OGT:
-        case CmpInst::FCMP_OGE:
-        case CmpInst::FCMP_UGT:
-        case CmpInst::FCMP_UGE:
-          return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
-                               cast<Instruction>(Cond)->hasNoNaNs());
+          return OperationData(Instruction::ICmp, LHS, RHS, RK_SMax);
        }
      }
    }
@ -7017,8 +6984,8 @@ private:
          TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
                                          /*IsPairwiseForm=*/false);
      break;
-    case RK_Min:
-    case RK_Max:
+    case RK_SMin:
+    case RK_SMax:
    case RK_UMin:
    case RK_UMax: {
      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
@ -7045,8 +7012,8 @@ private:
      ScalarReduxCost =
          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
      break;
-    case RK_Min:
-    case RK_Max:
+    case RK_SMin:
+    case RK_SMax:
    case RK_UMin:
    case RK_UMax:
      ScalarReduxCost =
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
@ -107,6 +107,8 @@ define i64 @sminv6() {
  ret i64 %select5
 }

+; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
+; with fastmath on the select.
 define float @fmaxv6() {
 ; GFX9-LABEL: @fmaxv6(
 ; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16
@ -114,19 +116,21 @@ define float @fmaxv6() {
 ; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
 ; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
-; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8
-; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]]
-; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]]
-; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
-; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT:    [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]]
-; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]]
+; GFX9-NEXT:    [[LOAD3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
+; GFX9-NEXT:    [[CMP2:%.*]] = fcmp fast ogt float [[SELECT1]], [[LOAD3]]
+; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], float [[SELECT1]], float [[LOAD3]]
+; GFX9-NEXT:    [[LOAD4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
+; GFX9-NEXT:    [[CMP3:%.*]] = fcmp fast ogt float [[SELECT2]], [[LOAD4]]
+; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], float [[SELECT2]], float [[LOAD4]]
+; GFX9-NEXT:    [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
+; GFX9-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[SELECT3]], [[LOAD5]]
+; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], float [[SELECT3]], float [[LOAD5]]
+; GFX9-NEXT:    [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
+; GFX9-NEXT:    [[CMP5:%.*]] = fcmp fast ogt float [[SELECT4]], [[LOAD6]]
+; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], float [[SELECT4]], float [[LOAD6]]
 ; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
 ; GFX9-NEXT:    store float [[STORE_SELECT]], float* @fvar, align 8
-; GFX9-NEXT:    ret float [[OP_EXTRA]]
+; GFX9-NEXT:    ret float [[SELECT5]]
 ;
  %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
  %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
@ -154,6 +158,8 @@ define float @fmaxv6() {
  ret float %select5
 }

+; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
+; with fastmath on the select.
 define double @dminv6() {
 ; GFX9-LABEL: @dminv6(
 ; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16
@ -161,19 +167,21 @@ define double @dminv6() {
 ; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 ; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
-; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8
-; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]]
-; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]]
-; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]]
-; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT:    [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]]
-; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]]
+; GFX9-NEXT:    [[LOAD3:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
+; GFX9-NEXT:    [[CMP2:%.*]] = fcmp fast olt double [[SELECT1]], [[LOAD3]]
+; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], double [[SELECT1]], double [[LOAD3]]
+; GFX9-NEXT:    [[LOAD4:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
+; GFX9-NEXT:    [[CMP3:%.*]] = fcmp fast olt double [[SELECT2]], [[LOAD4]]
+; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], double [[SELECT2]], double [[LOAD4]]
+; GFX9-NEXT:    [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
+; GFX9-NEXT:    [[CMP4:%.*]] = fcmp fast olt double [[SELECT3]], [[LOAD5]]
+; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], double [[SELECT3]], double [[LOAD5]]
+; GFX9-NEXT:    [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
+; GFX9-NEXT:    [[CMP5:%.*]] = fcmp fast olt double [[SELECT4]], [[LOAD6]]
+; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], double [[SELECT4]], double [[LOAD6]]
 ; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
 ; GFX9-NEXT:    store double [[STORE_SELECT]], double* @dvar, align 8
-; GFX9-NEXT:    ret double [[OP_EXTRA]]
+; GFX9-NEXT:    ret double [[SELECT5]]
 ;
  %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
  %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@ -611,31 +611,22 @@ entry:
  ret i16 %max3
 }

+; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
+; with fastmath on the select.
 define half @reduction_fmax_v4half(<4 x half> %vec4) {
-; GFX9-LABEL: @reduction_fmax_v4half(
-; GFX9-NEXT:  entry:
-; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x half> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]
-; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]
-; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT:    ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_fmax_v4half(
-; VI-NEXT:  entry:
-; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
-; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
-; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
-; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
-; VI-NEXT:    [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]]
-; VI-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
-; VI-NEXT:    [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]]
-; VI-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]]
-; VI-NEXT:    [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]]
-; VI-NEXT:    [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]]
-; VI-NEXT:    ret half [[MAX3]]
+; GCN-LABEL: @reduction_fmax_v4half(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
+; GCN-NEXT:    [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
+; GCN-NEXT:    [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]]
+; GCN-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]]
+; GCN-NEXT:    [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]]
+; GCN-NEXT:    [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]]
+; GCN-NEXT:    ret half [[MAX3]]
 ;
 entry:
  %elt0 = extractelement <4 x half> %vec4, i64 0
@ -653,31 +644,22 @@ entry:
  ret half %max3
 }

+; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
+; with fastmath on the select.
 define half @reduction_fmin_v4half(<4 x half> %vec4) {
-; GFX9-LABEL: @reduction_fmin_v4half(
-; GFX9-NEXT:  entry:
-; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x half> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]
-; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]
-; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0
-; GFX9-NEXT:    ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_fmin_v4half(
-; VI-NEXT:  entry:
-; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
-; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
-; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
-; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
-; VI-NEXT:    [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]]
-; VI-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
-; VI-NEXT:    [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]]
-; VI-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]]
-; VI-NEXT:    [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]]
-; VI-NEXT:    [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]]
-; VI-NEXT:    ret half [[MIN3]]
+; GCN-LABEL: @reduction_fmin_v4half(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
+; GCN-NEXT:    [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
+; GCN-NEXT:    [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]]
+; GCN-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]]
+; GCN-NEXT:    [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]]
+; GCN-NEXT:    [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]]
+; GCN-NEXT:    ret half [[MIN3]]
 ;
 entry:
  %elt0 = extractelement <4 x half> %vec4, i64 0
@ -719,4 +701,4 @@ entry:
  %add3 = fadd fast float %elt3, %add2

  ret float %add3
-}
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@ -266,24 +266,52 @@ entry:
  ret i32 %conv4
 }

+; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
+; with fastmath on the select.
 define float @bar() {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])
-; CHECK-NEXT:    store float [[TMP3]], float* @res, align 4
-; CHECK-NEXT:    ret float [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+; CHECK-NEXT:    [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
+; CHECK-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
+; CHECK-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
+; CHECK-NEXT:    store float [[MAX_0_MUL3_2]], float* @res, align 4
+; CHECK-NEXT:    ret float [[MAX_0_MUL3_2]]
 ;
 ; THRESHOLD-LABEL: @bar(
 ; THRESHOLD-NEXT:  entry:
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])
-; THRESHOLD-NEXT:    store float [[TMP3]], float* @res, align 4
-; THRESHOLD-NEXT:    ret float [[TMP3]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESHOLD-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+; THRESHOLD-NEXT:    [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
+; THRESHOLD-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+; THRESHOLD-NEXT:    [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
+; THRESHOLD-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
+; THRESHOLD-NEXT:    store float [[MAX_0_MUL3_2]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[MAX_0_MUL3_2]]
 ;
 entry:
  %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@ -198,11 +198,59 @@ define i32 @maxi32(i32) {
  ret i32 %95
 }

+; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
+; with fastmath on the select.
 define float @maxf8(float) {
-; CHECK-LABEL: @maxf8(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> [[TMP2]])
-; CHECK-NEXT:    ret float [[TMP3]]
+; DEFAULT-LABEL: @maxf8(
+; DEFAULT-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; DEFAULT-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; DEFAULT-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; DEFAULT-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; DEFAULT-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; DEFAULT-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; DEFAULT-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; DEFAULT-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; DEFAULT-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; DEFAULT-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; DEFAULT-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; DEFAULT-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; DEFAULT-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; DEFAULT-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; DEFAULT-NEXT:    ret float [[TMP23]]
+;
+; THRESH-LABEL: @maxf8(
+; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16
+; THRESH-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESH-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
+; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]]
+; THRESH-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; THRESH-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
+; THRESH-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]]
+; THRESH-NEXT:    [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; THRESH-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
+; THRESH-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float [[TMP10]]
+; THRESH-NEXT:    [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; THRESH-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
+; THRESH-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]]
+; THRESH-NEXT:    [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; THRESH-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
+; THRESH-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]]
+; THRESH-NEXT:    [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; THRESH-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
+; THRESH-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]]
+; THRESH-NEXT:    [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; THRESH-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
+; THRESH-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float [[TMP22]]
+; THRESH-NEXT:    ret float [[TMP24]]
 ;
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
@ -229,11 +277,107 @@ define float @maxf8(float) {
  ret float %23
 }

+; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
+; with fastmath on the select.
 define float @maxf16(float) {
-; CHECK-LABEL: @maxf16(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> [[TMP2]])
-; CHECK-NEXT:    ret float [[TMP3]]
+; DEFAULT-LABEL: @maxf16(
+; DEFAULT-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; DEFAULT-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; DEFAULT-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; DEFAULT-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; DEFAULT-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; DEFAULT-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; DEFAULT-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; DEFAULT-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; DEFAULT-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; DEFAULT-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; DEFAULT-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; DEFAULT-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; DEFAULT-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; DEFAULT-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; DEFAULT-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; DEFAULT-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; DEFAULT-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; DEFAULT-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; DEFAULT-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; DEFAULT-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; DEFAULT-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; DEFAULT-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; DEFAULT-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; DEFAULT-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; DEFAULT-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; DEFAULT-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; DEFAULT-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; DEFAULT-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; DEFAULT-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; DEFAULT-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; DEFAULT-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; DEFAULT-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; DEFAULT-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; DEFAULT-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; DEFAULT-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; DEFAULT-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; DEFAULT-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; DEFAULT-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; DEFAULT-NEXT:    ret float [[TMP47]]
+;
+; THRESH-LABEL: @maxf16(
+; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16
+; THRESH-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESH-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
+; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]]
+; THRESH-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; THRESH-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
+; THRESH-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]]
+; THRESH-NEXT:    [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; THRESH-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
+; THRESH-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float [[TMP10]]
+; THRESH-NEXT:    [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; THRESH-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
+; THRESH-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]]
+; THRESH-NEXT:    [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; THRESH-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
+; THRESH-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]]
+; THRESH-NEXT:    [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; THRESH-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
+; THRESH-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]]
+; THRESH-NEXT:    [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; THRESH-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
+; THRESH-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float [[TMP22]]
+; THRESH-NEXT:    [[TMP25:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; THRESH-NEXT:    [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
+; THRESH-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float [[TMP25]]
+; THRESH-NEXT:    [[TMP28:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; THRESH-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
+; THRESH-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float [[TMP28]]
+; THRESH-NEXT:    [[TMP31:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; THRESH-NEXT:    [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
+; THRESH-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float [[TMP31]]
+; THRESH-NEXT:    [[TMP34:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; THRESH-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
+; THRESH-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float [[TMP34]]
+; THRESH-NEXT:    [[TMP37:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; THRESH-NEXT:    [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
+; THRESH-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float [[TMP37]]
+; THRESH-NEXT:    [[TMP40:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; THRESH-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
+; THRESH-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float [[TMP40]]
+; THRESH-NEXT:    [[TMP43:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; THRESH-NEXT:    [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
+; THRESH-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float [[TMP43]]
+; THRESH-NEXT:    [[TMP46:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; THRESH-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
+; THRESH-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float [[TMP46]]
+; THRESH-NEXT:    ret float [[TMP48]]
 ;
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
@ -284,11 +428,203 @@ define float @maxf16(float) {
  ret float %47
 }

+; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
+; with fastmath on the select.
 define float @maxf32(float) {
-; CHECK-LABEL: @maxf32(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> [[TMP2]])
-; CHECK-NEXT:    ret float [[TMP3]]
+; DEFAULT-LABEL: @maxf32(
+; DEFAULT-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+; DEFAULT-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+; DEFAULT-NEXT:    [[TMP4:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], float [[TMP2]], float [[TMP3]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; DEFAULT-NEXT:    [[TMP7:%.*]] = fcmp fast ogt float [[TMP5]], [[TMP6]]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; DEFAULT-NEXT:    [[TMP10:%.*]] = fcmp fast ogt float [[TMP8]], [[TMP9]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP8]], float [[TMP9]]
+; DEFAULT-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; DEFAULT-NEXT:    [[TMP13:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]]
+; DEFAULT-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP11]], float [[TMP12]]
+; DEFAULT-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; DEFAULT-NEXT:    [[TMP16:%.*]] = fcmp fast ogt float [[TMP14]], [[TMP15]]
+; DEFAULT-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], float [[TMP14]], float [[TMP15]]
+; DEFAULT-NEXT:    [[TMP18:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; DEFAULT-NEXT:    [[TMP19:%.*]] = fcmp fast ogt float [[TMP17]], [[TMP18]]
+; DEFAULT-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP17]], float [[TMP18]]
+; DEFAULT-NEXT:    [[TMP21:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; DEFAULT-NEXT:    [[TMP22:%.*]] = fcmp fast ogt float [[TMP20]], [[TMP21]]
+; DEFAULT-NEXT:    [[TMP23:%.*]] = select i1 [[TMP22]], float [[TMP20]], float [[TMP21]]
+; DEFAULT-NEXT:    [[TMP24:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; DEFAULT-NEXT:    [[TMP25:%.*]] = fcmp fast ogt float [[TMP23]], [[TMP24]]
+; DEFAULT-NEXT:    [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP23]], float [[TMP24]]
+; DEFAULT-NEXT:    [[TMP27:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; DEFAULT-NEXT:    [[TMP28:%.*]] = fcmp fast ogt float [[TMP26]], [[TMP27]]
+; DEFAULT-NEXT:    [[TMP29:%.*]] = select i1 [[TMP28]], float [[TMP26]], float [[TMP27]]
+; DEFAULT-NEXT:    [[TMP30:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; DEFAULT-NEXT:    [[TMP31:%.*]] = fcmp fast ogt float [[TMP29]], [[TMP30]]
+; DEFAULT-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP29]], float [[TMP30]]
+; DEFAULT-NEXT:    [[TMP33:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; DEFAULT-NEXT:    [[TMP34:%.*]] = fcmp fast ogt float [[TMP32]], [[TMP33]]
+; DEFAULT-NEXT:    [[TMP35:%.*]] = select i1 [[TMP34]], float [[TMP32]], float [[TMP33]]
+; DEFAULT-NEXT:    [[TMP36:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; DEFAULT-NEXT:    [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], [[TMP36]]
+; DEFAULT-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP35]], float [[TMP36]]
+; DEFAULT-NEXT:    [[TMP39:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; DEFAULT-NEXT:    [[TMP40:%.*]] = fcmp fast ogt float [[TMP38]], [[TMP39]]
+; DEFAULT-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], float [[TMP38]], float [[TMP39]]
+; DEFAULT-NEXT:    [[TMP42:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; DEFAULT-NEXT:    [[TMP43:%.*]] = fcmp fast ogt float [[TMP41]], [[TMP42]]
+; DEFAULT-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP41]], float [[TMP42]]
+; DEFAULT-NEXT:    [[TMP45:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; DEFAULT-NEXT:    [[TMP46:%.*]] = fcmp fast ogt float [[TMP44]], [[TMP45]]
+; DEFAULT-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], float [[TMP44]], float [[TMP45]]
+; DEFAULT-NEXT:    [[TMP48:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
+; DEFAULT-NEXT:    [[TMP49:%.*]] = fcmp fast ogt float [[TMP47]], [[TMP48]]
+; DEFAULT-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP47]], float [[TMP48]]
+; DEFAULT-NEXT:    [[TMP51:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
+; DEFAULT-NEXT:    [[TMP52:%.*]] = fcmp fast ogt float [[TMP50]], [[TMP51]]
+; DEFAULT-NEXT:    [[TMP53:%.*]] = select i1 [[TMP52]], float [[TMP50]], float [[TMP51]]
+; DEFAULT-NEXT:    [[TMP54:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
+; DEFAULT-NEXT:    [[TMP55:%.*]] = fcmp fast ogt float [[TMP53]], [[TMP54]]
+; DEFAULT-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP53]], float [[TMP54]]
+; DEFAULT-NEXT:    [[TMP57:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
+; DEFAULT-NEXT:    [[TMP58:%.*]] = fcmp fast ogt float [[TMP56]], [[TMP57]]
+; DEFAULT-NEXT:    [[TMP59:%.*]] = select i1 [[TMP58]], float [[TMP56]], float [[TMP57]]
+; DEFAULT-NEXT:    [[TMP60:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
+; DEFAULT-NEXT:    [[TMP61:%.*]] = fcmp fast ogt float [[TMP59]], [[TMP60]]
+; DEFAULT-NEXT:    [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP59]], float [[TMP60]]
+; DEFAULT-NEXT:    [[TMP63:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
+; DEFAULT-NEXT:    [[TMP64:%.*]] = fcmp fast ogt float [[TMP62]], [[TMP63]]
+; DEFAULT-NEXT:    [[TMP65:%.*]] = select i1 [[TMP64]], float [[TMP62]], float [[TMP63]]
+; DEFAULT-NEXT:    [[TMP66:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
+; DEFAULT-NEXT:    [[TMP67:%.*]] = fcmp fast ogt float [[TMP65]], [[TMP66]]
+; DEFAULT-NEXT:    [[TMP68:%.*]] = select i1 [[TMP67]], float [[TMP65]], float [[TMP66]]
+; DEFAULT-NEXT:    [[TMP69:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
+; DEFAULT-NEXT:    [[TMP70:%.*]] = fcmp fast ogt float [[TMP68]], [[TMP69]]
+; DEFAULT-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], float [[TMP68]], float [[TMP69]]
+; DEFAULT-NEXT:    [[TMP72:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
+; DEFAULT-NEXT:    [[TMP73:%.*]] = fcmp fast ogt float [[TMP71]], [[TMP72]]
+; DEFAULT-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], float [[TMP71]], float [[TMP72]]
+; DEFAULT-NEXT:    [[TMP75:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
+; DEFAULT-NEXT:    [[TMP76:%.*]] = fcmp fast ogt float [[TMP74]], [[TMP75]]
+; DEFAULT-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], float [[TMP74]], float [[TMP75]]
+; DEFAULT-NEXT:    [[TMP78:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
+; DEFAULT-NEXT:    [[TMP79:%.*]] = fcmp fast ogt float [[TMP77]], [[TMP78]]
+; DEFAULT-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], float [[TMP77]], float [[TMP78]]
+; DEFAULT-NEXT:    [[TMP81:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
+; DEFAULT-NEXT:    [[TMP82:%.*]] = fcmp fast ogt float [[TMP80]], [[TMP81]]
+; DEFAULT-NEXT:    [[TMP83:%.*]] = select i1 [[TMP82]], float [[TMP80]], float [[TMP81]]
+; DEFAULT-NEXT:    [[TMP84:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
+; DEFAULT-NEXT:    [[TMP85:%.*]] = fcmp fast ogt float [[TMP83]], [[TMP84]]
+; DEFAULT-NEXT:    [[TMP86:%.*]] = select i1 [[TMP85]], float [[TMP83]], float [[TMP84]]
+; DEFAULT-NEXT:    [[TMP87:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
+; DEFAULT-NEXT:    [[TMP88:%.*]] = fcmp fast ogt float [[TMP86]], [[TMP87]]
+; DEFAULT-NEXT:    [[TMP89:%.*]] = select i1 [[TMP88]], float [[TMP86]], float [[TMP87]]
+; DEFAULT-NEXT:    [[TMP90:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
+; DEFAULT-NEXT:    [[TMP91:%.*]] = fcmp fast ogt float [[TMP89]], [[TMP90]]
+; DEFAULT-NEXT:    [[TMP92:%.*]] = select i1 [[TMP91]], float [[TMP89]], float [[TMP90]]
+; DEFAULT-NEXT:    [[TMP93:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
+; DEFAULT-NEXT:    [[TMP94:%.*]] = fcmp fast ogt float [[TMP92]], [[TMP93]]
+; DEFAULT-NEXT:    [[TMP95:%.*]] = select i1 [[TMP94]], float [[TMP92]], float [[TMP93]]
+; DEFAULT-NEXT:    ret float [[TMP95]]
+;
+; THRESH-LABEL: @maxf32(
+; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @arr1 to <2 x float>*), align 16
+; THRESH-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESH-NEXT:    [[TMP5:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
+; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP4]]
+; THRESH-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+; THRESH-NEXT:    [[TMP8:%.*]] = fcmp fast ogt float [[TMP6]], [[TMP7]]
+; THRESH-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]]
+; THRESH-NEXT:    [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+; THRESH-NEXT:    [[TMP11:%.*]] = fcmp fast ogt float [[TMP9]], [[TMP10]]
+; THRESH-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP9]], float [[TMP10]]
+; THRESH-NEXT:    [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+; THRESH-NEXT:    [[TMP14:%.*]] = fcmp fast ogt float [[TMP12]], [[TMP13]]
+; THRESH-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], float [[TMP12]], float [[TMP13]]
+; THRESH-NEXT:    [[TMP16:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+; THRESH-NEXT:    [[TMP17:%.*]] = fcmp fast ogt float [[TMP15]], [[TMP16]]
+; THRESH-NEXT:    [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP15]], float [[TMP16]]
+; THRESH-NEXT:    [[TMP19:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+; THRESH-NEXT:    [[TMP20:%.*]] = fcmp fast ogt float [[TMP18]], [[TMP19]]
+; THRESH-NEXT:    [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP18]], float [[TMP19]]
+; THRESH-NEXT:    [[TMP22:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+; THRESH-NEXT:    [[TMP23:%.*]] = fcmp fast ogt float [[TMP21]], [[TMP22]]
+; THRESH-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP21]], float [[TMP22]]
+; THRESH-NEXT:    [[TMP25:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+; THRESH-NEXT:    [[TMP26:%.*]] = fcmp fast ogt float [[TMP24]], [[TMP25]]
+; THRESH-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], float [[TMP24]], float [[TMP25]]
+; THRESH-NEXT:    [[TMP28:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+; THRESH-NEXT:    [[TMP29:%.*]] = fcmp fast ogt float [[TMP27]], [[TMP28]]
+; THRESH-NEXT:    [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP27]], float [[TMP28]]
+; THRESH-NEXT:    [[TMP31:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+; THRESH-NEXT:    [[TMP32:%.*]] = fcmp fast ogt float [[TMP30]], [[TMP31]]
+; THRESH-NEXT:    [[TMP33:%.*]] = select i1 [[TMP32]], float [[TMP30]], float [[TMP31]]
+; THRESH-NEXT:    [[TMP34:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+; THRESH-NEXT:    [[TMP35:%.*]] = fcmp fast ogt float [[TMP33]], [[TMP34]]
+; THRESH-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP33]], float [[TMP34]]
+; THRESH-NEXT:    [[TMP37:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+; THRESH-NEXT:    [[TMP38:%.*]] = fcmp fast ogt float [[TMP36]], [[TMP37]]
+; THRESH-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], float [[TMP36]], float [[TMP37]]
+; THRESH-NEXT:    [[TMP40:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+; THRESH-NEXT:    [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], [[TMP40]]
+; THRESH-NEXT:    [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP39]], float [[TMP40]]
+; THRESH-NEXT:    [[TMP43:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+; THRESH-NEXT:    [[TMP44:%.*]] = fcmp fast ogt float [[TMP42]], [[TMP43]]
+; THRESH-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], float [[TMP42]], float [[TMP43]]
+; THRESH-NEXT:    [[TMP46:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+; THRESH-NEXT:    [[TMP47:%.*]] = fcmp fast ogt float [[TMP45]], [[TMP46]]
+; THRESH-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP45]], float [[TMP46]]
+; THRESH-NEXT:    [[TMP49:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
+; THRESH-NEXT:    [[TMP50:%.*]] = fcmp fast ogt float [[TMP48]], [[TMP49]]
+; THRESH-NEXT:    [[TMP51:%.*]] = select i1 [[TMP50]], float [[TMP48]], float [[TMP49]]
+; THRESH-NEXT:    [[TMP52:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
+; THRESH-NEXT:    [[TMP53:%.*]] = fcmp fast ogt float [[TMP51]], [[TMP52]]
+; THRESH-NEXT:    [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP51]], float [[TMP52]]
+; THRESH-NEXT:    [[TMP55:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
+; THRESH-NEXT:    [[TMP56:%.*]] = fcmp fast ogt float [[TMP54]], [[TMP55]]
+; THRESH-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], float [[TMP54]], float [[TMP55]]
+; THRESH-NEXT:    [[TMP58:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
+; THRESH-NEXT:    [[TMP59:%.*]] = fcmp fast ogt float [[TMP57]], [[TMP58]]
+; THRESH-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP57]], float [[TMP58]]
+; THRESH-NEXT:    [[TMP61:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
+; THRESH-NEXT:    [[TMP62:%.*]] = fcmp fast ogt float [[TMP60]], [[TMP61]]
+; THRESH-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], float [[TMP60]], float [[TMP61]]
+; THRESH-NEXT:    [[TMP64:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
+; THRESH-NEXT:    [[TMP65:%.*]] = fcmp fast ogt float [[TMP63]], [[TMP64]]
+; THRESH-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], float [[TMP63]], float [[TMP64]]
+; THRESH-NEXT:    [[TMP67:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
+; THRESH-NEXT:    [[TMP68:%.*]] = fcmp fast ogt float [[TMP66]], [[TMP67]]
+; THRESH-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], float [[TMP66]], float [[TMP67]]
+; THRESH-NEXT:    [[TMP70:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
+; THRESH-NEXT:    [[TMP71:%.*]] = fcmp fast ogt float [[TMP69]], [[TMP70]]
+; THRESH-NEXT:    [[TMP72:%.*]] = select i1 [[TMP71]], float [[TMP69]], float [[TMP70]]
+; THRESH-NEXT:    [[TMP73:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
+; THRESH-NEXT:    [[TMP74:%.*]] = fcmp fast ogt float [[TMP72]], [[TMP73]]
+; THRESH-NEXT:    [[TMP75:%.*]] = select i1 [[TMP74]], float [[TMP72]], float [[TMP73]]
+; THRESH-NEXT:    [[TMP76:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
+; THRESH-NEXT:    [[TMP77:%.*]] = fcmp fast ogt float [[TMP75]], [[TMP76]]
+; THRESH-NEXT:    [[TMP78:%.*]] = select i1 [[TMP77]], float [[TMP75]], float [[TMP76]]
+; THRESH-NEXT:    [[TMP79:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
+; THRESH-NEXT:    [[TMP80:%.*]] = fcmp fast ogt float [[TMP78]], [[TMP79]]
+; THRESH-NEXT:    [[TMP81:%.*]] = select i1 [[TMP80]], float [[TMP78]], float [[TMP79]]
+; THRESH-NEXT:    [[TMP82:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
+; THRESH-NEXT:    [[TMP83:%.*]] = fcmp fast ogt float [[TMP81]], [[TMP82]]
+; THRESH-NEXT:    [[TMP84:%.*]] = select i1 [[TMP83]], float [[TMP81]], float [[TMP82]]
+; THRESH-NEXT:    [[TMP85:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
+; THRESH-NEXT:    [[TMP86:%.*]] = fcmp fast ogt float [[TMP84]], [[TMP85]]
+; THRESH-NEXT:    [[TMP87:%.*]] = select i1 [[TMP86]], float [[TMP84]], float [[TMP85]]
+; THRESH-NEXT:    [[TMP88:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
+; THRESH-NEXT:    [[TMP89:%.*]] = fcmp fast ogt float [[TMP87]], [[TMP88]]
+; THRESH-NEXT:    [[TMP90:%.*]] = select i1 [[TMP89]], float [[TMP87]], float [[TMP88]]
+; THRESH-NEXT:    [[TMP91:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
+; THRESH-NEXT:    [[TMP92:%.*]] = fcmp fast ogt float [[TMP90]], [[TMP91]]
+; THRESH-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], float [[TMP90]], float [[TMP91]]
+; THRESH-NEXT:    [[TMP94:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
+; THRESH-NEXT:    [[TMP95:%.*]] = fcmp fast ogt float [[TMP93]], [[TMP94]]
+; THRESH-NEXT:    [[TMP96:%.*]] = select i1 [[TMP95]], float [[TMP93]], float [[TMP94]]
+; THRESH-NEXT:    ret float [[TMP96]]
 ;
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4