[VectorCombine] position pass after SLP in the optimization pipeline rather than before

There are 2 known problem patterns shown in the test diffs here: vector horizontal ops (an x86 specialization) and vector reductions. SLP has greater ability to match and fold those than vector-combine, so let SLP have first chance at that. This is a quick fix while we continue to improve vector-combine and possibly canonicalize to reduction intrinsics. In the longer term, we should improve matching of these patterns because if they were created in the "bad" forms shown here, then we would miss optimizing them. I'm not sure what is happening with alias analysis on the addsub test. The old pass manager now shows an extra line for that, and we see an improvement that comes from SLP vectorizing a store. I don't know what's missing with the new pass manager to make that happen. Strangely, I can't reproduce the behavior if I compile from C++ with clang and invoke the new PM with "-fexperimental-new-pass-manager". Differential Revision: https://reviews.llvm.org/D80236
2020-05-22 12:13:18 -04:00 · 2020-05-22 12:13:18 -04:00 · 6438ea45e0
parent 22ed724975
commit 6438ea45e0
14 changed files with 79 additions and 84 deletions
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@ -986,10 +986,6 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
  OptimizePM.addPass(LoopVectorizePass(
      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
  // Enhance/cleanup vector code.
  OptimizePM.addPass(VectorCombinePass());
  OptimizePM.addPass(EarlyCSEPass());
  // Eliminate loads by forwarding stores from the previous iteration to loads
  // of the current iteration.
  OptimizePM.addPass(LoopLoadEliminationPass());
@ -1016,6 +1012,9 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
  if (PTO.SLPVectorization)
    OptimizePM.addPass(SLPVectorizerPass());
  // Enhance/cleanup vector code.
  OptimizePM.addPass(VectorCombinePass());
  OptimizePM.addPass(EarlyCSEPass());
  OptimizePM.addPass(InstCombinePass());
  // Unroll small loops to hide loop backedge latency and saturate any parallel
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@ -741,8 +741,6 @@ void PassManagerBuilder::populateModulePassManager(
  MPM.add(createLoopDistributePass());
  MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
  MPM.add(createVectorCombinePass());
  MPM.add(createEarlyCSEPass());
  // Eliminate loads by forwarding stores from the previous iteration to loads
  // of the current iteration.
@ -783,6 +781,10 @@ void PassManagerBuilder::populateModulePassManager(
    }
  }
  // Enhance/cleanup vector code.
  MPM.add(createVectorCombinePass());
  MPM.add(createEarlyCSEPass());
  addExtensionsToPM(EP_Peephole, MPM);
  MPM.add(createInstructionCombiningPass());
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@ -230,8 +230,6 @@
 ; GCN-O1-NEXT:       Optimization Remark Emitter
 ; GCN-O1-NEXT:       Inject TLI Mappings
 ; GCN-O1-NEXT:       Loop Vectorization
 ; GCN-O1-NEXT:       Optimize scalar/vector ops
 ; GCN-O1-NEXT:       Early CSE
 ; GCN-O1-NEXT:       Canonicalize natural loops
 ; GCN-O1-NEXT:       Scalar Evolution Analysis
 ; GCN-O1-NEXT:       Function Alias Analysis Results
@ -247,6 +245,8 @@
 ; GCN-O1-NEXT:       Combine redundant instructions
 ; GCN-O1-NEXT:       Simplify the CFG
 ; GCN-O1-NEXT:       Dominator Tree Construction
 ; GCN-O1-NEXT:       Optimize scalar/vector ops
 ; GCN-O1-NEXT:       Early CSE
 ; GCN-O1-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:       Function Alias Analysis Results
 ; GCN-O1-NEXT:       Natural Loop Information
@ -571,8 +571,6 @@
 ; GCN-O2-NEXT:       Optimization Remark Emitter
 ; GCN-O2-NEXT:       Inject TLI Mappings
 ; GCN-O2-NEXT:       Loop Vectorization
 ; GCN-O2-NEXT:       Optimize scalar/vector ops
 ; GCN-O2-NEXT:       Early CSE
 ; GCN-O2-NEXT:       Canonicalize natural loops
 ; GCN-O2-NEXT:       Scalar Evolution Analysis
 ; GCN-O2-NEXT:       Function Alias Analysis Results
@ -598,6 +596,9 @@
 ; GCN-O2-NEXT:       Optimization Remark Emitter
 ; GCN-O2-NEXT:       Inject TLI Mappings
 ; GCN-O2-NEXT:       SLP Vectorizer
 ; GCN-O2-NEXT:       Optimize scalar/vector ops
 ; GCN-O2-NEXT:       Early CSE
 ; GCN-O2-NEXT:       Function Alias Analysis Results
 ; GCN-O2-NEXT:       Optimization Remark Emitter
 ; GCN-O2-NEXT:       Combine redundant instructions
 ; GCN-O2-NEXT:       Canonicalize natural loops
@ -924,8 +925,6 @@
 ; GCN-O3-NEXT:       Optimization Remark Emitter
 ; GCN-O3-NEXT:       Inject TLI Mappings
 ; GCN-O3-NEXT:       Loop Vectorization
 ; GCN-O3-NEXT:       Optimize scalar/vector ops
 ; GCN-O3-NEXT:       Early CSE
 ; GCN-O3-NEXT:       Canonicalize natural loops
 ; GCN-O3-NEXT:       Scalar Evolution Analysis
 ; GCN-O3-NEXT:       Function Alias Analysis Results
@ -951,6 +950,9 @@
 ; GCN-O3-NEXT:       Optimization Remark Emitter
 ; GCN-O3-NEXT:       Inject TLI Mappings
 ; GCN-O3-NEXT:       SLP Vectorizer
 ; GCN-O3-NEXT:       Optimize scalar/vector ops
 ; GCN-O3-NEXT:       Early CSE
 ; GCN-O3-NEXT:       Function Alias Analysis Results
 ; GCN-O3-NEXT:       Optimization Remark Emitter
 ; GCN-O3-NEXT:       Combine redundant instructions
 ; GCN-O3-NEXT:       Canonicalize natural loops
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@ -253,8 +253,6 @@
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
@ -262,6 +260,8 @@
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@ -223,8 +223,6 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
@ -232,6 +230,8 @@
 ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@ -191,8 +191,6 @@
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
@ -200,6 +198,8 @@
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@ -202,8 +202,6 @@
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
@ -211,6 +209,8 @@
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@ -227,8 +227,6 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
 ; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@ -254,6 +252,9 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
 ; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@ -232,8 +232,6 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
 ; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@ -259,6 +257,9 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
 ; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@ -213,8 +213,6 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       Loop Vectorization
 ; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Canonicalize natural loops
 ; CHECK-NEXT:       Scalar Evolution Analysis
 ; CHECK-NEXT:       Function Alias Analysis Results
@ -240,6 +238,9 @@
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
 ; CHECK-NEXT:       Optimize scalar/vector ops
 ; CHECK-NEXT:       Early CSE
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
 ; CHECK-NEXT:       Canonicalize natural loops
--- a/llvm/test/Other/opt-pipeline-vector-passes.ll
+++ b/llvm/test/Other/opt-pipeline-vector-passes.ll
@ -12,15 +12,15 @@
 ; OLDPM_O1-LABEL:  Pass Arguments:
 ; OLDPM_O1:        Loop Vectorization
 ; OLDPM_O1:        Optimize scalar/vector ops
 ; OLDPM_O1-NOT:    SLP Vectorizer
 ; OLDPM_O1:        Optimize scalar/vector ops
 ; Everything runs at -O2.
 ; OLDPM_O2-LABEL:  Pass Arguments:
 ; OLDPM_O2:        Loop Vectorization
 ; OLDPM_O2:        Optimize scalar/vector ops
 ; OLDPM_O2:        SLP Vectorizer
 ; OLDPM_O2:        Optimize scalar/vector ops
 ; The loop vectorizer still runs at both -O1/-O2 even with the
 ; debug flag, but it only works on loops explicitly annotated
@ -28,24 +28,24 @@
 ; OLDPM_O1_FORCE_OFF-LABEL:  Pass Arguments:
 ; OLDPM_O1_FORCE_OFF:        Loop Vectorization
 ; OLDPM_O1_FORCE_OFF:        Optimize scalar/vector ops
 ; OLDPM_O1_FORCE_OFF-NOT:    SLP Vectorizer
 ; OLDPM_O1_FORCE_OFF:        Optimize scalar/vector ops
 ; OLDPM_O2_FORCE_OFF-LABEL:  Pass Arguments:
 ; OLDPM_O2_FORCE_OFF:        Loop Vectorization
 ; OLDPM_O2_FORCE_OFF:        Optimize scalar/vector ops
 ; OLDPM_O2_FORCE_OFF:        SLP Vectorizer
 ; OLDPM_O2_FORCE_OFF:        Optimize scalar/vector ops
 ; There should be no difference with the new pass manager.
 ; This is tested more thoroughly in other test files.
 ; NEWPM_O1-LABEL:  Running pass: LoopVectorizePass
 ; NEWPM_O1:        Running pass: VectorCombinePass
 ; NEWPM_O1-NOT:    Running pass: SLPVectorizerPass
 ; NEWPM_O1:        Running pass: VectorCombinePass
 ; NEWPM_O2-LABEL:  Running pass: LoopVectorizePass
 ; NEWPM_O2:        Running pass: VectorCombinePass
 ; NEWPM_O2:        Running pass: SLPVectorizerPass
 ; NEWPM_O2:        Running pass: VectorCombinePass
 define void @f() {
  ret void
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S                   | FileCheck %s
+; RUN: opt < %s -O3 -S                   | FileCheck %s --check-prefixes=CHECK,OLDPM
-; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
+; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s --check-prefixes=CHECK,NEWPM
 target triple = "x86_64--"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@ -69,22 +69,30 @@ define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float>
 }
 define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) {
-; CHECK-LABEL: @add_aggregate_store(
+; OLDPM-LABEL: @add_aggregate_store(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
+; OLDPM-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; OLDPM-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
+; OLDPM-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    store float [[TMP2]], float* [[R0]], align 4
+; OLDPM-NEXT:    [[TMP4:%.*]] = bitcast %struct.Vector4* [[R:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; OLDPM-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
-; CHECK-NEXT:    [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
+; OLDPM-NEXT:    ret void
-; CHECK-NEXT:    store float [[TMP3]], float* [[R1]], align 4
+;
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; NEWPM-LABEL: @add_aggregate_store(
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NEWPM-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
-; CHECK-NEXT:    [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
+; NEWPM-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    store float [[TMP5]], float* [[R2]], align 4
+; NEWPM-NEXT:    [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NEWPM-NEXT:    store float [[TMP2]], float* [[R0]], align 4
-; CHECK-NEXT:    [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
+; NEWPM-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP6]], float* [[R3]], align 4
+; NEWPM-NEXT:    [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
-; CHECK-NEXT:    ret void
+; NEWPM-NEXT:    store float [[TMP3]], float* [[R1]], align 4
 ; NEWPM-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
 ; NEWPM-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; NEWPM-NEXT:    [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
 ; NEWPM-NEXT:    store float [[TMP5]], float* [[R2]], align 4
 ; NEWPM-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; NEWPM-NEXT:    [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
 ; NEWPM-NEXT:    store float [[TMP6]], float* [[R3]], align 4
 ; NEWPM-NEXT:    ret void
 ;
  %a00 = extractelement <2 x float> %a0, i32 0
  %b00 = extractelement <2 x float> %b0, i32 0
--- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll
@ -9,18 +9,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: @hadd_reverse_v4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ; CHECK-NEXT:    [[VECINIT6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], [[B]]
 ; CHECK-NEXT:    [[VECINIT10:%.*]] = shufflevector <4 x float> [[VECINIT6]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[B]]
 ; CHECK-NEXT:    [[VECINIT14:%.*]] = shufflevector <4 x float> [[VECINIT10]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
 ; CHECK-NEXT:    ret <4 x float> [[VECINIT14]]
 ;
  %shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@ -45,18 +37,11 @@ define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: @reverse_hadd_v4f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], [[B]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x i32> <i32 undef, i32 4, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], [[B]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP10]], <4 x i32> <i32 6, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x float> [[TMP11]]
 ;
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@ -5,19 +5,15 @@
 target triple = "x86_64--"
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; FIXME: This should only need 2 'or' instructions.
 define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
 ; CHECK-NEXT:    [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]]
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i32> [[Z]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    ret i32 [[TMP7]]
 ;
  %z = and <4 x i32> %x, %y
  %z0 = extractelement <4 x i32> %z, i32 0