[VectorCombine] position pass after SLP in the optimization pipeline rather than before

There are 2 known problem patterns shown in the test diffs here:
vector horizontal ops (an x86 specialization) and vector reductions.

SLP has greater ability to match and fold those than vector-combine,
so let SLP have first chance at that.

This is a quick fix while we continue to improve vector-combine and
possibly canonicalize to reduction intrinsics.

In the longer term, we should improve matching of these patterns
because if they were created in the "bad" forms shown here, then we
would miss optimizing them.

I'm not sure what is happening with alias analysis on the addsub test.
The old pass manager pipeline now shows an extra "Function Alias Analysis
Results" line after the relocated passes, and with the old pass manager we
see an improvement on that test that comes from SLP vectorizing a store.
I don't know what's missing with the new pass manager to make that happen.
Strangely, I can't reproduce the behavior if I compile from C++ with
clang and invoke the new PM with "-fexperimental-new-pass-manager".

Differential Revision: https://reviews.llvm.org/D80236
This commit is contained in:
Sanjay Patel 2020-05-22 12:13:18 -04:00
parent 22ed724975
commit 6438ea45e0
14 changed files with 79 additions and 84 deletions

View File

@ -986,10 +986,6 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
OptimizePM.addPass(LoopVectorizePass( OptimizePM.addPass(LoopVectorizePass(
LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
// Enhance/cleanup vector code.
OptimizePM.addPass(VectorCombinePass());
OptimizePM.addPass(EarlyCSEPass());
// Eliminate loads by forwarding stores from the previous iteration to loads // Eliminate loads by forwarding stores from the previous iteration to loads
// of the current iteration. // of the current iteration.
OptimizePM.addPass(LoopLoadEliminationPass()); OptimizePM.addPass(LoopLoadEliminationPass());
@ -1016,6 +1012,9 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
if (PTO.SLPVectorization) if (PTO.SLPVectorization)
OptimizePM.addPass(SLPVectorizerPass()); OptimizePM.addPass(SLPVectorizerPass());
// Enhance/cleanup vector code.
OptimizePM.addPass(VectorCombinePass());
OptimizePM.addPass(EarlyCSEPass());
OptimizePM.addPass(InstCombinePass()); OptimizePM.addPass(InstCombinePass());
// Unroll small loops to hide loop backedge latency and saturate any parallel // Unroll small loops to hide loop backedge latency and saturate any parallel

View File

@ -741,8 +741,6 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createLoopDistributePass()); MPM.add(createLoopDistributePass());
MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
MPM.add(createVectorCombinePass());
MPM.add(createEarlyCSEPass());
// Eliminate loads by forwarding stores from the previous iteration to loads // Eliminate loads by forwarding stores from the previous iteration to loads
// of the current iteration. // of the current iteration.
@ -783,6 +781,10 @@ void PassManagerBuilder::populateModulePassManager(
} }
} }
// Enhance/cleanup vector code.
MPM.add(createVectorCombinePass());
MPM.add(createEarlyCSEPass());
addExtensionsToPM(EP_Peephole, MPM); addExtensionsToPM(EP_Peephole, MPM);
MPM.add(createInstructionCombiningPass()); MPM.add(createInstructionCombiningPass());

View File

@ -230,8 +230,6 @@
; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Optimization Remark Emitter
; GCN-O1-NEXT: Inject TLI Mappings ; GCN-O1-NEXT: Inject TLI Mappings
; GCN-O1-NEXT: Loop Vectorization ; GCN-O1-NEXT: Loop Vectorization
; GCN-O1-NEXT: Optimize scalar/vector ops
; GCN-O1-NEXT: Early CSE
; GCN-O1-NEXT: Canonicalize natural loops ; GCN-O1-NEXT: Canonicalize natural loops
; GCN-O1-NEXT: Scalar Evolution Analysis ; GCN-O1-NEXT: Scalar Evolution Analysis
; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Function Alias Analysis Results
@ -247,6 +245,8 @@
; GCN-O1-NEXT: Combine redundant instructions ; GCN-O1-NEXT: Combine redundant instructions
; GCN-O1-NEXT: Simplify the CFG ; GCN-O1-NEXT: Simplify the CFG
; GCN-O1-NEXT: Dominator Tree Construction ; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Optimize scalar/vector ops
; GCN-O1-NEXT: Early CSE
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Function Alias Analysis Results
; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Natural Loop Information
@ -571,8 +571,6 @@
; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Optimization Remark Emitter
; GCN-O2-NEXT: Inject TLI Mappings ; GCN-O2-NEXT: Inject TLI Mappings
; GCN-O2-NEXT: Loop Vectorization ; GCN-O2-NEXT: Loop Vectorization
; GCN-O2-NEXT: Optimize scalar/vector ops
; GCN-O2-NEXT: Early CSE
; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: Canonicalize natural loops
; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Function Alias Analysis Results
@ -598,6 +596,9 @@
; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Optimization Remark Emitter
; GCN-O2-NEXT: Inject TLI Mappings ; GCN-O2-NEXT: Inject TLI Mappings
; GCN-O2-NEXT: SLP Vectorizer ; GCN-O2-NEXT: SLP Vectorizer
; GCN-O2-NEXT: Optimize scalar/vector ops
; GCN-O2-NEXT: Early CSE
; GCN-O2-NEXT: Function Alias Analysis Results
; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Optimization Remark Emitter
; GCN-O2-NEXT: Combine redundant instructions ; GCN-O2-NEXT: Combine redundant instructions
; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: Canonicalize natural loops
@ -924,8 +925,6 @@
; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Inject TLI Mappings ; GCN-O3-NEXT: Inject TLI Mappings
; GCN-O3-NEXT: Loop Vectorization ; GCN-O3-NEXT: Loop Vectorization
; GCN-O3-NEXT: Optimize scalar/vector ops
; GCN-O3-NEXT: Early CSE
; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: Canonicalize natural loops
; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Function Alias Analysis Results
@ -951,6 +950,9 @@
; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Inject TLI Mappings ; GCN-O3-NEXT: Inject TLI Mappings
; GCN-O3-NEXT: SLP Vectorizer ; GCN-O3-NEXT: SLP Vectorizer
; GCN-O3-NEXT: Optimize scalar/vector ops
; GCN-O3-NEXT: Early CSE
; GCN-O3-NEXT: Function Alias Analysis Results
; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Combine redundant instructions ; GCN-O3-NEXT: Combine redundant instructions
; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: Canonicalize natural loops

View File

@ -253,8 +253,6 @@
; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass
@ -262,6 +260,8 @@
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

View File

@ -223,8 +223,6 @@
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
@ -232,6 +230,8 @@
; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass

View File

@ -191,8 +191,6 @@
; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: LoopDistributePass
; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: InjectTLIMappings
; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass
@ -200,6 +198,8 @@
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

View File

@ -202,8 +202,6 @@
; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: LoopDistributePass
; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: InjectTLIMappings
; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass
@ -211,6 +209,8 @@
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: VectorCombinePass
; CHECK-O-NEXT: Running pass: EarlyCSEPass
; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass

View File

@ -227,8 +227,6 @@
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: Loop Vectorization ; CHECK-NEXT: Loop Vectorization
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Function Alias Analysis Results
@ -254,6 +252,9 @@
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer ; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Combine redundant instructions
; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Canonicalize natural loops

View File

@ -232,8 +232,6 @@
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: Loop Vectorization ; CHECK-NEXT: Loop Vectorization
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Function Alias Analysis Results
@ -259,6 +257,9 @@
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer ; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Combine redundant instructions
; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Canonicalize natural loops

View File

@ -213,8 +213,6 @@
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: Loop Vectorization ; CHECK-NEXT: Loop Vectorization
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Function Alias Analysis Results
@ -240,6 +238,9 @@
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer ; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimize scalar/vector ops
; CHECK-NEXT: Early CSE
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Combine redundant instructions
; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Canonicalize natural loops

View File

@ -12,15 +12,15 @@
; OLDPM_O1-LABEL: Pass Arguments: ; OLDPM_O1-LABEL: Pass Arguments:
; OLDPM_O1: Loop Vectorization ; OLDPM_O1: Loop Vectorization
; OLDPM_O1: Optimize scalar/vector ops
; OLDPM_O1-NOT: SLP Vectorizer ; OLDPM_O1-NOT: SLP Vectorizer
; OLDPM_O1: Optimize scalar/vector ops
; Everything runs at -O2. ; Everything runs at -O2.
; OLDPM_O2-LABEL: Pass Arguments: ; OLDPM_O2-LABEL: Pass Arguments:
; OLDPM_O2: Loop Vectorization ; OLDPM_O2: Loop Vectorization
; OLDPM_O2: Optimize scalar/vector ops
; OLDPM_O2: SLP Vectorizer ; OLDPM_O2: SLP Vectorizer
; OLDPM_O2: Optimize scalar/vector ops
; The loop vectorizer still runs at both -O1/-O2 even with the ; The loop vectorizer still runs at both -O1/-O2 even with the
; debug flag, but it only works on loops explicitly annotated ; debug flag, but it only works on loops explicitly annotated
@ -28,24 +28,24 @@
; OLDPM_O1_FORCE_OFF-LABEL: Pass Arguments: ; OLDPM_O1_FORCE_OFF-LABEL: Pass Arguments:
; OLDPM_O1_FORCE_OFF: Loop Vectorization ; OLDPM_O1_FORCE_OFF: Loop Vectorization
; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops
; OLDPM_O1_FORCE_OFF-NOT: SLP Vectorizer ; OLDPM_O1_FORCE_OFF-NOT: SLP Vectorizer
; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops
; OLDPM_O2_FORCE_OFF-LABEL: Pass Arguments: ; OLDPM_O2_FORCE_OFF-LABEL: Pass Arguments:
; OLDPM_O2_FORCE_OFF: Loop Vectorization ; OLDPM_O2_FORCE_OFF: Loop Vectorization
; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops
; OLDPM_O2_FORCE_OFF: SLP Vectorizer ; OLDPM_O2_FORCE_OFF: SLP Vectorizer
; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops
; There should be no difference with the new pass manager. ; There should be no difference with the new pass manager.
; This is tested more thoroughly in other test files. ; This is tested more thoroughly in other test files.
; NEWPM_O1-LABEL: Running pass: LoopVectorizePass ; NEWPM_O1-LABEL: Running pass: LoopVectorizePass
; NEWPM_O1: Running pass: VectorCombinePass
; NEWPM_O1-NOT: Running pass: SLPVectorizerPass ; NEWPM_O1-NOT: Running pass: SLPVectorizerPass
; NEWPM_O1: Running pass: VectorCombinePass
; NEWPM_O2-LABEL: Running pass: LoopVectorizePass ; NEWPM_O2-LABEL: Running pass: LoopVectorizePass
; NEWPM_O2: Running pass: VectorCombinePass
; NEWPM_O2: Running pass: SLPVectorizerPass ; NEWPM_O2: Running pass: SLPVectorizerPass
; NEWPM_O2: Running pass: VectorCombinePass
define void @f() { define void @f() {
ret void ret void

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -O3 -S | FileCheck %s ; RUN: opt < %s -O3 -S | FileCheck %s --check-prefixes=CHECK,OLDPM
; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s ; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s --check-prefixes=CHECK,NEWPM
target triple = "x86_64--" target triple = "x86_64--"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@ -69,22 +69,30 @@ define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float>
} }
define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) { define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) {
; CHECK-LABEL: @add_aggregate_store( ; OLDPM-LABEL: @add_aggregate_store(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]] ; OLDPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 ; OLDPM-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0 ; OLDPM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: store float [[TMP2]], float* [[R0]], align 4 ; OLDPM-NEXT: [[TMP4:%.*]] = bitcast %struct.Vector4* [[R:%.*]] to <4 x float>*
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 ; OLDPM-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1 ; OLDPM-NEXT: ret void
; CHECK-NEXT: store float [[TMP3]], float* [[R1]], align 4 ;
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] ; NEWPM-LABEL: @add_aggregate_store(
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; NEWPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
; CHECK-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2 ; NEWPM-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; CHECK-NEXT: store float [[TMP5]], float* [[R2]], align 4 ; NEWPM-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 ; NEWPM-NEXT: store float [[TMP2]], float* [[R0]], align 4
; CHECK-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3 ; NEWPM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; CHECK-NEXT: store float [[TMP6]], float* [[R3]], align 4 ; NEWPM-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
; CHECK-NEXT: ret void ; NEWPM-NEXT: store float [[TMP3]], float* [[R1]], align 4
; NEWPM-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
; NEWPM-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
; NEWPM-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
; NEWPM-NEXT: store float [[TMP5]], float* [[R2]], align 4
; NEWPM-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
; NEWPM-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
; NEWPM-NEXT: store float [[TMP6]], float* [[R3]], align 4
; NEWPM-NEXT: ret void
; ;
%a00 = extractelement <2 x float> %a0, i32 0 %a00 = extractelement <2 x float> %a0, i32 0
%b00 = extractelement <2 x float> %b0, i32 0 %b00 = extractelement <2 x float> %b0, i32 0

View File

@ -9,18 +9,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 { define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: @hadd_reverse_v4f32( ; CHECK-LABEL: @hadd_reverse_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]] ; CHECK-NEXT: ret <4 x float> [[TMP3]]
; CHECK-NEXT: [[VECINIT6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], [[B]]
; CHECK-NEXT: [[VECINIT10:%.*]] = shufflevector <4 x float> [[VECINIT6]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[B]]
; CHECK-NEXT: [[VECINIT14:%.*]] = shufflevector <4 x float> [[VECINIT10]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; CHECK-NEXT: ret <4 x float> [[VECINIT14]]
; ;
%shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0> %shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> %shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@ -45,18 +37,11 @@ define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 { define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: @reverse_hadd_v4f32( ; CHECK-LABEL: @reverse_hadd_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0> ; CHECK-NEXT: ret <4 x float> [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], [[B]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x i32> <i32 undef, i32 4, i32 2, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], [[B]]
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP10]], <4 x i32> <i32 6, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x float> [[TMP11]]
; ;
%vecext = extractelement <4 x float> %a, i32 0 %vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1 %vecext1 = extractelement <4 x float> %a, i32 1

View File

@ -5,19 +5,15 @@
target triple = "x86_64--" target triple = "x86_64--"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; FIXME: This should only need 2 'or' instructions.
define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @ext_ext_or_reduction_v4i32( ; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]] ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[Z]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret i32 [[TMP1]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
; CHECK-NEXT: ret i32 [[TMP7]]
; ;
%z = and <4 x i32> %x, %y %z = and <4 x i32> %x, %y
%z0 = extractelement <4 x i32> %z, i32 0 %z0 = extractelement <4 x i32> %z, i32 0