forked from OSchip/llvm-project
[VectorCombine] position pass after SLP in the optimization pipeline rather than before
There are 2 known problem patterns shown in the test diffs here: vector horizontal ops (an x86 specialization) and vector reductions. SLP has a greater ability to match and fold those patterns than vector-combine, so let SLP have the first chance at them.

This is a quick fix while we continue to improve vector-combine and possibly canonicalize to reduction intrinsics. In the longer term, we should improve matching of these patterns, because if they were created in the "bad" forms shown here, then we would miss optimizing them.

I'm not sure what is happening with alias analysis on the addsub test. The old pass manager's pass listing now shows an extra alias analysis line, and we see an improvement that comes from SLP vectorizing a store. I don't know what is missing with the new pass manager to make that happen. Strangely, I can't reproduce the behavior if I compile from C++ with clang and invoke the new PM with "-fexperimental-new-pass-manager".

Differential Revision: https://reviews.llvm.org/D80236
This commit is contained in:
parent
22ed724975
commit
6438ea45e0
|
@ -986,10 +986,6 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
|
||||||
OptimizePM.addPass(LoopVectorizePass(
|
OptimizePM.addPass(LoopVectorizePass(
|
||||||
LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
|
LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
|
||||||
|
|
||||||
// Enhance/cleanup vector code.
|
|
||||||
OptimizePM.addPass(VectorCombinePass());
|
|
||||||
OptimizePM.addPass(EarlyCSEPass());
|
|
||||||
|
|
||||||
// Eliminate loads by forwarding stores from the previous iteration to loads
|
// Eliminate loads by forwarding stores from the previous iteration to loads
|
||||||
// of the current iteration.
|
// of the current iteration.
|
||||||
OptimizePM.addPass(LoopLoadEliminationPass());
|
OptimizePM.addPass(LoopLoadEliminationPass());
|
||||||
|
@ -1016,6 +1012,9 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
|
||||||
if (PTO.SLPVectorization)
|
if (PTO.SLPVectorization)
|
||||||
OptimizePM.addPass(SLPVectorizerPass());
|
OptimizePM.addPass(SLPVectorizerPass());
|
||||||
|
|
||||||
|
// Enhance/cleanup vector code.
|
||||||
|
OptimizePM.addPass(VectorCombinePass());
|
||||||
|
OptimizePM.addPass(EarlyCSEPass());
|
||||||
OptimizePM.addPass(InstCombinePass());
|
OptimizePM.addPass(InstCombinePass());
|
||||||
|
|
||||||
// Unroll small loops to hide loop backedge latency and saturate any parallel
|
// Unroll small loops to hide loop backedge latency and saturate any parallel
|
||||||
|
|
|
@ -741,8 +741,6 @@ void PassManagerBuilder::populateModulePassManager(
|
||||||
MPM.add(createLoopDistributePass());
|
MPM.add(createLoopDistributePass());
|
||||||
|
|
||||||
MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
|
MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
|
||||||
MPM.add(createVectorCombinePass());
|
|
||||||
MPM.add(createEarlyCSEPass());
|
|
||||||
|
|
||||||
// Eliminate loads by forwarding stores from the previous iteration to loads
|
// Eliminate loads by forwarding stores from the previous iteration to loads
|
||||||
// of the current iteration.
|
// of the current iteration.
|
||||||
|
@ -783,6 +781,10 @@ void PassManagerBuilder::populateModulePassManager(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Enhance/cleanup vector code.
|
||||||
|
MPM.add(createVectorCombinePass());
|
||||||
|
MPM.add(createEarlyCSEPass());
|
||||||
|
|
||||||
addExtensionsToPM(EP_Peephole, MPM);
|
addExtensionsToPM(EP_Peephole, MPM);
|
||||||
MPM.add(createInstructionCombiningPass());
|
MPM.add(createInstructionCombiningPass());
|
||||||
|
|
||||||
|
|
|
@ -230,8 +230,6 @@
|
||||||
; GCN-O1-NEXT: Optimization Remark Emitter
|
; GCN-O1-NEXT: Optimization Remark Emitter
|
||||||
; GCN-O1-NEXT: Inject TLI Mappings
|
; GCN-O1-NEXT: Inject TLI Mappings
|
||||||
; GCN-O1-NEXT: Loop Vectorization
|
; GCN-O1-NEXT: Loop Vectorization
|
||||||
; GCN-O1-NEXT: Optimize scalar/vector ops
|
|
||||||
; GCN-O1-NEXT: Early CSE
|
|
||||||
; GCN-O1-NEXT: Canonicalize natural loops
|
; GCN-O1-NEXT: Canonicalize natural loops
|
||||||
; GCN-O1-NEXT: Scalar Evolution Analysis
|
; GCN-O1-NEXT: Scalar Evolution Analysis
|
||||||
; GCN-O1-NEXT: Function Alias Analysis Results
|
; GCN-O1-NEXT: Function Alias Analysis Results
|
||||||
|
@ -247,6 +245,8 @@
|
||||||
; GCN-O1-NEXT: Combine redundant instructions
|
; GCN-O1-NEXT: Combine redundant instructions
|
||||||
; GCN-O1-NEXT: Simplify the CFG
|
; GCN-O1-NEXT: Simplify the CFG
|
||||||
; GCN-O1-NEXT: Dominator Tree Construction
|
; GCN-O1-NEXT: Dominator Tree Construction
|
||||||
|
; GCN-O1-NEXT: Optimize scalar/vector ops
|
||||||
|
; GCN-O1-NEXT: Early CSE
|
||||||
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
|
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||||
; GCN-O1-NEXT: Function Alias Analysis Results
|
; GCN-O1-NEXT: Function Alias Analysis Results
|
||||||
; GCN-O1-NEXT: Natural Loop Information
|
; GCN-O1-NEXT: Natural Loop Information
|
||||||
|
@ -571,8 +571,6 @@
|
||||||
; GCN-O2-NEXT: Optimization Remark Emitter
|
; GCN-O2-NEXT: Optimization Remark Emitter
|
||||||
; GCN-O2-NEXT: Inject TLI Mappings
|
; GCN-O2-NEXT: Inject TLI Mappings
|
||||||
; GCN-O2-NEXT: Loop Vectorization
|
; GCN-O2-NEXT: Loop Vectorization
|
||||||
; GCN-O2-NEXT: Optimize scalar/vector ops
|
|
||||||
; GCN-O2-NEXT: Early CSE
|
|
||||||
; GCN-O2-NEXT: Canonicalize natural loops
|
; GCN-O2-NEXT: Canonicalize natural loops
|
||||||
; GCN-O2-NEXT: Scalar Evolution Analysis
|
; GCN-O2-NEXT: Scalar Evolution Analysis
|
||||||
; GCN-O2-NEXT: Function Alias Analysis Results
|
; GCN-O2-NEXT: Function Alias Analysis Results
|
||||||
|
@ -598,6 +596,9 @@
|
||||||
; GCN-O2-NEXT: Optimization Remark Emitter
|
; GCN-O2-NEXT: Optimization Remark Emitter
|
||||||
; GCN-O2-NEXT: Inject TLI Mappings
|
; GCN-O2-NEXT: Inject TLI Mappings
|
||||||
; GCN-O2-NEXT: SLP Vectorizer
|
; GCN-O2-NEXT: SLP Vectorizer
|
||||||
|
; GCN-O2-NEXT: Optimize scalar/vector ops
|
||||||
|
; GCN-O2-NEXT: Early CSE
|
||||||
|
; GCN-O2-NEXT: Function Alias Analysis Results
|
||||||
; GCN-O2-NEXT: Optimization Remark Emitter
|
; GCN-O2-NEXT: Optimization Remark Emitter
|
||||||
; GCN-O2-NEXT: Combine redundant instructions
|
; GCN-O2-NEXT: Combine redundant instructions
|
||||||
; GCN-O2-NEXT: Canonicalize natural loops
|
; GCN-O2-NEXT: Canonicalize natural loops
|
||||||
|
@ -924,8 +925,6 @@
|
||||||
; GCN-O3-NEXT: Optimization Remark Emitter
|
; GCN-O3-NEXT: Optimization Remark Emitter
|
||||||
; GCN-O3-NEXT: Inject TLI Mappings
|
; GCN-O3-NEXT: Inject TLI Mappings
|
||||||
; GCN-O3-NEXT: Loop Vectorization
|
; GCN-O3-NEXT: Loop Vectorization
|
||||||
; GCN-O3-NEXT: Optimize scalar/vector ops
|
|
||||||
; GCN-O3-NEXT: Early CSE
|
|
||||||
; GCN-O3-NEXT: Canonicalize natural loops
|
; GCN-O3-NEXT: Canonicalize natural loops
|
||||||
; GCN-O3-NEXT: Scalar Evolution Analysis
|
; GCN-O3-NEXT: Scalar Evolution Analysis
|
||||||
; GCN-O3-NEXT: Function Alias Analysis Results
|
; GCN-O3-NEXT: Function Alias Analysis Results
|
||||||
|
@ -951,6 +950,9 @@
|
||||||
; GCN-O3-NEXT: Optimization Remark Emitter
|
; GCN-O3-NEXT: Optimization Remark Emitter
|
||||||
; GCN-O3-NEXT: Inject TLI Mappings
|
; GCN-O3-NEXT: Inject TLI Mappings
|
||||||
; GCN-O3-NEXT: SLP Vectorizer
|
; GCN-O3-NEXT: SLP Vectorizer
|
||||||
|
; GCN-O3-NEXT: Optimize scalar/vector ops
|
||||||
|
; GCN-O3-NEXT: Early CSE
|
||||||
|
; GCN-O3-NEXT: Function Alias Analysis Results
|
||||||
; GCN-O3-NEXT: Optimization Remark Emitter
|
; GCN-O3-NEXT: Optimization Remark Emitter
|
||||||
; GCN-O3-NEXT: Combine redundant instructions
|
; GCN-O3-NEXT: Combine redundant instructions
|
||||||
; GCN-O3-NEXT: Canonicalize natural loops
|
; GCN-O3-NEXT: Canonicalize natural loops
|
||||||
|
|
|
@ -253,8 +253,6 @@
|
||||||
; CHECK-O-NEXT: Running pass: LoopVectorizePass
|
; CHECK-O-NEXT: Running pass: LoopVectorizePass
|
||||||
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
|
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
|
||||||
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
|
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
|
||||||
; CHECK-O-NEXT: Running pass: VectorCombinePass
|
|
||||||
; CHECK-O-NEXT: Running pass: EarlyCSEPass
|
|
||||||
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
|
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
|
||||||
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
|
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
|
||||||
; CHECK-O-NEXT: Running pass: InstCombinePass
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
||||||
|
@ -262,6 +260,8 @@
|
||||||
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
|
||||||
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
|
||||||
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
|
||||||
|
; CHECK-O-NEXT: Running pass: VectorCombinePass
|
||||||
|
; CHECK-O-NEXT: Running pass: EarlyCSEPass
|
||||||
; CHECK-O-NEXT: Running pass: InstCombinePass
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
||||||
; CHECK-O-NEXT: Running pass: LoopUnrollPass
|
; CHECK-O-NEXT: Running pass: LoopUnrollPass
|
||||||
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
||||||
|
|
|
@ -223,8 +223,6 @@
|
||||||
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
|
||||||
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
|
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
|
||||||
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
|
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
|
||||||
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
|
|
||||||
; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
|
|
||||||
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
|
||||||
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
|
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
|
||||||
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
|
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
|
||||||
|
@ -232,6 +230,8 @@
|
||||||
; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
|
||||||
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
|
||||||
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
|
||||||
|
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
|
||||||
|
; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
|
||||||
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
|
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
|
||||||
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
|
||||||
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
||||||
|
|
|
@ -191,8 +191,6 @@
|
||||||
; CHECK-O-NEXT: Running pass: LoopDistributePass
|
; CHECK-O-NEXT: Running pass: LoopDistributePass
|
||||||
; CHECK-O-NEXT: Running pass: InjectTLIMappings
|
; CHECK-O-NEXT: Running pass: InjectTLIMappings
|
||||||
; CHECK-O-NEXT: Running pass: LoopVectorizePass
|
; CHECK-O-NEXT: Running pass: LoopVectorizePass
|
||||||
; CHECK-O-NEXT: Running pass: VectorCombinePass
|
|
||||||
; CHECK-O-NEXT: Running pass: EarlyCSEPass
|
|
||||||
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
|
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
|
||||||
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
|
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
|
||||||
; CHECK-O-NEXT: Running pass: InstCombinePass
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
||||||
|
@ -200,6 +198,8 @@
|
||||||
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
|
||||||
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
|
||||||
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
|
||||||
|
; CHECK-O-NEXT: Running pass: VectorCombinePass
|
||||||
|
; CHECK-O-NEXT: Running pass: EarlyCSEPass
|
||||||
; CHECK-O-NEXT: Running pass: InstCombinePass
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
||||||
; CHECK-O-NEXT: Running pass: LoopUnrollPass
|
; CHECK-O-NEXT: Running pass: LoopUnrollPass
|
||||||
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
||||||
|
|
|
@ -202,8 +202,6 @@
|
||||||
; CHECK-O-NEXT: Running pass: LoopDistributePass
|
; CHECK-O-NEXT: Running pass: LoopDistributePass
|
||||||
; CHECK-O-NEXT: Running pass: InjectTLIMappings
|
; CHECK-O-NEXT: Running pass: InjectTLIMappings
|
||||||
; CHECK-O-NEXT: Running pass: LoopVectorizePass
|
; CHECK-O-NEXT: Running pass: LoopVectorizePass
|
||||||
; CHECK-O-NEXT: Running pass: VectorCombinePass
|
|
||||||
; CHECK-O-NEXT: Running pass: EarlyCSEPass
|
|
||||||
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
|
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
|
||||||
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
|
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
|
||||||
; CHECK-O-NEXT: Running pass: InstCombinePass
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
||||||
|
@ -211,6 +209,8 @@
|
||||||
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
|
||||||
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
|
||||||
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
|
; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
|
||||||
|
; CHECK-O-NEXT: Running pass: VectorCombinePass
|
||||||
|
; CHECK-O-NEXT: Running pass: EarlyCSEPass
|
||||||
; CHECK-O-NEXT: Running pass: InstCombinePass
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
||||||
; CHECK-O-NEXT: Running pass: LoopUnrollPass
|
; CHECK-O-NEXT: Running pass: LoopUnrollPass
|
||||||
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
||||||
|
|
|
@ -227,8 +227,6 @@
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Inject TLI Mappings
|
; CHECK-NEXT: Inject TLI Mappings
|
||||||
; CHECK-NEXT: Loop Vectorization
|
; CHECK-NEXT: Loop Vectorization
|
||||||
; CHECK-NEXT: Optimize scalar/vector ops
|
|
||||||
; CHECK-NEXT: Early CSE
|
|
||||||
; CHECK-NEXT: Canonicalize natural loops
|
; CHECK-NEXT: Canonicalize natural loops
|
||||||
; CHECK-NEXT: Scalar Evolution Analysis
|
; CHECK-NEXT: Scalar Evolution Analysis
|
||||||
; CHECK-NEXT: Function Alias Analysis Results
|
; CHECK-NEXT: Function Alias Analysis Results
|
||||||
|
@ -254,6 +252,9 @@
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Inject TLI Mappings
|
; CHECK-NEXT: Inject TLI Mappings
|
||||||
; CHECK-NEXT: SLP Vectorizer
|
; CHECK-NEXT: SLP Vectorizer
|
||||||
|
; CHECK-NEXT: Optimize scalar/vector ops
|
||||||
|
; CHECK-NEXT: Early CSE
|
||||||
|
; CHECK-NEXT: Function Alias Analysis Results
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Combine redundant instructions
|
; CHECK-NEXT: Combine redundant instructions
|
||||||
; CHECK-NEXT: Canonicalize natural loops
|
; CHECK-NEXT: Canonicalize natural loops
|
||||||
|
|
|
@ -232,8 +232,6 @@
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Inject TLI Mappings
|
; CHECK-NEXT: Inject TLI Mappings
|
||||||
; CHECK-NEXT: Loop Vectorization
|
; CHECK-NEXT: Loop Vectorization
|
||||||
; CHECK-NEXT: Optimize scalar/vector ops
|
|
||||||
; CHECK-NEXT: Early CSE
|
|
||||||
; CHECK-NEXT: Canonicalize natural loops
|
; CHECK-NEXT: Canonicalize natural loops
|
||||||
; CHECK-NEXT: Scalar Evolution Analysis
|
; CHECK-NEXT: Scalar Evolution Analysis
|
||||||
; CHECK-NEXT: Function Alias Analysis Results
|
; CHECK-NEXT: Function Alias Analysis Results
|
||||||
|
@ -259,6 +257,9 @@
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Inject TLI Mappings
|
; CHECK-NEXT: Inject TLI Mappings
|
||||||
; CHECK-NEXT: SLP Vectorizer
|
; CHECK-NEXT: SLP Vectorizer
|
||||||
|
; CHECK-NEXT: Optimize scalar/vector ops
|
||||||
|
; CHECK-NEXT: Early CSE
|
||||||
|
; CHECK-NEXT: Function Alias Analysis Results
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Combine redundant instructions
|
; CHECK-NEXT: Combine redundant instructions
|
||||||
; CHECK-NEXT: Canonicalize natural loops
|
; CHECK-NEXT: Canonicalize natural loops
|
||||||
|
|
|
@ -213,8 +213,6 @@
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Inject TLI Mappings
|
; CHECK-NEXT: Inject TLI Mappings
|
||||||
; CHECK-NEXT: Loop Vectorization
|
; CHECK-NEXT: Loop Vectorization
|
||||||
; CHECK-NEXT: Optimize scalar/vector ops
|
|
||||||
; CHECK-NEXT: Early CSE
|
|
||||||
; CHECK-NEXT: Canonicalize natural loops
|
; CHECK-NEXT: Canonicalize natural loops
|
||||||
; CHECK-NEXT: Scalar Evolution Analysis
|
; CHECK-NEXT: Scalar Evolution Analysis
|
||||||
; CHECK-NEXT: Function Alias Analysis Results
|
; CHECK-NEXT: Function Alias Analysis Results
|
||||||
|
@ -240,6 +238,9 @@
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Inject TLI Mappings
|
; CHECK-NEXT: Inject TLI Mappings
|
||||||
; CHECK-NEXT: SLP Vectorizer
|
; CHECK-NEXT: SLP Vectorizer
|
||||||
|
; CHECK-NEXT: Optimize scalar/vector ops
|
||||||
|
; CHECK-NEXT: Early CSE
|
||||||
|
; CHECK-NEXT: Function Alias Analysis Results
|
||||||
; CHECK-NEXT: Optimization Remark Emitter
|
; CHECK-NEXT: Optimization Remark Emitter
|
||||||
; CHECK-NEXT: Combine redundant instructions
|
; CHECK-NEXT: Combine redundant instructions
|
||||||
; CHECK-NEXT: Canonicalize natural loops
|
; CHECK-NEXT: Canonicalize natural loops
|
||||||
|
|
|
@ -12,15 +12,15 @@
|
||||||
|
|
||||||
; OLDPM_O1-LABEL: Pass Arguments:
|
; OLDPM_O1-LABEL: Pass Arguments:
|
||||||
; OLDPM_O1: Loop Vectorization
|
; OLDPM_O1: Loop Vectorization
|
||||||
; OLDPM_O1: Optimize scalar/vector ops
|
|
||||||
; OLDPM_O1-NOT: SLP Vectorizer
|
; OLDPM_O1-NOT: SLP Vectorizer
|
||||||
|
; OLDPM_O1: Optimize scalar/vector ops
|
||||||
|
|
||||||
; Everything runs at -O2.
|
; Everything runs at -O2.
|
||||||
|
|
||||||
; OLDPM_O2-LABEL: Pass Arguments:
|
; OLDPM_O2-LABEL: Pass Arguments:
|
||||||
; OLDPM_O2: Loop Vectorization
|
; OLDPM_O2: Loop Vectorization
|
||||||
; OLDPM_O2: Optimize scalar/vector ops
|
|
||||||
; OLDPM_O2: SLP Vectorizer
|
; OLDPM_O2: SLP Vectorizer
|
||||||
|
; OLDPM_O2: Optimize scalar/vector ops
|
||||||
|
|
||||||
; The loop vectorizer still runs at both -O1/-O2 even with the
|
; The loop vectorizer still runs at both -O1/-O2 even with the
|
||||||
; debug flag, but it only works on loops explicitly annotated
|
; debug flag, but it only works on loops explicitly annotated
|
||||||
|
@ -28,24 +28,24 @@
|
||||||
|
|
||||||
; OLDPM_O1_FORCE_OFF-LABEL: Pass Arguments:
|
; OLDPM_O1_FORCE_OFF-LABEL: Pass Arguments:
|
||||||
; OLDPM_O1_FORCE_OFF: Loop Vectorization
|
; OLDPM_O1_FORCE_OFF: Loop Vectorization
|
||||||
; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops
|
|
||||||
; OLDPM_O1_FORCE_OFF-NOT: SLP Vectorizer
|
; OLDPM_O1_FORCE_OFF-NOT: SLP Vectorizer
|
||||||
|
; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops
|
||||||
|
|
||||||
; OLDPM_O2_FORCE_OFF-LABEL: Pass Arguments:
|
; OLDPM_O2_FORCE_OFF-LABEL: Pass Arguments:
|
||||||
; OLDPM_O2_FORCE_OFF: Loop Vectorization
|
; OLDPM_O2_FORCE_OFF: Loop Vectorization
|
||||||
; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops
|
|
||||||
; OLDPM_O2_FORCE_OFF: SLP Vectorizer
|
; OLDPM_O2_FORCE_OFF: SLP Vectorizer
|
||||||
|
; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops
|
||||||
|
|
||||||
; There should be no difference with the new pass manager.
|
; There should be no difference with the new pass manager.
|
||||||
; This is tested more thoroughly in other test files.
|
; This is tested more thoroughly in other test files.
|
||||||
|
|
||||||
; NEWPM_O1-LABEL: Running pass: LoopVectorizePass
|
; NEWPM_O1-LABEL: Running pass: LoopVectorizePass
|
||||||
; NEWPM_O1: Running pass: VectorCombinePass
|
|
||||||
; NEWPM_O1-NOT: Running pass: SLPVectorizerPass
|
; NEWPM_O1-NOT: Running pass: SLPVectorizerPass
|
||||||
|
; NEWPM_O1: Running pass: VectorCombinePass
|
||||||
|
|
||||||
; NEWPM_O2-LABEL: Running pass: LoopVectorizePass
|
; NEWPM_O2-LABEL: Running pass: LoopVectorizePass
|
||||||
; NEWPM_O2: Running pass: VectorCombinePass
|
|
||||||
; NEWPM_O2: Running pass: SLPVectorizerPass
|
; NEWPM_O2: Running pass: SLPVectorizerPass
|
||||||
|
; NEWPM_O2: Running pass: VectorCombinePass
|
||||||
|
|
||||||
define void @f() {
|
define void @f() {
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||||
; RUN: opt < %s -O3 -S | FileCheck %s
|
; RUN: opt < %s -O3 -S | FileCheck %s --check-prefixes=CHECK,OLDPM
|
||||||
; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s
|
; RUN: opt < %s -passes='default<O3>' -S | FileCheck %s --check-prefixes=CHECK,NEWPM
|
||||||
|
|
||||||
target triple = "x86_64--"
|
target triple = "x86_64--"
|
||||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||||
|
@ -69,22 +69,30 @@ define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float>
|
||||||
}
|
}
|
||||||
|
|
||||||
define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) {
|
define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) {
|
||||||
; CHECK-LABEL: @add_aggregate_store(
|
; OLDPM-LABEL: @add_aggregate_store(
|
||||||
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
|
; OLDPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
|
||||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
|
; OLDPM-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
|
||||||
; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
|
; OLDPM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
; CHECK-NEXT: store float [[TMP2]], float* [[R0]], align 4
|
; OLDPM-NEXT: [[TMP4:%.*]] = bitcast %struct.Vector4* [[R:%.*]] to <4 x float>*
|
||||||
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
|
; OLDPM-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
|
||||||
; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
|
; OLDPM-NEXT: ret void
|
||||||
; CHECK-NEXT: store float [[TMP3]], float* [[R1]], align 4
|
;
|
||||||
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
|
; NEWPM-LABEL: @add_aggregate_store(
|
||||||
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
|
; NEWPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
|
||||||
; CHECK-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
|
; NEWPM-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
|
||||||
; CHECK-NEXT: store float [[TMP5]], float* [[R2]], align 4
|
; NEWPM-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
|
||||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
|
; NEWPM-NEXT: store float [[TMP2]], float* [[R0]], align 4
|
||||||
; CHECK-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
|
; NEWPM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
|
||||||
; CHECK-NEXT: store float [[TMP6]], float* [[R3]], align 4
|
; NEWPM-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
|
||||||
; CHECK-NEXT: ret void
|
; NEWPM-NEXT: store float [[TMP3]], float* [[R1]], align 4
|
||||||
|
; NEWPM-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
|
||||||
|
; NEWPM-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
|
||||||
|
; NEWPM-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
|
||||||
|
; NEWPM-NEXT: store float [[TMP5]], float* [[R2]], align 4
|
||||||
|
; NEWPM-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
|
||||||
|
; NEWPM-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
|
||||||
|
; NEWPM-NEXT: store float [[TMP6]], float* [[R3]], align 4
|
||||||
|
; NEWPM-NEXT: ret void
|
||||||
;
|
;
|
||||||
%a00 = extractelement <2 x float> %a0, i32 0
|
%a00 = extractelement <2 x float> %a0, i32 0
|
||||||
%b00 = extractelement <2 x float> %b0, i32 0
|
%b00 = extractelement <2 x float> %b0, i32 0
|
||||||
|
|
|
@ -9,18 +9,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||||
|
|
||||||
define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
|
define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
|
||||||
; CHECK-LABEL: @hadd_reverse_v4f32(
|
; CHECK-LABEL: @hadd_reverse_v4f32(
|
||||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
|
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
|
||||||
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
|
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
|
||||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
|
||||||
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]]
|
; CHECK-NEXT: ret <4 x float> [[TMP3]]
|
||||||
; CHECK-NEXT: [[VECINIT6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
|
|
||||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
|
|
||||||
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], [[B]]
|
|
||||||
; CHECK-NEXT: [[VECINIT10:%.*]] = shufflevector <4 x float> [[VECINIT6]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
|
|
||||||
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
||||||
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[B]]
|
|
||||||
; CHECK-NEXT: [[VECINIT14:%.*]] = shufflevector <4 x float> [[VECINIT10]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
|
|
||||||
; CHECK-NEXT: ret <4 x float> [[VECINIT14]]
|
|
||||||
;
|
;
|
||||||
%shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
|
%shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
|
||||||
%shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
|
%shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
|
||||||
|
@ -45,18 +37,11 @@ define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 {
|
||||||
|
|
||||||
define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 {
|
define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 {
|
||||||
; CHECK-LABEL: @reverse_hadd_v4f32(
|
; CHECK-LABEL: @reverse_hadd_v4f32(
|
||||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||||
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
|
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
|
; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
|
||||||
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]]
|
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
|
||||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> <i32 undef, i32 undef, i32 6, i32 0>
|
; CHECK-NEXT: ret <4 x float> [[TMP4]]
|
||||||
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
|
||||||
; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], [[B]]
|
|
||||||
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x i32> <i32 undef, i32 4, i32 2, i32 3>
|
|
||||||
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
|
|
||||||
; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], [[B]]
|
|
||||||
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP10]], <4 x i32> <i32 6, i32 1, i32 2, i32 3>
|
|
||||||
; CHECK-NEXT: ret <4 x float> [[TMP11]]
|
|
||||||
;
|
;
|
||||||
%vecext = extractelement <4 x float> %a, i32 0
|
%vecext = extractelement <4 x float> %a, i32 0
|
||||||
%vecext1 = extractelement <4 x float> %a, i32 1
|
%vecext1 = extractelement <4 x float> %a, i32 1
|
||||||
|
|
|
@ -5,19 +5,15 @@
|
||||||
target triple = "x86_64--"
|
target triple = "x86_64--"
|
||||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||||
|
|
||||||
; FIXME: This should only need 2 'or' instructions.
|
|
||||||
|
|
||||||
define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
|
define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
|
||||||
; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
|
; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
|
||||||
; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
|
; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
|
||||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||||
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]]
|
; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[Z]], [[RDX_SHUF]]
|
||||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
|
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
||||||
; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
|
; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
|
||||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
|
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
|
||||||
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
|
; CHECK-NEXT: ret i32 [[TMP1]]
|
||||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
|
|
||||||
; CHECK-NEXT: ret i32 [[TMP7]]
|
|
||||||
;
|
;
|
||||||
%z = and <4 x i32> %x, %y
|
%z = and <4 x i32> %x, %y
|
||||||
%z0 = extractelement <4 x i32> %z, i32 0
|
%z0 = extractelement <4 x i32> %z, i32 0
|
||||||
|
|
Loading…
Reference in New Issue