Revert "[PassManager] add helper function to hold set of vector passes"

This reverts commit fefcb1f878.
It was supposed to be NFC, but as noted in the post-commit
comments in D102002, that was not true: SimplifyCFG uses
different parameters and there's a difference in an
extension point / callback.
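
Concretely, two of the differences are visible in the hunks below. The helper ran SimplifyCFG with the full set of aggressive options in the LTO pipelines as well, while the code restored here only enables common-instruction hoisting there; condensed from the diff (not an exact excerpt of the patch):

    // Helper, both default and LTO pipelines:
    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                    .forwardSwitchCondToPhi(true)
                                    .convertSwitchToLookupTable(true)
                                    .needCanonicalLoops(false)
                                    .hoistCommonInsts(true)
                                    .sinkCommonInsts(true)));
    // Restored LTO pipeline:
    MainFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)));

Likewise, in the legacy LTO pipeline the helper invoked the EP_Peephole extension-point callbacks before the final InstCombine, while the restored addLTOOptimizationPasses runs them after it (compare the last two hunks below).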
Sanjay Patel 2021-05-10 10:54:42 -04:00
parent 6da348569c
commit 822be4bec8
4 changed files with 200 additions and 223 deletions

llvm/include/llvm/Passes/PassBuilder.h

@@ -709,9 +709,6 @@ private:
  void addRequiredLTOPreLinkPasses(ModulePassManager &MPM);
  void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM,
                       bool IsLTO);
  static Optional<std::vector<PipelineElement>>
  parsePipelineText(StringRef Text);

llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h

@@ -218,7 +218,6 @@ private:
  void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM);
  void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS);
  void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM);
  void addVectorPasses(legacy::PassManagerBase &PM, bool IsLTO);

public:
  /// populateFunctionPassManager - This fills in the function pass manager,

llvm/lib/Passes/PassBuilder.cpp

@@ -1201,118 +1201,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
  return MPM;
}

/// FIXME: Should LTO cause any differences to this set of passes?
void PassBuilder::addVectorPasses(OptimizationLevel Level,
                                  FunctionPassManager &FPM, bool IsLTO) {
  FPM.addPass(LoopVectorizePass(
      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));

  if (IsLTO) {
    // The vectorizer may have significantly shortened a loop body; unroll
    // again. Unroll small loops to hide loop backedge latency and saturate any
    // parallel execution resources of an out-of-order processor. We also then
    // need to clean up redundancies and loop invariant code.
    // FIXME: It would be really good to use a loop-integrated instruction
    // combiner for cleanup here so that the unrolling and LICM can be pipelined
    // across the loop nests.
    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
    if (EnableUnrollAndJam && PTO.LoopUnrolling)
      FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
        PTO.ForgetAllSCEVInLoopUnroll)));
    FPM.addPass(WarnMissedTransformationsPass());
  }

  if (!IsLTO) {
    // Eliminate loads by forwarding stores from the previous iteration to loads
    // of the current iteration.
    FPM.addPass(LoopLoadEliminationPass());
  }
  // Cleanup after the loop optimization passes.
  FPM.addPass(InstCombinePass());

  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
    // At higher optimization levels, try to clean up any runtime overlap and
    // alignment checks inserted by the vectorizer. We want to track correlated
    // runtime checks for two inner loops in the same outer loop, fold any
    // common computations, hoist loop-invariant aspects out of any outer loop,
    // and unswitch the runtime checks if possible. Once hoisted, we may have
    // dead (or speculatable) control flows or more combining opportunities.
    FPM.addPass(EarlyCSEPass());
    FPM.addPass(CorrelatedValuePropagationPass());
    FPM.addPass(InstCombinePass());
    LoopPassManager LPM;
    LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
    LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
                                       OptimizationLevel::O3));
    FPM.addPass(
        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
    FPM.addPass(createFunctionToLoopPassAdaptor(
        std::move(LPM), EnableMSSALoopDependency,
        /*UseBlockFrequencyInfo=*/true));
    FPM.addPass(SimplifyCFGPass());
    FPM.addPass(InstCombinePass());
  }

  // Now that we've formed fast to execute loop structures, we do further
  // optimizations. These are run afterward as they might block doing complex
  // analyses and transforms such as what are needed for loop vectorization.

  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
  // GVN, loop transforms, and others have already run, so it's now better to
  // convert to more optimized IR using more aggressive simplify CFG options.
  // The extra sinking transform can create larger basic blocks, so do this
  // before SLP vectorization.
  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                  .forwardSwitchCondToPhi(true)
                                  .convertSwitchToLookupTable(true)
                                  .needCanonicalLoops(false)
                                  .hoistCommonInsts(true)
                                  .sinkCommonInsts(true)));

  if (IsLTO) {
    FPM.addPass(SCCPPass());
    FPM.addPass(InstCombinePass());
    FPM.addPass(BDCEPass());
  }

  // Optimize parallel scalar instruction chains into SIMD instructions.
  if (PTO.SLPVectorization) {
    FPM.addPass(SLPVectorizerPass());
    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
      FPM.addPass(EarlyCSEPass());
    }
  }
  // Enhance/cleanup vector code.
  FPM.addPass(VectorCombinePass());

  if (IsLTO) {
    // After vectorization, assume intrinsics may tell us more about pointer
    // alignments.
    FPM.addPass(AlignmentFromAssumptionsPass());
  }
  FPM.addPass(InstCombinePass());

  if (!IsLTO) {
    // The vectorizer may have significantly shortened a loop body; unroll
    // again. Unroll small loops to hide loop backedge latency and saturate any
    // parallel execution resources of an out-of-order processor. We also then
    // need to clean up redundancies and loop invariant code.
    // FIXME: It would be really good to use a loop-integrated instruction
    // combiner for cleanup here so that the unrolling and LICM can be pipelined
    // across the loop nests.
    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
    if (EnableUnrollAndJam && PTO.LoopUnrolling)
      FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
        PTO.ForgetAllSCEVInLoopUnroll)));
    FPM.addPass(WarnMissedTransformationsPass());
    FPM.addPass(InstCombinePass());
  }
}
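
A note on the double negation in LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization) above: the constructor takes disable-style flags (interleave/vectorize only when forced), so the pipeline-tuning booleans are inverted on the way in. A minimal sketch of the equivalent spelled-out form (the local variable names are illustrative only):

    // PTO.LoopInterleaving == true means "interleaving is enabled", but the
    // vectorizer option means "interleave only when forced by metadata",
    // hence the negation; likewise for vectorization.
    bool InterleaveOnlyWhenForced = !PTO.LoopInterleaving;
    bool VectorizeOnlyWhenForced = !PTO.LoopVectorization;
    FPM.addPass(LoopVectorizePass(
        LoopVectorizeOptions(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced)));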
ModulePassManager
PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
                                             bool LTOPreLink) {
@@ -1407,8 +1295,83 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
  // from the TargetLibraryInfo.
  OptimizePM.addPass(InjectTLIMappings());

  addVectorPasses(Level, OptimizePM, /* IsLTO */ false);

  // Now run the core loop vectorizer.
  OptimizePM.addPass(LoopVectorizePass(
      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));

  // Eliminate loads by forwarding stores from the previous iteration to loads
  // of the current iteration.
  OptimizePM.addPass(LoopLoadEliminationPass());

  // Cleanup after the loop optimization passes.
  OptimizePM.addPass(InstCombinePass());

  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
    // At higher optimization levels, try to clean up any runtime overlap and
    // alignment checks inserted by the vectorizer. We want to track correlated
    // runtime checks for two inner loops in the same outer loop, fold any
    // common computations, hoist loop-invariant aspects out of any outer loop,
    // and unswitch the runtime checks if possible. Once hoisted, we may have
    // dead (or speculatable) control flows or more combining opportunities.
    OptimizePM.addPass(EarlyCSEPass());
    OptimizePM.addPass(CorrelatedValuePropagationPass());
    OptimizePM.addPass(InstCombinePass());
    LoopPassManager LPM;
    LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
    LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
                                       OptimizationLevel::O3));
    OptimizePM.addPass(
        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
    OptimizePM.addPass(createFunctionToLoopPassAdaptor(
        std::move(LPM), EnableMSSALoopDependency,
        /*UseBlockFrequencyInfo=*/true));
    OptimizePM.addPass(SimplifyCFGPass());
    OptimizePM.addPass(InstCombinePass());
  }

  // Now that we've formed fast to execute loop structures, we do further
  // optimizations. These are run afterward as they might block doing complex
  // analyses and transforms such as what are needed for loop vectorization.

  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
  // GVN, loop transforms, and others have already run, so it's now better to
  // convert to more optimized IR using more aggressive simplify CFG options.
  // The extra sinking transform can create larger basic blocks, so do this
  // before SLP vectorization.
  OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                         .forwardSwitchCondToPhi(true)
                                         .convertSwitchToLookupTable(true)
                                         .needCanonicalLoops(false)
                                         .hoistCommonInsts(true)
                                         .sinkCommonInsts(true)));

  // Optimize parallel scalar instruction chains into SIMD instructions.
  if (PTO.SLPVectorization) {
    OptimizePM.addPass(SLPVectorizerPass());
    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
      OptimizePM.addPass(EarlyCSEPass());
    }
  }

  // Enhance/cleanup vector code.
  OptimizePM.addPass(VectorCombinePass());
  OptimizePM.addPass(InstCombinePass());

  // Unroll small loops to hide loop backedge latency and saturate any parallel
  // execution resources of an out-of-order processor. We also then need to
  // clean up redundancies and loop invariant code.
  // FIXME: It would be really good to use a loop-integrated instruction
  // combiner for cleanup here so that the unrolling and LICM can be pipelined
  // across the loop nests.
  // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
    OptimizePM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
  }
  OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(
      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
      PTO.ForgetAllSCEVInLoopUnroll)));
  OptimizePM.addPass(WarnMissedTransformationsPass());
  OptimizePM.addPass(InstCombinePass());
  OptimizePM.addPass(
      RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
  OptimizePM.addPass(createFunctionToLoopPassAdaptor(
      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
@@ -1862,9 +1825,39 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
      std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));

  MainFPM.addPass(LoopDistributePass());

  MainFPM.addPass(LoopVectorizePass(
      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
  // The vectorizer may have significantly shortened a loop body; unroll again.
  MainFPM.addPass(LoopUnrollPass(LoopUnrollOptions(
      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
      PTO.ForgetAllSCEVInLoopUnroll)));

  addVectorPasses(Level, MainFPM, /* IsLTO */ true);

  MainFPM.addPass(WarnMissedTransformationsPass());
  MainFPM.addPass(InstCombinePass());
  MainFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)));
  MainFPM.addPass(SCCPPass());
  MainFPM.addPass(InstCombinePass());
  MainFPM.addPass(BDCEPass());

  // More scalar chains could be vectorized due to more alias information
  if (PTO.SLPVectorization) {
    MainFPM.addPass(SLPVectorizerPass());
    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
      MainFPM.addPass(EarlyCSEPass());
    }
  }

  MainFPM.addPass(VectorCombinePass()); // Clean up partial vectorization.

  // After vectorization, assume intrinsics may tell us more about pointer
  // alignments.
  MainFPM.addPass(AlignmentFromAssumptionsPass());

  // FIXME: Conditionally run LoadCombine here, after it's ported
  // (in case we still have this pass, given its questionable usefulness).

  MainFPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(MainFPM, Level);
  MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM)));
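
For orientation, the hunks above feed PassBuilder's default pipelines. A minimal sketch of driving them with the new pass manager, assuming an existing llvm::Module M (analysis-manager boilerplate as of this era of LLVM; the exact home of OptimizationLevel moved around this release):

    #include "llvm/Passes/PassBuilder.h"

    void runDefaultO2(llvm::Module &M) {
      llvm::PassBuilder PB;
      llvm::LoopAnalysisManager LAM;
      llvm::FunctionAnalysisManager FAM;
      llvm::CGSCCAnalysisManager CGAM;
      llvm::ModuleAnalysisManager MAM;
      // Register all analyses and cross-wire the proxies between the managers.
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
      // The default pipeline ends up in buildModuleOptimizationPipeline,
      // where the vector passes shown above are appended.
      llvm::ModulePassManager MPM =
          PB.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O2);
      MPM.run(M, MAM);
    }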

llvm/lib/Transforms/IPO/PassManagerBuilder.cpp

@@ -523,109 +523,6 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
    MPM.add(createControlHeightReductionLegacyPass());
}

/// FIXME: Should LTO cause any differences to this set of passes?
void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM,
                                         bool IsLTO) {
  PM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));

  if (IsLTO) {
    // The vectorizer may have significantly shortened a loop body; unroll
    // again. Unroll small loops to hide loop backedge latency and saturate any
    // parallel execution resources of an out-of-order processor. We also then
    // need to clean up redundancies and loop invariant code.
    // FIXME: It would be really good to use a loop-integrated instruction
    // combiner for cleanup here so that the unrolling and LICM can be pipelined
    // across the loop nests.
    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
    if (EnableUnrollAndJam && !DisableUnrollLoops)
      PM.add(createLoopUnrollAndJamPass(OptLevel));
    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
                                ForgetAllSCEVInLoopUnroll));
    PM.add(createWarnMissedTransformationsPass());
  }

  if (!IsLTO) {
    // Eliminate loads by forwarding stores from the previous iteration to loads
    // of the current iteration.
    PM.add(createLoopLoadEliminationPass());
  }
  // Cleanup after the loop optimization passes.
  PM.add(createInstructionCombiningPass());

  if (OptLevel > 1 && ExtraVectorizerPasses) {
    // At higher optimization levels, try to clean up any runtime overlap and
    // alignment checks inserted by the vectorizer. We want to track correlated
    // runtime checks for two inner loops in the same outer loop, fold any
    // common computations, hoist loop-invariant aspects out of any outer loop,
    // and unswitch the runtime checks if possible. Once hoisted, we may have
    // dead (or speculatable) control flows or more combining opportunities.
    PM.add(createEarlyCSEPass());
    PM.add(createCorrelatedValuePropagationPass());
    PM.add(createInstructionCombiningPass());
    PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
    PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
    PM.add(createCFGSimplificationPass());
    PM.add(createInstructionCombiningPass());
  }

  // Now that we've formed fast to execute loop structures, we do further
  // optimizations. These are run afterward as they might block doing complex
  // analyses and transforms such as what are needed for loop vectorization.

  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
  // GVN, loop transforms, and others have already run, so it's now better to
  // convert to more optimized IR using more aggressive simplify CFG options.
  // The extra sinking transform can create larger basic blocks, so do this
  // before SLP vectorization.
  PM.add(createCFGSimplificationPass(SimplifyCFGOptions()
                                         .forwardSwitchCondToPhi(true)
                                         .convertSwitchToLookupTable(true)
                                         .needCanonicalLoops(false)
                                         .hoistCommonInsts(true)
                                         .sinkCommonInsts(true)));

  if (IsLTO) {
    PM.add(createSCCPPass());                 // Propagate exposed constants
    PM.add(createInstructionCombiningPass()); // Clean up again
    PM.add(createBitTrackingDCEPass());
  }

  // Optimize parallel scalar instruction chains into SIMD instructions.
  if (SLPVectorize) {
    PM.add(createSLPVectorizerPass());
    if (OptLevel > 1 && ExtraVectorizerPasses)
      PM.add(createEarlyCSEPass());
  }

  // Enhance/cleanup vector code.
  PM.add(createVectorCombinePass());

  if (IsLTO) {
    // After vectorization, assume intrinsics may tell us more about pointer
    // alignments.
    PM.add(createAlignmentFromAssumptionsPass());
  }
  addExtensionsToPM(EP_Peephole, PM);
  PM.add(createInstructionCombiningPass());

  if (!IsLTO) {
    // The vectorizer may have significantly shortened a loop body; unroll
    // again. Unroll small loops to hide loop backedge latency and saturate any
    // parallel execution resources of an out-of-order processor. We also then
    // need to clean up redundancies and loop invariant code.
    // FIXME: It would be really good to use a loop-integrated instruction
    // combiner for cleanup here so that the unrolling and LICM can be pipelined
    // across the loop nests.
    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
    if (EnableUnrollAndJam && !DisableUnrollLoops)
      PM.add(createLoopUnrollAndJamPass(OptLevel));
    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
                                ForgetAllSCEVInLoopUnroll));
    if (!DisableUnrollLoops)
      PM.add(createInstructionCombiningPass());
  }
}
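
Because the commit message calls out an extension-point difference: addExtensionsToPM(EP_Peephole, PM) above runs callbacks registered through PassManagerBuilder::addExtension, so its position relative to InstCombine is observable to out-of-tree users. A minimal sketch of such a registration (the callback body is an arbitrary illustration, not part of this patch):

    static void addMyPeephole(const llvm::PassManagerBuilder &Builder,
                              llvm::legacy::PassManagerBase &PM) {
      // Invoked at every EP_Peephole point in the legacy pipelines.
      PM.add(llvm::createInstructionCombiningPass());
    }

    void registerMyPeephole(llvm::PassManagerBuilder &Builder) {
      Builder.addExtension(llvm::PassManagerBuilder::EP_Peephole, addMyPeephole);
    }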
void PassManagerBuilder::populateModulePassManager(
    legacy::PassManagerBase &MPM) {
  // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link
@@ -897,9 +794,74 @@ void PassManagerBuilder::populateModulePassManager(
  // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
  MPM.add(createLoopDistributePass());

  addVectorPasses(MPM, /* IsLTO */ false);

  MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));

  // Eliminate loads by forwarding stores from the previous iteration to loads
  // of the current iteration.
  MPM.add(createLoopLoadEliminationPass());

  // FIXME: Because of #pragma vectorize enable, the passes below are always
  // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
  // on -O1 and no #pragma is found). Would be good to have these two passes
  // as function calls, so that we can only pass them when the vectorizer
  // changed the code.
  MPM.add(createInstructionCombiningPass());
  if (OptLevel > 1 && ExtraVectorizerPasses) {
    // At higher optimization levels, try to clean up any runtime overlap and
    // alignment checks inserted by the vectorizer. We want to track correlated
    // runtime checks for two inner loops in the same outer loop, fold any
    // common computations, hoist loop-invariant aspects out of any outer loop,
    // and unswitch the runtime checks if possible. Once hoisted, we may have
    // dead (or speculatable) control flows or more combining opportunities.
    MPM.add(createEarlyCSEPass());
    MPM.add(createCorrelatedValuePropagationPass());
    MPM.add(createInstructionCombiningPass());
    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
    MPM.add(createCFGSimplificationPass());
    MPM.add(createInstructionCombiningPass());
  }

  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
  // GVN, loop transforms, and others have already run, so it's now better to
  // convert to more optimized IR using more aggressive simplify CFG options.
  // The extra sinking transform can create larger basic blocks, so do this
  // before SLP vectorization.
  MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
                                          .forwardSwitchCondToPhi(true)
                                          .convertSwitchToLookupTable(true)
                                          .needCanonicalLoops(false)
                                          .hoistCommonInsts(true)
                                          .sinkCommonInsts(true)));

  if (SLPVectorize) {
    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
    if (OptLevel > 1 && ExtraVectorizerPasses) {
      MPM.add(createEarlyCSEPass());
    }
  }

  // Enhance/cleanup vector code.
  MPM.add(createVectorCombinePass());

  addExtensionsToPM(EP_Peephole, MPM);
  MPM.add(createInstructionCombiningPass());

  if (EnableUnrollAndJam && !DisableUnrollLoops) {
    // Unroll and Jam. We do this before unroll but need to be in a separate
    // loop pass manager in order for the outer loop to be processed by
    // unroll and jam before the inner loop is unrolled.
    MPM.add(createLoopUnrollAndJamPass(OptLevel));
  }

  // Unroll small loops
  MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
                               ForgetAllSCEVInLoopUnroll));

  if (!DisableUnrollLoops) {
    // LoopUnroll may generate some redundancy to clean up.
    MPM.add(createInstructionCombiningPass());

    // Runtime unrolling will introduce a runtime check in the loop prologue.
    // If the unrolled loop is an inner loop, then the prologue will be inside
    // the outer loop. LICM pass can help to promote the runtime check out if the
@@ -1121,9 +1083,35 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
  PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
                                    ForgetAllSCEVInLoopUnroll));
  PM.add(createLoopDistributePass());

  PM.add(createLoopVectorizePass(true, !LoopVectorize));
  // The vectorizer may have significantly shortened a loop body; unroll again.
  PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
                              ForgetAllSCEVInLoopUnroll));

  addVectorPasses(PM, /* IsLTO */ true);

  PM.add(createWarnMissedTransformationsPass());

  // Now that we've optimized loops (in particular loop induction variables),
  // we may have exposed more scalar opportunities. Run parts of the scalar
  // optimizer again at this point.
  PM.add(createInstructionCombiningPass()); // Initial cleanup
  PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert
                                         .hoistCommonInsts(true)));
  PM.add(createSCCPPass());                 // Propagate exposed constants
  PM.add(createInstructionCombiningPass()); // Clean up again
  PM.add(createBitTrackingDCEPass());

  // More scalar chains could be vectorized due to more alias information
  if (SLPVectorize)
    PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.

  PM.add(createVectorCombinePass()); // Clean up partial vectorization.

  // After vectorization, assume intrinsics may tell us more about pointer
  // alignments.
  PM.add(createAlignmentFromAssumptionsPass());

  // Cleanup and simplify the code after the scalar optimizations.
  PM.add(createInstructionCombiningPass());
  addExtensionsToPM(EP_Peephole, PM);

  PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));
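
Finally, for context: a frontend drives the legacy pipelines rebuilt in this file roughly as below — a minimal sketch assuming an existing llvm::Module M (the fields are the real PassManagerBuilder knobs that gate the passes shown above):

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    void runLegacyO2(llvm::Module &M) {
      llvm::PassManagerBuilder Builder;
      Builder.OptLevel = 2;        // Selects the unroll/vectorize behavior above.
      Builder.SLPVectorize = true; // Gates the createSLPVectorizerPass() calls.
      llvm::legacy::PassManager PM;
      Builder.populateModulePassManager(PM); // Appends the passes in this hunk.
      PM.run(M);
    }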