//===- Construction of pass pipelines -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file provides the implementation of the PassBuilder based on our
/// static pass registry as well as related functionality. It also provides
/// helpers to aid in analyzing, debugging, and testing passes and pass
/// pipelines.
///
//===----------------------------------------------------------------------===//
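
// A minimal usage sketch (illustrative only, not part of this file's code):
// a client typically constructs a PassBuilder, registers the analysis
// managers, and then asks for one of the pipelines built below. Exact
// signatures may vary slightly between LLVM releases.
//
//   LoopAnalysisManager LAM;
//   FunctionAnalysisManager FAM;
//   CGSCCAnalysisManager CGAM;
//   ModuleAnalysisManager MAM;
//   PassBuilder PB;
//   PB.registerModuleAnalyses(MAM);
//   PB.registerCGSCCAnalyses(CGAM);
//   PB.registerFunctionAnalyses(FAM);
//   PB.registerLoopAnalyses(LAM);
//   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
//   ModulePassManager MPM =
//       PB.buildPerModuleDefaultPipeline(OptimizationLevel::O2);
//   MPM.run(M, MAM);
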
#include "llvm/Analysis/AliasAnalysis.h"
|
|
|
|
#include "llvm/Analysis/BasicAliasAnalysis.h"
|
|
|
|
#include "llvm/Analysis/CGSCCPassManager.h"
|
|
|
|
#include "llvm/Analysis/GlobalsModRef.h"
|
|
|
|
#include "llvm/Analysis/InlineAdvisor.h"
|
|
|
|
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
|
|
|
|
#include "llvm/Analysis/ProfileSummaryInfo.h"
|
|
|
|
#include "llvm/Analysis/ScopedNoAliasAA.h"
|
|
|
|
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
|
|
|
|
#include "llvm/IR/PassManager.h"
|
|
|
|
#include "llvm/Passes/OptimizationLevel.h"
|
|
|
|
#include "llvm/Passes/PassBuilder.h"
|
|
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
|
|
#include "llvm/Support/PGOOptions.h"
|
|
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
|
|
|
|
#include "llvm/Transforms/Coroutines/CoroCleanup.h"
|
|
|
|
#include "llvm/Transforms/Coroutines/CoroEarly.h"
|
|
|
|
#include "llvm/Transforms/Coroutines/CoroElide.h"
|
|
|
|
#include "llvm/Transforms/Coroutines/CoroSplit.h"
|
|
|
|
#include "llvm/Transforms/IPO/AlwaysInliner.h"
|
|
|
|
#include "llvm/Transforms/IPO/Annotation2Metadata.h"
|
|
|
|
#include "llvm/Transforms/IPO/ArgumentPromotion.h"
|
|
|
|
#include "llvm/Transforms/IPO/Attributor.h"
|
|
|
|
#include "llvm/Transforms/IPO/CalledValuePropagation.h"
|
|
|
|
#include "llvm/Transforms/IPO/ConstantMerge.h"
|
|
|
|
#include "llvm/Transforms/IPO/CrossDSOCFI.h"
|
|
|
|
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
|
|
|
|
#include "llvm/Transforms/IPO/ElimAvailExtern.h"
|
|
|
|
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
|
|
|
|
#include "llvm/Transforms/IPO/FunctionAttrs.h"
|
|
|
|
#include "llvm/Transforms/IPO/GlobalDCE.h"
|
|
|
|
#include "llvm/Transforms/IPO/GlobalOpt.h"
|
|
|
|
#include "llvm/Transforms/IPO/GlobalSplit.h"
|
|
|
|
#include "llvm/Transforms/IPO/HotColdSplitting.h"
|
|
|
|
#include "llvm/Transforms/IPO/IROutliner.h"
|
|
|
|
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
|
|
|
|
#include "llvm/Transforms/IPO/Inliner.h"
|
|
|
|
#include "llvm/Transforms/IPO/LowerTypeTests.h"
|
|
|
|
#include "llvm/Transforms/IPO/MergeFunctions.h"
|
2021-11-09 11:01:48 +08:00
|
|
|
#include "llvm/Transforms/IPO/ModuleInliner.h"
|
2021-09-15 07:44:29 +08:00
|
|
|
#include "llvm/Transforms/IPO/OpenMPOpt.h"
|
|
|
|
#include "llvm/Transforms/IPO/PartialInlining.h"
|
|
|
|
#include "llvm/Transforms/IPO/SCCP.h"
|
|
|
|
#include "llvm/Transforms/IPO/SampleProfile.h"
|
|
|
|
#include "llvm/Transforms/IPO/SampleProfileProbe.h"
|
|
|
|
#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
|
|
|
|
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
|
|
|
|
#include "llvm/Transforms/InstCombine/InstCombine.h"
|
|
|
|
#include "llvm/Transforms/Instrumentation/CGProfile.h"
|
|
|
|
#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
|
|
|
|
#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
|
|
|
|
#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
|
|
|
|
#include "llvm/Transforms/Instrumentation/MemProfiler.h"
|
|
|
|
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
|
|
|
|
#include "llvm/Transforms/Scalar/ADCE.h"
|
|
|
|
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
|
|
|
|
#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
|
|
|
|
#include "llvm/Transforms/Scalar/BDCE.h"
|
|
|
|
#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
|
|
|
|
#include "llvm/Transforms/Scalar/ConstraintElimination.h"
|
|
|
|
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
|
|
|
|
#include "llvm/Transforms/Scalar/DFAJumpThreading.h"
|
|
|
|
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
|
|
|
|
#include "llvm/Transforms/Scalar/DivRemPairs.h"
|
|
|
|
#include "llvm/Transforms/Scalar/EarlyCSE.h"
|
|
|
|
#include "llvm/Transforms/Scalar/Float2Int.h"
|
|
|
|
#include "llvm/Transforms/Scalar/GVN.h"
|
|
|
|
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
|
|
|
|
#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
|
|
|
|
#include "llvm/Transforms/Scalar/JumpThreading.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LICM.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopDeletion.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopDistribute.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopFlatten.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopInterchange.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopPassManager.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopRotation.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopSink.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
|
|
|
|
#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
|
|
|
|
#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
|
|
|
|
#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
|
|
|
|
#include "llvm/Transforms/Scalar/NewGVN.h"
|
|
|
|
#include "llvm/Transforms/Scalar/Reassociate.h"
|
|
|
|
#include "llvm/Transforms/Scalar/SCCP.h"
|
|
|
|
#include "llvm/Transforms/Scalar/SROA.h"
|
|
|
|
#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
|
|
|
|
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
|
|
|
|
#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
|
|
|
|
#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
|
|
|
|
#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
|
|
|
|
#include "llvm/Transforms/Utils/AddDiscriminators.h"
|
|
|
|
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
|
|
|
|
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
|
|
|
|
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
|
|
|
|
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
|
|
|
|
#include "llvm/Transforms/Utils/Mem2Reg.h"
|
|
|
|
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
|
|
|
|
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
|
|
|
|
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
|
|
|
|
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
|
|
|
|
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
|
|
|
|
#include "llvm/Transforms/Vectorize/VectorCombine.h"
|
|
|
|
|
|
|
|
using namespace llvm;

static cl::opt<InliningAdvisorMode> UseInlineAdvisor(
    "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
    cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
    cl::values(clEnumValN(InliningAdvisorMode::Default, "default",
                          "Heuristics-based inliner version."),
               clEnumValN(InliningAdvisorMode::Development, "development",
                          "Use development mode (runtime-loadable model)."),
               clEnumValN(InliningAdvisorMode::Release, "release",
                          "Use release mode (AOT-compiled model).")));

static cl::opt<bool> EnableSyntheticCounts(
    "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Run synthetic function entry count generation "
             "pass"));

/// Flag to enable inline deferral during PGO.
static cl::opt<bool>
    EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true),
                            cl::Hidden,
                            cl::desc("Enable inline deferral during PGO"));

static cl::opt<bool> EnableMemProfiler("enable-mem-prof", cl::init(false),
                                       cl::Hidden, cl::ZeroOrMore,
                                       cl::desc("Enable memory profiler"));

static cl::opt<bool> EnableModuleInliner("enable-module-inliner",
                                         cl::init(false), cl::Hidden,
                                         cl::desc("Enable module inliner"));

static cl::opt<bool> PerformMandatoryInliningsFirst(
    "mandatory-inlining-first", cl::init(true), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Perform mandatory inlinings module-wide, before performing "
             "inlining."));

static cl::opt<bool> EnableO3NonTrivialUnswitching(
    "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3"));

static cl::opt<bool> EnableEagerlyInvalidateAnalyses(
    "eagerly-invalidate-analyses", cl::init(true), cl::Hidden,
    cl::desc("Eagerly invalidate more analyses in default pipelines"));

static cl::opt<bool> EnableNoRerunSimplificationPipeline(
    "enable-no-rerun-simplification-pipeline", cl::init(false), cl::Hidden,
    cl::desc(
        "Prevent running the simplification pipeline on a function more "
        "than once in the case that SCC mutations cause a function to be "
        "visited multiple times as long as the function has not been changed"));

static cl::opt<bool> EnableMergeFunctions(
    "enable-merge-functions", cl::init(false), cl::Hidden,
    cl::desc("Enable function merging as part of the optimization pipeline"));

PipelineTuningOptions::PipelineTuningOptions() {
  LoopInterleaving = true;
  LoopVectorization = true;
  SLPVectorization = false;
  LoopUnrolling = true;
  ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
  LicmMssaOptCap = SetLicmMssaOptCap;
  LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
  CallGraphProfile = true;
  MergeFunctions = EnableMergeFunctions;
  EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses;
}
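
// Illustrative sketch (the PassBuilder constructor shown is an assumption
// about the release in use): an embedder can override these defaults by
// passing its own PipelineTuningOptions when constructing the PassBuilder.
//
//   PipelineTuningOptions PTO;
//   PTO.SLPVectorization = true; // opt in to SLP vectorization
//   PTO.LoopUnrolling = false;   // keep loops rolled
//   PassBuilder PB(TM, PTO);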

namespace llvm {

extern cl::opt<unsigned> MaxDevirtIterations;
extern cl::opt<bool> EnableConstraintElimination;
extern cl::opt<bool> EnableFunctionSpecialization;
extern cl::opt<bool> EnableGVNHoist;
extern cl::opt<bool> EnableGVNSink;
extern cl::opt<bool> EnableHotColdSplit;
extern cl::opt<bool> EnableIROutliner;
extern cl::opt<bool> EnableOrderFileInstrumentation;
extern cl::opt<bool> EnableCHR;
extern cl::opt<bool> EnableLoopInterchange;
extern cl::opt<bool> EnableUnrollAndJam;
extern cl::opt<bool> EnableLoopFlatten;
extern cl::opt<bool> EnableDFAJumpThreading;
extern cl::opt<bool> RunNewGVN;
extern cl::opt<bool> RunPartialInlining;
extern cl::opt<bool> ExtraVectorizerPasses;

extern cl::opt<bool> FlattenedProfileUsed;

extern cl::opt<AttributorRunOption> AttributorRun;
extern cl::opt<bool> EnableKnowledgeRetention;

extern cl::opt<bool> EnableMatrix;

extern cl::opt<bool> DisablePreInliner;
extern cl::opt<int> PreInlineThreshold;
} // namespace llvm

void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM,
                                            OptimizationLevel Level) {
  for (auto &C : PeepholeEPCallbacks)
    C(FPM, Level);
}
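
// Sketch of how this extension point gets populated (illustrative;
// MyPeepholePass is a placeholder, not a real pass): a frontend or plugin
// registers a callback on the PassBuilder, and the pipelines built in this
// file invoke it at the matching position.
//
//   PB.registerPeepholeEPCallback(
//       [](FunctionPassManager &FPM, OptimizationLevel Level) {
//         FPM.addPass(MyPeepholePass());
//       });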

// Helper to add AnnotationRemarksPass.
static void addAnnotationRemarksPass(ModulePassManager &MPM) {
  FunctionPassManager FPM;
  FPM.addPass(AnnotationRemarksPass());
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
}

// Helper to check if the current compilation phase is preparing for LTO.
static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
  return Phase == ThinOrFullLTOPhase::ThinLTOPreLink ||
         Phase == ThinOrFullLTOPhase::FullLTOPreLink;
}

// TODO: Investigate the cost/benefit of tail call elimination on debugging.
FunctionPassManager
PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
                                                   ThinOrFullLTOPhase Phase) {

  FunctionPassManager FPM;

  // Form SSA out of local memory accesses after breaking apart aggregates into
  // scalars.
  FPM.addPass(SROAPass());

  // Catch trivial redundancies.
  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));

  // Hoisting of scalars and load expressions.
  FPM.addPass(SimplifyCFGPass());
  FPM.addPass(InstCombinePass());

  FPM.addPass(LibCallsShrinkWrapPass());

  invokePeepholeEPCallbacks(FPM, Level);

  FPM.addPass(SimplifyCFGPass());

  // Form canonically associated expression trees, and simplify the trees using
  // basic mathematical properties. For example, this will form (nearly)
  // minimal multiplication trees.
  FPM.addPass(ReassociatePass());

  // Add the primary loop simplification pipeline.
  // FIXME: Currently this is split into two loop pass pipelines because we run
  // some function passes in between them. These can and should be removed
  // and/or replaced by scheduling the loop pass equivalents in the correct
  // positions. But those equivalent passes aren't powerful enough yet.
  // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
  // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough to
  // fully replace `SimplifyCFGPass`, and the closest to the other we have is
  // `LoopInstSimplify`.
  LoopPassManager LPM1, LPM2;

  // Simplify the loop body. We do this initially to clean up after other loop
  // passes run, either when iterating on a loop or on inner loops with
  // implications on the outer loop.
  LPM1.addPass(LoopInstSimplifyPass());
  LPM1.addPass(LoopSimplifyCFGPass());

  // Try to remove as much code from the loop header as possible,
  // to reduce the amount of IR that will have to be duplicated.
  // TODO: Investigate promotion cap for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));

  LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true,
                              isLTOPreLink(Phase)));
  // TODO: Investigate promotion cap for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
  LPM1.addPass(SimpleLoopUnswitchPass());
  if (EnableLoopFlatten)
    LPM1.addPass(LoopFlattenPass());

  LPM2.addPass(LoopIdiomRecognizePass());
  LPM2.addPass(IndVarSimplifyPass());

  for (auto &C : LateLoopOptimizationsEPCallbacks)
    C(LPM2, Level);

  LPM2.addPass(LoopDeletionPass());

  if (EnableLoopInterchange)
    LPM2.addPass(LoopInterchangePass());

  // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
  // because it changes IR in ways that make profile annotation in the back
  // compile inaccurate. The normal unroller doesn't pay attention to forced
  // full unroll attributes, so we need to make sure and allow the full unroll
  // pass to pay attention to it.
  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
      PGOOpt->Action != PGOOptions::SampleUse)
    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
                                    PTO.ForgetAllSCEVInLoopUnroll));

  for (auto &C : LoopOptimizerEndEPCallbacks)
    C(LPM2, Level);

  // We provide the opt remark emitter pass for LICM to use. We only need to do
  // this once as it is immutable.
  FPM.addPass(
      RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
                                              /*UseMemorySSA=*/true,
                                              /*UseBlockFrequencyInfo=*/true));
  FPM.addPass(SimplifyCFGPass());
  FPM.addPass(InstCombinePass());
  // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
  // *All* loop passes must preserve it, in order to be able to use it.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
                                              /*UseMemorySSA=*/false,
                                              /*UseBlockFrequencyInfo=*/false));

  // Delete small array after loop unroll.
  FPM.addPass(SROAPass());

  // Specially optimize memory movement as it doesn't look like dataflow in SSA.
  FPM.addPass(MemCpyOptPass());

  // Sparse conditional constant propagation.
  // FIXME: It isn't clear why we do this *after* loop passes rather than
  // before...
  FPM.addPass(SCCPPass());

  // Delete dead bit computations (instcombine runs after to fold away the dead
  // computations, and then ADCE will run later to exploit any new DCE
  // opportunities that creates).
  FPM.addPass(BDCEPass());

  // Run instcombine after redundancy and dead bit elimination to exploit
  // opportunities opened up by them.
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  FPM.addPass(CoroElidePass());

  for (auto &C : ScalarOptimizerLateEPCallbacks)
    C(FPM, Level);

  // Finally, do an expensive DCE pass to catch all the dead code exposed by
  // the simplifications and basic cleanup after all the simplifications.
  // TODO: Investigate if this is too expensive.
  FPM.addPass(ADCEPass());
  FPM.addPass(SimplifyCFGPass());
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  return FPM;
}

FunctionPassManager
PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
                                                 ThinOrFullLTOPhase Phase) {
  assert(Level != OptimizationLevel::O0 && "Must request optimizations!");

  // The O1 pipeline has a separate pipeline creation function to simplify
  // construction readability.
  if (Level.getSpeedupLevel() == 1)
    return buildO1FunctionSimplificationPipeline(Level, Phase);

  FunctionPassManager FPM;

  // Form SSA out of local memory accesses after breaking apart aggregates into
  // scalars.
  FPM.addPass(SROAPass());

  // Catch trivial redundancies.
  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
  if (EnableKnowledgeRetention)
    FPM.addPass(AssumeSimplifyPass());

  // Hoisting of scalars and load expressions.
  if (EnableGVNHoist)
    FPM.addPass(GVNHoistPass());

  // Global value numbering based sinking.
  if (EnableGVNSink) {
    FPM.addPass(GVNSinkPass());
    FPM.addPass(SimplifyCFGPass());
  }

  if (EnableConstraintElimination)
    FPM.addPass(ConstraintEliminationPass());

  // Speculative execution if the target has divergent branches; otherwise nop.
  FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true));

  // Optimize based on known information about branches, and cleanup afterward.
  FPM.addPass(JumpThreadingPass());
  FPM.addPass(CorrelatedValuePropagationPass());

  FPM.addPass(SimplifyCFGPass());
  FPM.addPass(InstCombinePass());
  if (Level == OptimizationLevel::O3)
    FPM.addPass(AggressiveInstCombinePass());

  if (!Level.isOptimizingForSize())
    FPM.addPass(LibCallsShrinkWrapPass());

  invokePeepholeEPCallbacks(FPM, Level);

  // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
  // using the size value profile. Don't perform this when optimizing for size.
  if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
      !Level.isOptimizingForSize())
    FPM.addPass(PGOMemOPSizeOpt());

  FPM.addPass(TailCallElimPass());
  FPM.addPass(SimplifyCFGPass());

  // Form canonically associated expression trees, and simplify the trees using
  // basic mathematical properties. For example, this will form (nearly)
  // minimal multiplication trees.
  FPM.addPass(ReassociatePass());

  // Add the primary loop simplification pipeline.
  // FIXME: Currently this is split into two loop pass pipelines because we run
  // some function passes in between them. These can and should be removed
  // and/or replaced by scheduling the loop pass equivalents in the correct
  // positions. But those equivalent passes aren't powerful enough yet.
  // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
  // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough to
  // fully replace `SimplifyCFGPass`, and the closest to the other we have is
  // `LoopInstSimplify`.
  LoopPassManager LPM1, LPM2;

  // Simplify the loop body. We do this initially to clean up after other loop
  // passes run, either when iterating on a loop or on inner loops with
  // implications on the outer loop.
  LPM1.addPass(LoopInstSimplifyPass());
  LPM1.addPass(LoopSimplifyCFGPass());

  // Try to remove as much code from the loop header as possible,
  // to reduce the amount of IR that will have to be duplicated.
  // TODO: Investigate promotion cap for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));

  // Disable header duplication in loop rotation at -Oz.
  LPM1.addPass(
      LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
  // TODO: Investigate promotion cap for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
  LPM1.addPass(
      SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 &&
                             EnableO3NonTrivialUnswitching));
  if (EnableLoopFlatten)
    LPM1.addPass(LoopFlattenPass());

  LPM2.addPass(LoopIdiomRecognizePass());
  LPM2.addPass(IndVarSimplifyPass());

  for (auto &C : LateLoopOptimizationsEPCallbacks)
    C(LPM2, Level);

  LPM2.addPass(LoopDeletionPass());

  if (EnableLoopInterchange)
    LPM2.addPass(LoopInterchangePass());

  // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
  // because it changes IR in ways that make profile annotation in the back
  // compile inaccurate. The normal unroller doesn't pay attention to forced
  // full unroll attributes, so we need to make sure and allow the full unroll
  // pass to pay attention to it.
  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
      PGOOpt->Action != PGOOptions::SampleUse)
    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
                                    PTO.ForgetAllSCEVInLoopUnroll));

  for (auto &C : LoopOptimizerEndEPCallbacks)
    C(LPM2, Level);

  // We provide the opt remark emitter pass for LICM to use. We only need to do
  // this once as it is immutable.
  FPM.addPass(
      RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
                                              /*UseMemorySSA=*/true,
                                              /*UseBlockFrequencyInfo=*/true));
  FPM.addPass(SimplifyCFGPass());
  FPM.addPass(InstCombinePass());
  // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
  // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
  // *All* loop passes must preserve it, in order to be able to use it.
  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
                                              /*UseMemorySSA=*/false,
                                              /*UseBlockFrequencyInfo=*/false));

  // Delete small array after loop unroll.
  FPM.addPass(SROAPass());

  // The matrix extension can introduce large vector operations early, which
  // can benefit from running vector-combine early on.
  if (EnableMatrix)
    FPM.addPass(VectorCombinePass(/*ScalarizationOnly=*/true));

  // Eliminate redundancies.
  FPM.addPass(MergedLoadStoreMotionPass());
  if (RunNewGVN)
    FPM.addPass(NewGVNPass());
  else
    FPM.addPass(GVNPass());

  // Sparse conditional constant propagation.
  // FIXME: It isn't clear why we do this *after* loop passes rather than
  // before...
  FPM.addPass(SCCPPass());

  // Delete dead bit computations (instcombine runs after to fold away the dead
  // computations, and then ADCE will run later to exploit any new DCE
  // opportunities that creates).
  FPM.addPass(BDCEPass());

  // Run instcombine after redundancy and dead bit elimination to exploit
  // opportunities opened up by them.
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  // Re-consider control flow based optimizations after redundancy elimination,
  // redo DCE, etc.
  if (EnableDFAJumpThreading && Level.getSizeLevel() == 0)
    FPM.addPass(DFAJumpThreadingPass());

  FPM.addPass(JumpThreadingPass());
  FPM.addPass(CorrelatedValuePropagationPass());

  // Finally, do an expensive DCE pass to catch all the dead code exposed by
  // the simplifications and basic cleanup after all the simplifications.
  // TODO: Investigate if this is too expensive.
  FPM.addPass(ADCEPass());

  // Specially optimize memory movement as it doesn't look like dataflow in SSA.
  FPM.addPass(MemCpyOptPass());

  FPM.addPass(DSEPass());
  FPM.addPass(createFunctionToLoopPassAdaptor(
      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
      /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true));

  FPM.addPass(CoroElidePass());

  for (auto &C : ScalarOptimizerLateEPCallbacks)
    C(FPM, Level);

  FPM.addPass(SimplifyCFGPass(
      SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true)));
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt &&
      (PGOOpt->Action == PGOOptions::IRUse ||
       PGOOpt->Action == PGOOptions::SampleUse))
    FPM.addPass(ControlHeightReductionPass());

  return FPM;
}

void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) {
  MPM.addPass(CanonicalizeAliasesPass());
  MPM.addPass(NameAnonGlobalPass());
}

void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
                                    OptimizationLevel Level, bool RunProfileGen,
                                    bool IsCS, std::string ProfileFile,
                                    std::string ProfileRemappingFile) {
  assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
  if (!IsCS && !DisablePreInliner) {
    InlineParams IP;

    IP.DefaultThreshold = PreInlineThreshold;

    // FIXME: The hint threshold has the same value used by the regular inliner
    // when not optimizing for size. This should probably be lowered after
    // performance testing.
    // FIXME: this comment is cargo culted from the old pass manager, revisit.
    IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325;
    ModuleInlinerWrapperPass MIWP(IP);
    CGSCCPassManager &CGPipeline = MIWP.getPM();

    FunctionPassManager FPM;
    FPM.addPass(SROAPass());
    FPM.addPass(EarlyCSEPass());    // Catch trivial redundancies.
    FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks.
    FPM.addPass(InstCombinePass()); // Combine silly sequences.
    invokePeepholeEPCallbacks(FPM, Level);

    CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
        std::move(FPM), PTO.EagerlyInvalidateAnalyses));

    MPM.addPass(std::move(MIWP));

    // Delete anything that is now dead to make sure that we don't instrument
    // dead code. Instrumentation can end up keeping dead code around and
    // dramatically increase code size.
    MPM.addPass(GlobalDCEPass());
  }

  if (!RunProfileGen) {
    assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
    MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS));
    // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
    // RequireAnalysisPass for PSI before subsequent non-module passes.
    MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
    return;
  }

  // Perform PGO instrumentation.
  MPM.addPass(PGOInstrumentationGen(IsCS));

  FunctionPassManager FPM;
  // Disable header duplication in loop rotation at -Oz.
  FPM.addPass(createFunctionToLoopPassAdaptor(
      LoopRotatePass(Level != OptimizationLevel::Oz), /*UseMemorySSA=*/false,
      /*UseBlockFrequencyInfo=*/false));
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
                                                PTO.EagerlyInvalidateAnalyses));

  // Add the profile lowering pass.
  InstrProfOptions Options;
  if (!ProfileFile.empty())
    Options.InstrProfileOutput = ProfileFile;
  // Do counter promotion at Level greater than O0.
  Options.DoCounterPromotion = true;
  Options.UseBFIInPromotion = IsCS;
  MPM.addPass(InstrProfiling(Options, IsCS));
}

void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM,
                                         bool RunProfileGen, bool IsCS,
                                         std::string ProfileFile,
                                         std::string ProfileRemappingFile) {
  if (!RunProfileGen) {
    assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
    MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS));
    // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
    // RequireAnalysisPass for PSI before subsequent non-module passes.
    MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
    return;
  }

  // Perform PGO instrumentation.
  MPM.addPass(PGOInstrumentationGen(IsCS));
  // Add the profile lowering pass.
  InstrProfOptions Options;
  if (!ProfileFile.empty())
    Options.InstrProfileOutput = ProfileFile;
  // Do not do counter promotion at O0.
  Options.DoCounterPromotion = false;
  Options.UseBFIInPromotion = IsCS;
  MPM.addPass(InstrProfiling(Options, IsCS));
}
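
// Illustrative sketch of how a driver feeds profile information into these
// helpers (assumption: the PGOOptions constructor argument order shown here
// matches the release in use; "code.profraw" is a placeholder path). The
// driver builds a PGOOptions value and hands it to the PassBuilder
// constructor, which is what populates the PGOOpt member consulted above.
//
//   Optional<PGOOptions> P =
//       PGOOptions("code.profraw", /*CSProfileGenFile=*/"",
//                  /*ProfileRemappingFile=*/"", PGOOptions::IRInstr);
//   PassBuilder PB(TM, PipelineTuningOptions(), P);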

static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) {
  return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel());
}

ModuleInlinerWrapperPass
PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
                                  ThinOrFullLTOPhase Phase) {
  InlineParams IP = getInlineParamsFromOptLevel(Level);
  if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
      PGOOpt->Action == PGOOptions::SampleUse)
    IP.HotCallSiteThreshold = 0;

  if (PGOOpt)
    IP.EnableDeferral = EnablePGOInlineDeferral;

  ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst,
                                UseInlineAdvisor, MaxDevirtIterations);

  // Require the GlobalsAA analysis for the module so we can query it within
  // the CGSCC pipeline.
  MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>());
  // Invalidate AAManager so it can be recreated and pick up the newly
  // available GlobalsAA.
  MIWP.addModulePass(
      createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));

  // Require the ProfileSummaryAnalysis for the module so we can query it
  // within the inliner pass.
  MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());

  // Now begin the main postorder CGSCC pipeline.
  // FIXME: The current CGSCC pipeline has its origins in the legacy pass
  // manager and tries to emulate its precise behavior. Much of this doesn't
  // make a lot of sense and we should revisit the core CGSCC structure.
  CGSCCPassManager &MainCGPipeline = MIWP.getPM();

  // Note: historically, the PruneEH pass was run first to deduce nounwind and
  // generally clean up exception handling overhead. It isn't clear this is
  // valuable as the inliner doesn't currently care whether it is inlining an
  // invoke or a call.

  if (AttributorRun & AttributorRunOption::CGSCC)
    MainCGPipeline.addPass(AttributorCGSCCPass());

  // Now deduce any function attributes based on the current code.
  MainCGPipeline.addPass(PostOrderFunctionAttrsPass());

  // When at O3 add argument promotion to the pass pipeline.
  // FIXME: It isn't at all clear why this should be limited to O3.
  if (Level == OptimizationLevel::O3)
    MainCGPipeline.addPass(ArgumentPromotionPass());

  // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
  // there are no OpenMP runtime calls present in the module.
  if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3)
    MainCGPipeline.addPass(OpenMPOptCGSCCPass());

  for (auto &C : CGSCCOptimizerLateEPCallbacks)
    C(MainCGPipeline, Level);

  // Lastly, add the core function simplification pipeline nested inside the
  // CGSCC walk.
  MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
      buildFunctionSimplificationPipeline(Level, Phase),
      PTO.EagerlyInvalidateAnalyses, EnableNoRerunSimplificationPipeline));

  MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));

  if (EnableNoRerunSimplificationPipeline)
    MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
        InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>()));

  return MIWP;
}
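
// For example (using the flags defined earlier in this file), the inline
// advisor can be switched from the command line; the release advisor assumes
// an LLVM build with the AOT-compiled model linked in:
//
//   opt -passes='default<Oz>' -enable-ml-inliner=release in.bc -o out.bc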

ModulePassManager
PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
                                        ThinOrFullLTOPhase Phase) {
  ModulePassManager MPM;

  InlineParams IP = getInlineParamsFromOptLevel(Level);
  if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
      PGOOpt->Action == PGOOptions::SampleUse)
    IP.HotCallSiteThreshold = 0;

  if (PGOOpt)
    IP.EnableDeferral = EnablePGOInlineDeferral;

  // The inline deferral logic is used to avoid losing inlining opportunities
  // in the future. It is helpful in the SCC inliner, where inlining is
  // processed in bottom-up order. In the module inliner the inlining order is
  // priority-based by default, so inline deferral is unnecessary and we
  // disable it here.
  IP.EnableDeferral = false;

  MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor));

  MPM.addPass(createModuleToFunctionPassAdaptor(
      buildFunctionSimplificationPipeline(Level, Phase),
      PTO.EagerlyInvalidateAnalyses));

  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
      CoroSplitPass(Level != OptimizationLevel::O0)));

  return MPM;
}

ModulePassManager
PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
                                               ThinOrFullLTOPhase Phase) {
  ModulePassManager MPM;

  // Place pseudo probe instrumentation as the first pass of the pipeline to
  // minimize the impact of optimization changes.
  if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
      Phase != ThinOrFullLTOPhase::ThinLTOPostLink)
    MPM.addPass(SampleProfileProbePass(TM));

  bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse);

  // In ThinLTO mode, when a flattened profile is used, all the available
  // profile information will be annotated in the PreLink phase so there is
  // no need to load the profile again in PostLink.
  bool LoadSampleProfile =
      HasSampleProfile &&
      !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink);

  // During the ThinLTO backend phase we perform early indirect call promotion
  // here, before globalopt. Otherwise imported available_externally functions
  // look unreferenced and are removed. If we are going to load the sample
  // profile then defer until later.
  // TODO: See if we can move later and consolidate with the location where
  // we perform ICP when we are loading a sample profile.
  // TODO: We pass HasSampleProfile (whether there was a sample profile file
  // passed to the compile) to the SamplePGO flag of ICP. This is used to
  // determine whether the new direct calls are annotated with prof metadata.
  // Ideally this should be determined from whether the IR is annotated with
  // sample profile, and not whether a sample profile was provided on the
  // command line. E.g. for flattened profiles where we will not be reloading
  // the sample profile in the ThinLTO backend, we ideally shouldn't have to
  // provide the sample profile file.
  if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile)
    MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile));

  // Do basic inference of function attributes from known properties of system
  // libraries and other oracles.
  MPM.addPass(InferFunctionAttrsPass());

  // Create an early function pass manager to cleanup the output of the
  // frontend.
  FunctionPassManager EarlyFPM;
  // Lower llvm.expect to metadata before attempting transforms.
  // Compare/branch metadata may alter the behavior of passes like SimplifyCFG.
  EarlyFPM.addPass(LowerExpectIntrinsicPass());
  EarlyFPM.addPass(SimplifyCFGPass());
  EarlyFPM.addPass(SROAPass());
  EarlyFPM.addPass(EarlyCSEPass());
  EarlyFPM.addPass(CoroEarlyPass());
  if (Level == OptimizationLevel::O3)
    EarlyFPM.addPass(CallSiteSplittingPass());

  // In SamplePGO ThinLTO backend, we need instcombine before profile
  // annotation to convert bitcasts to direct calls so that they can be inlined
  // during the profile annotation preparation step.
  // More details about SamplePGO design can be found in:
  // https://research.google.com/pubs/pub45290.html
  // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured.
  if (LoadSampleProfile)
    EarlyFPM.addPass(InstCombinePass());
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM),
                                                PTO.EagerlyInvalidateAnalyses));

  if (LoadSampleProfile) {
    // Annotate sample profile right after early FPM to ensure freshness of
    // the debug info.
    MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
                                        PGOOpt->ProfileRemappingFile, Phase));
    // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
    // RequireAnalysisPass for PSI before subsequent non-module passes.
    MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
    // Do not invoke ICP in the LTOPrelink phase as it makes it hard
    // for the profile annotation to be accurate in the LTO backend.
    if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink &&
        Phase != ThinOrFullLTOPhase::FullLTOPreLink)
      // We perform early indirect call promotion here, before globalopt.
      // This is important for the ThinLTO backend phase because otherwise
      // imported available_externally functions look unreferenced and are
      // removed.
      MPM.addPass(
          PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
  }

  // Try to perform OpenMP specific optimizations on the module. This is a
  // (quick!) no-op if there are no OpenMP runtime calls present in the module.
  if (Level != OptimizationLevel::O0)
    MPM.addPass(OpenMPOptPass());

  if (AttributorRun & AttributorRunOption::MODULE)
    MPM.addPass(AttributorPass());

  // Lower type metadata and the type.test intrinsic in the ThinLTO
  // post link pipeline after ICP. This is to enable usage of the type
  // tests in ICP sequences.
  if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink)
    MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));

  for (auto &C : PipelineEarlySimplificationEPCallbacks)
    C(MPM, Level);

  // Specialize functions with IPSCCP.
  if (EnableFunctionSpecialization && Level == OptimizationLevel::O3)
    MPM.addPass(FunctionSpecializationPass());

  // Interprocedural constant propagation now that basic cleanup has occurred
  // and prior to optimizing globals.
  // FIXME: This position in the pipeline hasn't been carefully considered in
  // years, it should be re-analyzed.
  MPM.addPass(IPSCCPPass());

  // Attach metadata to indirect call sites indicating the set of functions
  // they may target at run-time. This should follow IPSCCP.
  MPM.addPass(CalledValuePropagationPass());

  // Optimize globals to try and fold them into constants.
  MPM.addPass(GlobalOptPass());

  // Promote any localized globals to SSA registers.
  // FIXME: Should this instead be a run of SROA?
  // FIXME: We should probably run instcombine and simplifycfg afterward to
  // delete control flows that are dead once globals have been folded to
  // constants.
  MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));

  // Remove any dead arguments exposed by cleanups and constant folding of
  // globals.
  MPM.addPass(DeadArgumentEliminationPass());

  // Create a small function pass pipeline to cleanup after all the global
  // optimizations.
  FunctionPassManager GlobalCleanupPM;
  GlobalCleanupPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(GlobalCleanupPM, Level);

  GlobalCleanupPM.addPass(SimplifyCFGPass());
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM),
                                                PTO.EagerlyInvalidateAnalyses));

  // Add all the requested passes for instrumentation PGO, if requested.
  if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink &&
      (PGOOpt->Action == PGOOptions::IRInstr ||
       PGOOpt->Action == PGOOptions::IRUse)) {
    addPGOInstrPasses(MPM, Level,
                      /* RunProfileGen */ PGOOpt->Action == PGOOptions::IRInstr,
                      /* IsCS */ false, PGOOpt->ProfileFile,
                      PGOOpt->ProfileRemappingFile);
    MPM.addPass(PGOIndirectCallPromotion(false, false));
  }
  if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink &&
      PGOOpt->CSAction == PGOOptions::CSIRInstr)
    MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile));

  // Synthesize function entry counts for non-PGO compilation.
  if (EnableSyntheticCounts && !PGOOpt)
    MPM.addPass(SyntheticCountsPropagation());

  if (EnableModuleInliner)
    MPM.addPass(buildModuleInlinerPipeline(Level, Phase));
  else
    MPM.addPass(buildInlinerPipeline(Level, Phase));

  if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) {
    MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass()));
    MPM.addPass(ModuleMemProfilerPass());
  }

  return MPM;
}

/// TODO: Should LTO cause any differences to this set of passes?
void PassBuilder::addVectorPasses(OptimizationLevel Level,
                                  FunctionPassManager &FPM, bool IsFullLTO) {
  FPM.addPass(LoopVectorizePass(
      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));

  if (IsFullLTO) {
    // The vectorizer may have significantly shortened a loop body; unroll
    // again. Unroll small loops to hide loop backedge latency and saturate any
    // parallel execution resources of an out-of-order processor. We also then
    // need to clean up redundancies and loop invariant code.
    // FIXME: It would be really good to use a loop-integrated instruction
    // combiner for cleanup here so that the unrolling and LICM can be pipelined
    // across the loop nests.
    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll.
    if (EnableUnrollAndJam && PTO.LoopUnrolling)
      FPM.addPass(createFunctionToLoopPassAdaptor(
          LoopUnrollAndJamPass(Level.getSpeedupLevel())));
    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
        PTO.ForgetAllSCEVInLoopUnroll)));
    FPM.addPass(WarnMissedTransformationsPass());
  }

  if (!IsFullLTO) {
    // Eliminate loads by forwarding stores from the previous iteration to
    // loads of the current iteration.
    FPM.addPass(LoopLoadEliminationPass());
  }
  // Cleanup after the loop optimization passes.
  FPM.addPass(InstCombinePass());

  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
    ExtraVectorPassManager ExtraPasses;
    // At higher optimization levels, try to clean up any runtime overlap and
    // alignment checks inserted by the vectorizer. We want to track correlated
    // runtime checks for two inner loops in the same outer loop, fold any
    // common computations, hoist loop-invariant aspects out of any outer loop,
    // and unswitch the runtime checks if possible. Once hoisted, we may have
    // dead (or speculatable) control flows or more combining opportunities.
    ExtraPasses.addPass(EarlyCSEPass());
    ExtraPasses.addPass(CorrelatedValuePropagationPass());
    ExtraPasses.addPass(InstCombinePass());
    LoopPassManager LPM;
    LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
    LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
                                       OptimizationLevel::O3));
    ExtraPasses.addPass(
        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
    ExtraPasses.addPass(
        createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true,
                                        /*UseBlockFrequencyInfo=*/true));
    ExtraPasses.addPass(SimplifyCFGPass());
    ExtraPasses.addPass(InstCombinePass());
    FPM.addPass(std::move(ExtraPasses));
  }

  // Now that we've formed fast to execute loop structures, we do further
  // optimizations. These are run afterward as they might block doing complex
  // analyses and transforms such as what are needed for loop vectorization.

  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
  // GVN, loop transforms, and others have already run, so it's now better to
  // convert to more optimized IR using more aggressive simplify CFG options.
  // The extra sinking transform can create larger basic blocks, so do this
  // before SLP vectorization.
  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                  .forwardSwitchCondToPhi(true)
                                  .convertSwitchToLookupTable(true)
                                  .needCanonicalLoops(false)
                                  .hoistCommonInsts(true)
                                  .sinkCommonInsts(true)));

  if (IsFullLTO) {
    FPM.addPass(SCCPPass());
    FPM.addPass(InstCombinePass());
    FPM.addPass(BDCEPass());
  }

  // Optimize parallel scalar instruction chains into SIMD instructions.
  if (PTO.SLPVectorization) {
    FPM.addPass(SLPVectorizerPass());
    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
      FPM.addPass(EarlyCSEPass());
    }
  }
  // Enhance/cleanup vector code.
  FPM.addPass(VectorCombinePass());

  if (!IsFullLTO) {
    FPM.addPass(InstCombinePass());
    // Unroll small loops to hide loop backedge latency and saturate any
    // parallel execution resources of an out-of-order processor. We also then
    // need to clean up redundancies and loop invariant code.
    // FIXME: It would be really good to use a loop-integrated instruction
    // combiner for cleanup here so that the unrolling and LICM can be pipelined
    // across the loop nests.
    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll.
    if (EnableUnrollAndJam && PTO.LoopUnrolling) {
      FPM.addPass(createFunctionToLoopPassAdaptor(
          LoopUnrollAndJamPass(Level.getSpeedupLevel())));
    }
    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
        PTO.ForgetAllSCEVInLoopUnroll)));
    FPM.addPass(WarnMissedTransformationsPass());
    FPM.addPass(InstCombinePass());
    FPM.addPass(
        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
    FPM.addPass(createFunctionToLoopPassAdaptor(
        LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
        /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true));
  }

  // Now that we've vectorized and unrolled loops, we may have more refined
  // alignment information, try to re-derive it here.
  FPM.addPass(AlignmentFromAssumptionsPass());

  if (IsFullLTO)
    FPM.addPass(InstCombinePass());
}
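
// A quick way to inspect what the builders above actually scheduled
// (assuming a reasonably recent `opt` driving the new pass manager) is to
// print the textual pipeline, which can be fed back to -passes:
//
//   opt -passes='default<O2>' -print-pipeline-passes -disable-output in.ll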
|
|
|
|
|
|
|
|
ModulePassManager
|
|
|
|
PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
|
|
|
|
bool LTOPreLink) {
|
|
|
|
ModulePassManager MPM;
|
|
|
|
|
|
|
|
// Optimize globals now that the module is fully simplified.
|
|
|
|
MPM.addPass(GlobalOptPass());
|
|
|
|
MPM.addPass(GlobalDCEPass());
|
|
|
|
|
|
|
|
// Run partial inlining pass to partially inline functions that have
|
|
|
|
// large bodies.
|
|
|
|
if (RunPartialInlining)
|
|
|
|
MPM.addPass(PartialInlinerPass());
|
|
|
|
|
|
|
|
// Remove avail extern fns and globals definitions since we aren't compiling
|
|
|
|
// an object file for later LTO. For LTO we want to preserve these so they
|
|
|
|
// are eligible for inlining at link-time. Note if they are unreferenced they
|
|
|
|
// will be removed by GlobalDCE later, so this only impacts referenced
|
|
|
|
// available externally globals. Eventually they will be suppressed during
|
|
|
|
// codegen, but eliminating here enables more opportunity for GlobalDCE as it
|
|
|
|
// may make globals referenced by available external functions dead and saves
|
|
|
|
// running remaining passes on the eliminated functions. These should be
|
|
|
|
// preserved during prelinking for link-time inlining decisions.
|
|
|
|
if (!LTOPreLink)
|
|
|
|
MPM.addPass(EliminateAvailableExternallyPass());
|
|
|
|
|
|
|
|
if (EnableOrderFileInstrumentation)
|
|
|
|
MPM.addPass(InstrOrderFilePass());
|
|
|
|
|
|
|
|
// Do RPO function attribute inference across the module to forward-propagate
|
|
|
|
// attributes where applicable.
|
|
|
|
// FIXME: Is this really an optimization rather than a canonicalization?
|
|
|
|
MPM.addPass(ReversePostOrderFunctionAttrsPass());
|
|
|
|
|
|
|
|
// Do a post inline PGO instrumentation and use pass. This is a context
|
|
|
|
// sensitive PGO pass. We don't want to do this in the LTOPreLink phase as
|
|
|
|
// cross-module inline has not been done yet. The context sensitive
|
|
|
|
// instrumentation is after all the inlines are done.
|
|
|
|
if (!LTOPreLink && PGOOpt) {
|
|
|
|
if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
|
|
|
|
addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true,
|
|
|
|
/* IsCS */ true, PGOOpt->CSProfileGenFile,
|
|
|
|
PGOOpt->ProfileRemappingFile);
|
|
|
|
else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
|
|
|
|
addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false,
|
|
|
|
/* IsCS */ true, PGOOpt->ProfileFile,
|
|
|
|
PGOOpt->ProfileRemappingFile);
|
|
|
|
}
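// (For orientation: with clang, this context-sensitive stage is typically
// driven by -fcs-profile-generate and a subsequent -fprofile-use=<file>;
// the driver spellings are assumed here and not defined by this file.)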
|
|
|
|
|
|
|
|
// Re-require GlobalsAA here prior to function passes. This is particularly
|
|
|
|
// useful as the above will have inlined, DCE'ed, and function-attr
|
|
|
|
// propagated everything. We should at this point have a reasonably minimal
|
|
|
|
// and richly annotated call graph. By computing aliasing and mod/ref
|
|
|
|
// information for all local globals here, the late loop passes and notably
|
|
|
|
// the vectorizer will be able to use them to help recognize vectorizable
|
|
|
|
// memory operations.
|
|
|
|
MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
|
|
|
|
|
|
|
|
FunctionPassManager OptimizePM;
|
|
|
|
OptimizePM.addPass(Float2IntPass());
|
|
|
|
OptimizePM.addPass(LowerConstantIntrinsicsPass());
|
|
|
|
|
|
|
|
if (EnableMatrix) {
|
|
|
|
OptimizePM.addPass(LowerMatrixIntrinsicsPass());
|
|
|
|
OptimizePM.addPass(EarlyCSEPass());
|
|
|
|
}
|
|
|
|
|
|
|
|
// FIXME: We need to run some loop optimizations to re-rotate loops after
|
|
|
|
// simplifycfg and others undo their rotation.
|
|
|
|
|
|
|
|
// Optimize the loop execution. These passes operate on entire loop nests
|
|
|
|
// rather than on each loop in an inside-out manner, and so they are actually
|
|
|
|
// function passes.
|
|
|
|
|
|
|
|
for (auto &C : VectorizerStartEPCallbacks)
|
|
|
|
C(OptimizePM, Level);
|
|
|
|
|
[PassManager] `buildModuleOptimizationPipeline()`: schedule `LoopDeletion` pass run before vectorization passes
Test thanks to Michael Kuklinski from `#llvm`: https://godbolt.org/z/bdrah5Goo
originally inspired by Daniel Lemire's https://lemire.me/blog/2021/10/26/in-c-is-empty-faster-than-comparing-the-size-with-zero/
We manage to deduce that the answer does not require looping,
but we do that after the last `LoopDeletion` pass run,
so we end up being stuck with a dead loop.
Now, as with all things SCEV, this has
a very expected ~`+0.12%` compile time performance regression:
https://llvm-compile-time-tracker.com/compare.php?from=0ae7bf124a9bca76dd9a91b2f7379168ff13f562&to=c2ae57c9b961aeb4a28c747266949340613a6d84&stat=instructions
(for comparison, doing that in function simplification pipeline
would have been ~`+0.5%` compile time performance regression, D112840)
Looking at the transformation stats over vanilla test-suite, i think it's rather expected:
```
| statistic name | baseline | proposed | Δ | % | |%| |
|--------------------------------------------------|----------:|----------:|------:|-------:|-------:|
| scalar-evolution.NumBruteForceTripCountsComputed | 789 | 888 | 99 | 12.55% | 12.55% |
| scalar-evolution.NumTripCountsNotComputed | 105592 | 117900 | 12308 | 11.66% | 11.66% |
| loop-delete.NumBackedgesBroken | 542 | 559 | 17 | 3.14% | 3.14% |
| regalloc.numExtends | 81 | 79 | -2 | -2.47% | 2.47% |
| indvars.NumFoldedUser | 408 | 400 | -8 | -1.96% | 1.96% |
| indvars.NumElimCmp | 3831 | 3758 | -73 | -1.91% | 1.91% |
| scalar-evolution.NumTripCountsComputed | 299759 | 304278 | 4519 | 1.51% | 1.51% |
| loop-delete.NumDeleted | 8055 | 8128 | 73 | 0.91% | 0.91% |
| machine-cse.NumCommutes | 111 | 110 | -1 | -0.90% | 0.90% |
| globaldce.NumFunctions | 1187 | 1192 | 5 | 0.42% | 0.42% |
| codegenprepare.NumSelectsExpanded | 277 | 278 | 1 | 0.36% | 0.36% |
| loop-unroll.NumRuntimeUnrolled | 13841 | 13791 | -50 | -0.36% | 0.36% |
| machinelicm.NumPostRAHoisted | 1168 | 1172 | 4 | 0.34% | 0.34% |
| phi-node-elimination.NumCriticalEdgesSplit | 83054 | 82879 | -175 | -0.21% | 0.21% |
| machine-cse.NumPREs | 3085 | 3079 | -6 | -0.19% | 0.19% |
| branch-folder.NumBranchOpts | 108122 | 107942 | -180 | -0.17% | 0.17% |
| loop-unroll.NumUnrolled | 40136 | 40067 | -69 | -0.17% | 0.17% |
| branch-folder.NumDeadBlocks | 130818 | 130607 | -211 | -0.16% | 0.16% |
| codegenprepare.NumBlocksElim | 92856 | 92714 | -142 | -0.15% | 0.15% |
| instsimplify.NumSimplified | 103263 | 103129 | -134 | -0.13% | 0.13% |
| instcombine.NumConstProp | 26070 | 26102 | 32 | 0.12% | 0.12% |
| instsimplify.NumExpand | 1716 | 1718 | 2 | 0.12% | 0.12% |
| loop-unroll.NumCompletelyUnrolled | 9236 | 9225 | -11 | -0.12% | 0.12% |
| branch-folder.NumHoist | 2773 | 2770 | -3 | -0.11% | 0.11% |
| regalloc.NumReloadsRemoved | 10822 | 10834 | 12 | 0.11% | 0.11% |
| regalloc.NumSnippets | 11394 | 11406 | 12 | 0.11% | 0.11% |
| machine-cse.NumCrossBBCSEs | 1052 | 1053 | 1 | 0.10% | 0.10% |
| machinelicm.NumCSEed | 99887 | 99784 | -103 | -0.10% | 0.10% |
| branch-folder.NumTailMerge | 72501 | 72435 | -66 | -0.09% | 0.09% |
| codegenprepare.NumExtUses | 22007 | 21987 | -20 | -0.09% | 0.09% |
| local.NumRemoved | 68232 | 68294 | 62 | 0.09% | 0.09% |
| loop-vectorize.LoopsAnalyzed | 75483 | 75413 | -70 | -0.09% | 0.09% |
```
Note that i'm only changing current PM, and not touching obsolete PM.
This is an alternative to the function simplification pipeline variant
of the same change, D112840. It has both less compile time impact
(since the additional number of SCEV trip count calculations
is way less than with D112840), and it is
much more powerful/impactful (almost 2x more loops deleted).
I have checked, and doing this after loop rotation
is favorable (more loops deleted).
Reviewed By: mkazantsev
Differential Revision: https://reviews.llvm.org/D112851
2021-11-04 00:23:25 +08:00
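// Illustrative sketch of the kind of case the above change targets (assumed
// shape; the actual reproducer is not restated here): a loop whose result is
// deduced without executing it, leaving a dead loop behind unless
// LoopDeletion runs again this late.
//   bool nonEmpty(const int *B, const int *E) {
//     unsigned N = 0;
//     for (const int *P = B; P != E; ++P) // Trip count later folds away;
//       ++N;                              // the loop body becomes dead.
//     return N != 0;                      // Answer deduced without looping.
//   }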
|
|
|
LoopPassManager LPM;
|
2021-09-15 07:44:29 +08:00
|
|
|
// First rotate loops that may have been un-rotated by prior passes.
|
|
|
|
// Disable header duplication at -Oz.
|
2021-11-04 00:23:25 +08:00
|
|
|
LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink));
|
|
|
|
// Some loops may have become dead by now. Try to delete them.
|
2021-12-07 05:04:57 +08:00
|
|
|
// FIXME: see discussion in https://reviews.llvm.org/D112851,
|
|
|
|
// this may need to be revisited once we run GVN before loop deletion
|
|
|
|
// in the simplification pipeline.
|
2021-11-04 00:23:25 +08:00
|
|
|
LPM.addPass(LoopDeletionPass());
|
2021-09-15 07:44:29 +08:00
|
|
|
OptimizePM.addPass(createFunctionToLoopPassAdaptor(
|
2021-11-04 00:23:25 +08:00
|
|
|
std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
|
2021-09-15 07:44:29 +08:00
|
|
|
|
|
|
|
// Distribute loops to allow partial vectorization. I.e. isolate dependences
|
|
|
|
// into a separate loop that would otherwise inhibit vectorization. This is
|
|
|
|
// currently only performed for loops marked with the metadata
|
|
|
|
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
|
|
|
|
OptimizePM.addPass(LoopDistributePass());
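// An illustrative source-level opt-in (clang pragma spelling assumed, shown
// for orientation only): the vectorizable statement is split off from the
// loop-carried one so the former can be vectorized.
//   #pragma clang loop distribute(enable)
//   for (int I = 1; I != N; ++I) {
//     A[I] = B[I] + C[I];   // Vectorizable part ...
//     D[I] = D[I - 1] * E;  // ... isolated from the loop-carried part.
//   }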
|
|
|
|
|
|
|
|
// Populates the VFABI attribute with the scalar-to-vector mappings
|
|
|
|
// from the TargetLibraryInfo.
|
|
|
|
OptimizePM.addPass(InjectTLIMappings());
|
|
|
|
|
|
|
|
addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
|
|
|
|
|
|
|
|
// LoopSink pass sinks instructions hoisted by LICM, which serves as a
|
|
|
|
// canonicalization pass that enables other optimizations. As a result,
|
|
|
|
// LoopSink pass needs to be a very late IR pass to avoid undoing LICM
|
|
|
|
// results too early.
|
|
|
|
OptimizePM.addPass(LoopSinkPass());
|
|
|
|
|
|
|
|
// And finally clean up LCSSA form before generating code.
|
|
|
|
OptimizePM.addPass(InstSimplifyPass());
|
|
|
|
|
|
|
|
// This hoists/decomposes div/rem ops. It should run after other sink/hoist
|
|
|
|
// passes to avoid re-sinking, but before SimplifyCFG because it can allow
|
|
|
|
// flattening of blocks.
|
|
|
|
OptimizePM.addPass(DivRemPairsPass());
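// Illustrative shape this pass targets (assumed example): a matching
// division and remainder over the same operands,
//   Q = A / B;
//   R = A % B;  // May be rewritten as R = A - Q * B, or the pair hoisted,
//               // depending on what the target considers cheaper.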
|
|
|
|
|
|
|
|
// LoopSink (and other loop passes since the last simplifyCFG) might have
|
|
|
|
// resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
|
|
|
|
OptimizePM.addPass(SimplifyCFGPass());
|
|
|
|
|
|
|
|
OptimizePM.addPass(CoroCleanupPass());
|
|
|
|
|
|
|
|
// Add the core optimizing pipeline.
|
2021-11-04 06:45:30 +08:00
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM),
|
|
|
|
PTO.EagerlyInvalidateAnalyses));
|
2021-09-15 07:44:29 +08:00
|
|
|
|
|
|
|
for (auto &C : OptimizerLastEPCallbacks)
|
|
|
|
C(MPM, Level);
|
|
|
|
|
2021-12-04 20:14:15 +08:00
|
|
|
// Split out cold code. Splitting is done late to avoid hiding context from
|
|
|
|
// other optimizations and inadvertently regressing performance. The tradeoff
|
|
|
|
// is that this has a higher code size cost than splitting early.
|
|
|
|
if (EnableHotColdSplit && !LTOPreLink)
|
|
|
|
MPM.addPass(HotColdSplittingPass());
|
|
|
|
|
|
|
|
// Search the code for similar regions of code. If enough similar regions can
|
|
|
|
// be found where extracting the regions into their own function will decrease
|
|
|
|
// the size of the program, we extract the regions and deduplicate the
|
|
|
|
// structurally similar regions.
|
|
|
|
if (EnableIROutliner)
|
|
|
|
MPM.addPass(IROutlinerPass());
|
|
|
|
|
|
|
|
// Merge functions if requested.
|
|
|
|
if (PTO.MergeFunctions)
|
|
|
|
MPM.addPass(MergeFunctionsPass());
|
|
|
|
|
2021-09-15 07:44:29 +08:00
|
|
|
if (PTO.CallGraphProfile)
|
|
|
|
MPM.addPass(CGProfilePass());
|
|
|
|
|
|
|
|
// Now we need to do some global optimization transforms.
|
|
|
|
// FIXME: It would seem like these should come first in the optimization
|
|
|
|
// pipeline and maybe be the bottom of the canonicalization pipeline? Weird
|
|
|
|
// ordering here.
|
|
|
|
MPM.addPass(GlobalDCEPass());
|
|
|
|
MPM.addPass(ConstantMergePass());
|
|
|
|
|
|
|
|
// TODO: The relative lookup table converter pass caused an issue when full LTO
|
|
|
|
// enabled. See https://reviews.llvm.org/D94355 for more details.
|
|
|
|
// Until the issue is fixed, disable this pass during the pre-linking phase.
|
|
|
|
if (!LTOPreLink)
|
|
|
|
MPM.addPass(RelLookupTableConverterPass());
|
|
|
|
|
|
|
|
return MPM;
|
|
|
|
}
|
|
|
|
|
|
|
|
ModulePassManager
|
|
|
|
PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
|
|
|
|
bool LTOPreLink) {
|
|
|
|
assert(Level != OptimizationLevel::O0 &&
|
|
|
|
"Must request optimizations for the default pipeline!");
|
|
|
|
|
|
|
|
ModulePassManager MPM;
|
|
|
|
|
|
|
|
// Convert @llvm.global.annotations to !annotation metadata.
|
|
|
|
MPM.addPass(Annotation2MetadataPass());
|
|
|
|
|
|
|
|
// Force any function attributes we want the rest of the pipeline to observe.
|
|
|
|
MPM.addPass(ForceFunctionAttrsPass());
|
|
|
|
|
|
|
|
// Apply module pipeline start EP callback.
|
|
|
|
for (auto &C : PipelineStartEPCallbacks)
|
|
|
|
C(MPM, Level);
|
|
|
|
|
|
|
|
if (PGOOpt && PGOOpt->DebugInfoForProfiling)
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
|
|
|
|
|
|
|
|
// Add the core simplification pipeline.
|
|
|
|
MPM.addPass(buildModuleSimplificationPipeline(
|
|
|
|
Level, LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink
|
|
|
|
: ThinOrFullLTOPhase::None));
|
|
|
|
|
|
|
|
// Now add the optimization pipeline.
|
|
|
|
MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPreLink));
|
|
|
|
|
2021-10-15 02:37:44 +08:00
|
|
|
if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
|
|
|
|
PGOOpt->Action == PGOOptions::SampleUse)
|
2021-09-15 07:44:29 +08:00
|
|
|
MPM.addPass(PseudoProbeUpdatePass());
|
|
|
|
|
|
|
|
// Emit annotation remarks.
|
|
|
|
addAnnotationRemarksPass(MPM);
|
|
|
|
|
|
|
|
if (LTOPreLink)
|
|
|
|
addRequiredLTOPreLinkPasses(MPM);
|
|
|
|
|
|
|
|
return MPM;
|
|
|
|
}
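// A minimal sketch of how a driver typically consumes this pipeline, given a
// Module &M and a TargetMachine *TM (the PassBuilder API calls are real, but
// the surrounding setup is illustrative and not part of this file):
//   LoopAnalysisManager LAM;
//   FunctionAnalysisManager FAM;
//   CGSCCAnalysisManager CGAM;
//   ModuleAnalysisManager MAM;
//   PassBuilder PB(TM);
//   PB.registerModuleAnalyses(MAM);
//   PB.registerCGSCCAnalyses(CGAM);
//   PB.registerFunctionAnalyses(FAM);
//   PB.registerLoopAnalyses(LAM);
//   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
//   ModulePassManager MPM =
//       PB.buildPerModuleDefaultPipeline(OptimizationLevel::O2);
//   MPM.run(M, MAM);
// The same pipeline is reachable textually via -passes='default<O2>'.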
|
|
|
|
|
|
|
|
ModulePassManager
|
|
|
|
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
|
|
|
|
assert(Level != OptimizationLevel::O0 &&
|
|
|
|
"Must request optimizations for the default pipeline!");
|
|
|
|
|
|
|
|
ModulePassManager MPM;
|
|
|
|
|
|
|
|
// Convert @llvm.global.annotations to !annotation metadata.
|
|
|
|
MPM.addPass(Annotation2MetadataPass());
|
|
|
|
|
|
|
|
// Force any function attributes we want the rest of the pipeline to observe.
|
|
|
|
MPM.addPass(ForceFunctionAttrsPass());
|
|
|
|
|
|
|
|
if (PGOOpt && PGOOpt->DebugInfoForProfiling)
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
|
|
|
|
|
|
|
|
// Apply module pipeline start EP callback.
|
|
|
|
for (auto &C : PipelineStartEPCallbacks)
|
|
|
|
C(MPM, Level);
|
|
|
|
|
|
|
|
// If we are planning to perform ThinLTO later, we don't bloat the code with
|
|
|
|
// unrolling/vectorization/... now. Just simplify the module as much as we
|
|
|
|
// can.
|
|
|
|
MPM.addPass(buildModuleSimplificationPipeline(
|
|
|
|
Level, ThinOrFullLTOPhase::ThinLTOPreLink));
|
|
|
|
|
|
|
|
// Run partial inlining pass to partially inline functions that have
|
|
|
|
// large bodies.
|
|
|
|
// FIXME: It isn't clear whether this is really the right place to run this
|
|
|
|
// in ThinLTO. Because there is another canonicalization and simplification
|
|
|
|
// phase that will run after the thin link, running this here ends up with
|
|
|
|
// less information than will be available later and it may grow functions in
|
|
|
|
// ways that aren't beneficial.
|
|
|
|
if (RunPartialInlining)
|
|
|
|
MPM.addPass(PartialInlinerPass());
|
|
|
|
|
|
|
|
// Reduce the size of the IR as much as possible.
|
|
|
|
MPM.addPass(GlobalOptPass());
|
|
|
|
|
|
|
|
// Module simplification splits coroutines, but does not fully clean up
|
|
|
|
// coroutine intrinsics. To ensure ThinLTO optimization passes don't trip up
|
|
|
|
// on these, we schedule the cleanup here.
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
|
|
|
|
|
2021-10-15 02:37:44 +08:00
|
|
|
if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
|
|
|
|
PGOOpt->Action == PGOOptions::SampleUse)
|
2021-09-15 07:44:29 +08:00
|
|
|
MPM.addPass(PseudoProbeUpdatePass());
|
|
|
|
|
|
|
|
// Handle OptimizerLastEPCallbacks added by clang on PreLink. Actual
|
|
|
|
// optimization is going to be done in PostLink stage, but clang can't
|
|
|
|
// add callbacks there in the case of in-process ThinLTO invoked by the linker.
|
|
|
|
for (auto &C : OptimizerLastEPCallbacks)
|
|
|
|
C(MPM, Level);
|
|
|
|
|
|
|
|
// Emit annotation remarks.
|
|
|
|
addAnnotationRemarksPass(MPM);
|
|
|
|
|
|
|
|
addRequiredLTOPreLinkPasses(MPM);
|
|
|
|
|
|
|
|
return MPM;
|
|
|
|
}
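// (For testing, this pre-link pipeline is reachable through the textual alias
//  -passes='thinlto-pre-link<Ox>'; the post-link counterpart below is
//  -passes='thinlto<Ox>'. Alias spellings assumed from the pipeline parser.)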
|
|
|
|
|
|
|
|
ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
|
|
|
|
OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
|
|
|
|
ModulePassManager MPM;
|
|
|
|
|
|
|
|
// Convert @llvm.global.annotations to !annotation metadata.
|
|
|
|
MPM.addPass(Annotation2MetadataPass());
|
|
|
|
|
|
|
|
if (ImportSummary) {
|
|
|
|
// These passes import type identifier resolutions for whole-program
|
|
|
|
// devirtualization and CFI. They must run early because other passes may
|
|
|
|
// disturb the specific instruction patterns that these passes look for,
|
|
|
|
// creating dependencies on resolutions that may not appear in the summary.
|
|
|
|
//
|
|
|
|
// For example, GVN may transform the pattern assume(type.test) appearing in
|
|
|
|
// two basic blocks into assume(phi(type.test, type.test)), which would
|
|
|
|
// transform a dependency on a WPD resolution into a dependency on a type
|
|
|
|
// identifier resolution for CFI.
|
|
|
|
//
|
|
|
|
// Also, WPD has access to more precise information than ICP and can
|
|
|
|
// devirtualize more effectively, so it should operate on the IR first.
|
|
|
|
//
|
|
|
|
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
|
|
|
|
// metadata and intrinsics.
|
|
|
|
MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary));
|
|
|
|
MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
|
|
|
|
}
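// Illustrative C++ shape that whole-program devirtualization resolves once
// the class hierarchy is known to be closed (assumed example; it requires
// whole-program visibility of the vtables):
//   struct Base { virtual int f(); };
//   struct Only final : Base { int f() override { return 1; } };
//   int call(Base *B) { return B->f(); } // Becomes a direct call to Only::f.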
|
|
|
|
|
|
|
|
if (Level == OptimizationLevel::O0) {
|
|
|
|
// Run a second time to clean up any type tests left behind by WPD for use
|
|
|
|
// in ICP.
|
|
|
|
MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
|
|
|
|
// Drop available_externally and unreferenced globals. This is necessary
|
|
|
|
// with ThinLTO in order to avoid leaving undefined references to dead
|
|
|
|
// globals in the object file.
|
|
|
|
MPM.addPass(EliminateAvailableExternallyPass());
|
|
|
|
MPM.addPass(GlobalDCEPass());
|
|
|
|
return MPM;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Force any function attributes we want the rest of the pipeline to observe.
|
|
|
|
MPM.addPass(ForceFunctionAttrsPass());
|
|
|
|
|
|
|
|
// Add the core simplification pipeline.
|
|
|
|
MPM.addPass(buildModuleSimplificationPipeline(
|
|
|
|
Level, ThinOrFullLTOPhase::ThinLTOPostLink));
|
|
|
|
|
|
|
|
// Now add the optimization pipeline.
|
|
|
|
MPM.addPass(buildModuleOptimizationPipeline(Level));
|
|
|
|
|
|
|
|
// Emit annotation remarks.
|
|
|
|
addAnnotationRemarksPass(MPM);
|
|
|
|
|
|
|
|
return MPM;
|
|
|
|
}
|
|
|
|
|
|
|
|
ModulePassManager
|
|
|
|
PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
|
|
|
|
assert(Level != OptimizationLevel::O0 &&
|
|
|
|
"Must request optimizations for the default pipeline!");
|
|
|
|
// FIXME: We should use a customized pre-link pipeline!
|
|
|
|
return buildPerModuleDefaultPipeline(Level,
|
|
|
|
/* LTOPreLink */ true);
|
|
|
|
}
|
|
|
|
|
|
|
|
ModulePassManager
|
|
|
|
PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
|
|
|
|
ModuleSummaryIndex *ExportSummary) {
|
|
|
|
ModulePassManager MPM;
|
|
|
|
|
|
|
|
// Convert @llvm.global.annotations to !annotation metadata.
|
|
|
|
MPM.addPass(Annotation2MetadataPass());
|
|
|
|
|
|
|
|
// Create a function that performs CFI checks for cross-DSO calls with targets
|
|
|
|
// in the current module.
|
|
|
|
MPM.addPass(CrossDSOCFIPass());
|
|
|
|
|
|
|
|
if (Level == OptimizationLevel::O0) {
|
|
|
|
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
|
|
|
|
// metadata and intrinsics.
|
|
|
|
MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
|
|
|
|
MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
|
|
|
|
// Run a second time to clean up any type tests left behind by WPD for use
|
|
|
|
// in ICP.
|
|
|
|
MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
|
|
|
|
|
|
|
|
// Emit annotation remarks.
|
|
|
|
addAnnotationRemarksPass(MPM);
|
|
|
|
|
|
|
|
return MPM;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) {
|
|
|
|
// Load sample profile before running the LTO optimization pipeline.
|
|
|
|
MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
|
|
|
|
PGOOpt->ProfileRemappingFile,
|
|
|
|
ThinOrFullLTOPhase::FullLTOPostLink));
|
|
|
|
// Cache ProfileSummaryAnalysis once to avoid the potential need to insert
|
|
|
|
// RequireAnalysisPass for PSI before subsequent non-module passes.
|
|
|
|
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove unused virtual tables to improve the quality of code generated by
|
|
|
|
// whole-program devirtualization and bitset lowering.
|
|
|
|
MPM.addPass(GlobalDCEPass());
|
|
|
|
|
|
|
|
// Force any function attributes we want the rest of the pipeline to observe.
|
|
|
|
MPM.addPass(ForceFunctionAttrsPass());
|
|
|
|
|
|
|
|
// Do basic inference of function attributes from known properties of system
|
|
|
|
// libraries and other oracles.
|
|
|
|
MPM.addPass(InferFunctionAttrsPass());
|
|
|
|
|
|
|
|
if (Level.getSpeedupLevel() > 1) {
|
|
|
|
FunctionPassManager EarlyFPM;
|
|
|
|
EarlyFPM.addPass(CallSiteSplittingPass());
|
2021-11-04 06:45:30 +08:00
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(
|
|
|
|
std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses));
|
2021-09-15 07:44:29 +08:00
|
|
|
|
|
|
|
// Indirect call promotion. This should promote all the targets that are
|
|
|
|
// left by the earlier promotion pass that promotes intra-module targets.
|
|
|
|
// This two-step promotion is intended to save compile time. For LTO, it should
|
|
|
|
// produce the same result as if we only do promotion here.
|
|
|
|
MPM.addPass(PGOIndirectCallPromotion(
|
|
|
|
true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
|
|
|
|
|
2021-11-04 18:36:19 +08:00
|
|
|
if (EnableFunctionSpecialization && Level == OptimizationLevel::O3)
|
2021-09-15 07:44:29 +08:00
|
|
|
MPM.addPass(FunctionSpecializationPass());
|
|
|
|
// Propagate constants at call sites into the functions they call. This
|
|
|
|
// opens opportunities for globalopt (and inlining) by substituting function
|
|
|
|
// pointers passed as arguments to direct uses of functions.
|
|
|
|
MPM.addPass(IPSCCPPass());
|
|
|
|
|
|
|
|
// Attach metadata to indirect call sites indicating the set of functions
|
|
|
|
// they may target at run-time. This should follow IPSCCP.
|
|
|
|
MPM.addPass(CalledValuePropagationPass());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now deduce any function attributes based on the current code.
|
|
|
|
MPM.addPass(
|
|
|
|
createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));
|
|
|
|
|
|
|
|
// Do RPO function attribute inference across the module to forward-propagate
|
|
|
|
// attributes where applicable.
|
|
|
|
// FIXME: Is this really an optimization rather than a canonicalization?
|
|
|
|
MPM.addPass(ReversePostOrderFunctionAttrsPass());
|
|
|
|
|
|
|
|
// Use in-range annotations on GEP indices to split globals where beneficial.
|
|
|
|
MPM.addPass(GlobalSplitPass());
|
|
|
|
|
|
|
|
// Run whole program optimization of virtual call when the list of callees
|
|
|
|
// is fixed.
|
|
|
|
MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
|
|
|
|
|
|
|
|
// Stop here at -O1.
|
|
|
|
if (Level == OptimizationLevel::O1) {
|
|
|
|
// The LowerTypeTestsPass needs to run to lower type metadata and the
|
|
|
|
// type.test intrinsics. The pass does nothing if CFI is disabled.
|
|
|
|
MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
|
|
|
|
// Run a second time to clean up any type tests left behind by WPD for use
|
|
|
|
// in ICP (which is performed earlier than this in the regular LTO
|
|
|
|
// pipeline).
|
|
|
|
MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
|
|
|
|
|
|
|
|
// Emit annotation remarks.
|
|
|
|
addAnnotationRemarksPass(MPM);
|
|
|
|
|
|
|
|
return MPM;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Optimize globals to try and fold them into constants.
|
|
|
|
MPM.addPass(GlobalOptPass());
|
|
|
|
|
|
|
|
// Promote any localized globals to SSA registers.
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
|
|
|
|
|
|
|
|
// Linking modules together can lead to duplicated global constants; only
|
|
|
|
// keep one copy of each constant.
|
|
|
|
MPM.addPass(ConstantMergePass());
|
|
|
|
|
|
|
|
// Remove unused arguments from functions.
|
|
|
|
MPM.addPass(DeadArgumentEliminationPass());
|
|
|
|
|
|
|
|
// Reduce the code after globalopt and ipsccp. Both can open up significant
|
|
|
|
// simplification opportunities, and both can propagate functions through
|
|
|
|
// function pointers. When this happens, we often have to resolve varargs
|
|
|
|
// calls, etc, so let instcombine do this.
|
|
|
|
FunctionPassManager PeepholeFPM;
|
2021-11-01 21:48:52 +08:00
|
|
|
PeepholeFPM.addPass(InstCombinePass());
|
2021-09-15 07:44:29 +08:00
|
|
|
if (Level == OptimizationLevel::O3)
|
|
|
|
PeepholeFPM.addPass(AggressiveInstCombinePass());
|
|
|
|
invokePeepholeEPCallbacks(PeepholeFPM, Level);
|
|
|
|
|
2021-11-04 06:45:30 +08:00
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
|
|
|
|
PTO.EagerlyInvalidateAnalyses));
|
2021-09-15 07:44:29 +08:00
|
|
|
|
|
|
|
// Note: historically, the PruneEH pass was run first to deduce nounwind and
|
|
|
|
// generally clean up exception handling overhead. It isn't clear this is
|
|
|
|
// valuable as the inliner doesn't currently care whether it is inlining an
|
|
|
|
// invoke or a call.
|
|
|
|
// Run the inliner now.
|
|
|
|
MPM.addPass(ModuleInlinerWrapperPass(getInlineParamsFromOptLevel(Level)));
|
|
|
|
|
|
|
|
// Optimize globals again after we ran the inliner.
|
|
|
|
MPM.addPass(GlobalOptPass());
|
|
|
|
|
|
|
|
// Garbage collect dead functions.
|
|
|
|
MPM.addPass(GlobalDCEPass());
|
|
|
|
|
|
|
|
// If we didn't decide to inline a function, check to see if we can
|
|
|
|
// transform it to pass arguments by value instead of by reference.
|
|
|
|
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass()));
|
|
|
|
|
|
|
|
FunctionPassManager FPM;
|
|
|
|
// The IPO Passes may leave cruft around. Clean up after them.
|
|
|
|
FPM.addPass(InstCombinePass());
|
|
|
|
invokePeepholeEPCallbacks(FPM, Level);
|
|
|
|
|
|
|
|
FPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
|
|
|
|
|
|
|
|
// Do a post inline PGO instrumentation and use pass. This is a context
|
|
|
|
// sensitive PGO pass.
|
|
|
|
if (PGOOpt) {
|
|
|
|
if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
|
|
|
|
addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true,
|
|
|
|
/* IsCS */ true, PGOOpt->CSProfileGenFile,
|
|
|
|
PGOOpt->ProfileRemappingFile);
|
|
|
|
else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
|
|
|
|
addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false,
|
|
|
|
/* IsCS */ true, PGOOpt->ProfileFile,
|
|
|
|
PGOOpt->ProfileRemappingFile);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Break up allocas
|
2021-11-01 23:56:48 +08:00
|
|
|
FPM.addPass(SROAPass());
|
2021-09-15 07:44:29 +08:00
|
|
|
|
|
|
|
// LTO provides additional opportunities for tailcall elimination due to
|
|
|
|
// link-time inlining and visibility of the nocapture attribute.
|
|
|
|
FPM.addPass(TailCallElimPass());
|
|
|
|
|
|
|
|
// Run a few AA driver optimizations here and now to cleanup the code.
|
2021-11-04 06:45:30 +08:00
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
|
|
|
|
PTO.EagerlyInvalidateAnalyses));
|
2021-09-15 07:44:29 +08:00
|
|
|
|
|
|
|
MPM.addPass(
|
|
|
|
createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));
|
|
|
|
|
|
|
|
// Require the GlobalsAA analysis for the module so we can query it within
|
|
|
|
// MainFPM.
|
|
|
|
MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
|
|
|
|
// Invalidate AAManager so it can be recreated and pick up the newly available
|
|
|
|
// GlobalsAA.
|
|
|
|
MPM.addPass(
|
|
|
|
createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
|
|
|
|
|
|
|
|
FunctionPassManager MainFPM;
|
|
|
|
MainFPM.addPass(createFunctionToLoopPassAdaptor(
|
|
|
|
LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
|
|
|
|
/*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true));
|
|
|
|
|
|
|
|
if (RunNewGVN)
|
|
|
|
MainFPM.addPass(NewGVNPass());
|
|
|
|
else
|
2021-11-01 23:56:48 +08:00
|
|
|
MainFPM.addPass(GVNPass());
|
2021-09-15 07:44:29 +08:00
|
|
|
|
|
|
|
// Remove dead memcpy()'s.
|
|
|
|
MainFPM.addPass(MemCpyOptPass());
|
|
|
|
|
|
|
|
// Nuke dead stores.
|
|
|
|
MainFPM.addPass(DSEPass());
|
|
|
|
MainFPM.addPass(MergedLoadStoreMotionPass());
|
|
|
|
|
|
|
|
|
|
|
|
if (EnableConstraintElimination)
|
|
|
|
MainFPM.addPass(ConstraintEliminationPass());
|
|
|
|
|
|
|
|
LoopPassManager LPM;
|
2022-01-19 22:06:51 +08:00
|
|
|
if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
|
|
|
|
LPM.addPass(LoopFlattenPass());
|
2022-01-19 22:09:59 +08:00
|
|
|
LPM.addPass(IndVarSimplifyPass());
|
|
|
|
LPM.addPass(LoopDeletionPass());
|
2021-09-15 07:44:29 +08:00
|
|
|
// FIXME: Add loop interchange.
|
|
|
|
|
|
|
|
// Unroll small loops and perform peeling.
|
|
|
|
LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
|
|
|
|
/* OnlyWhenForced= */ !PTO.LoopUnrolling,
|
|
|
|
PTO.ForgetAllSCEVInLoopUnroll));
|
|
|
|
// The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
|
|
|
|
// *All* loop passes must preserve it, in order to be able to use it.
|
|
|
|
MainFPM.addPass(createFunctionToLoopPassAdaptor(
|
|
|
|
std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
|
|
|
|
|
|
|
|
MainFPM.addPass(LoopDistributePass());
|
|
|
|
|
|
|
|
addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);
|
|
|
|
|
|
|
|
invokePeepholeEPCallbacks(MainFPM, Level);
|
|
|
|
MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
|
2021-11-04 06:45:30 +08:00
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM),
|
|
|
|
PTO.EagerlyInvalidateAnalyses));
|
2021-09-15 07:44:29 +08:00
|
|
|
|
|
|
|
// Lower type metadata and the type.test intrinsic. This pass supports
|
|
|
|
// clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
|
|
|
|
// to be run at link time if CFI is enabled. This pass does nothing if
|
|
|
|
// CFI is disabled.
|
|
|
|
MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
|
|
|
|
// Run a second time to clean up any type tests left behind by WPD for use
|
|
|
|
// in ICP (which is performed earlier than this in the regular LTO pipeline).
|
|
|
|
MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
|
|
|
|
|
|
|
|
// Enable splitting late in the FullLTO post-link pipeline. This is done in
|
|
|
|
// the same stage in the old pass manager (\ref addLateLTOOptimizationPasses).
|
|
|
|
if (EnableHotColdSplit)
|
|
|
|
MPM.addPass(HotColdSplittingPass());
|
|
|
|
|
|
|
|
// Add late LTO optimization passes.
|
|
|
|
// Delete basic blocks, which optimization passes may have killed.
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(
|
|
|
|
SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true))));
|
|
|
|
|
|
|
|
// Drop bodies of available externally objects to improve GlobalDCE.
|
|
|
|
MPM.addPass(EliminateAvailableExternallyPass());
|
|
|
|
|
|
|
|
// Now that we have optimized the program, discard unreachable functions.
|
|
|
|
MPM.addPass(GlobalDCEPass());
|
|
|
|
|
|
|
|
if (PTO.MergeFunctions)
|
|
|
|
MPM.addPass(MergeFunctionsPass());
|
|
|
|
|
|
|
|
// Emit annotation remarks.
|
|
|
|
addAnnotationRemarksPass(MPM);
|
|
|
|
|
|
|
|
return MPM;
|
|
|
|
}
|
|
|
|
|
|
|
|
ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
|
|
|
|
bool LTOPreLink) {
|
|
|
|
assert(Level == OptimizationLevel::O0 &&
|
|
|
|
"buildO0DefaultPipeline should only be used with O0");
|
|
|
|
|
|
|
|
ModulePassManager MPM;
|
|
|
|
|
|
|
|
// Perform pseudo probe instrumentation in O0 mode. This is for the
|
|
|
|
// consistency between different build modes. For example, an LTO build can be
|
|
|
|
// mixed with an O0 prelink and an O2 postlink. Loading a sample profile in
|
|
|
|
// the postlink will require pseudo probe instrumentation in the prelink.
|
|
|
|
if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
|
|
|
|
MPM.addPass(SampleProfileProbePass(TM));
|
|
|
|
|
|
|
|
if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr ||
|
|
|
|
PGOOpt->Action == PGOOptions::IRUse))
|
|
|
|
addPGOInstrPassesForO0(
|
|
|
|
MPM,
|
|
|
|
/* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr),
|
|
|
|
/* IsCS */ false, PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile);
|
|
|
|
|
|
|
|
for (auto &C : PipelineStartEPCallbacks)
|
|
|
|
C(MPM, Level);
|
|
|
|
|
|
|
|
if (PGOOpt && PGOOpt->DebugInfoForProfiling)
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));
|
|
|
|
|
|
|
|
for (auto &C : PipelineEarlySimplificationEPCallbacks)
|
|
|
|
C(MPM, Level);
|
|
|
|
|
|
|
|
// Build a minimal pipeline based on the semantics required by LLVM,
|
|
|
|
// which is just that always inlining occurs. Further, disable generating
|
|
|
|
// lifetime intrinsics to avoid enabling further optimizations during
|
|
|
|
// code generation.
|
|
|
|
MPM.addPass(AlwaysInlinerPass(
|
|
|
|
/*InsertLifetimeIntrinsics=*/false));
|
|
|
|
|
|
|
|
if (PTO.MergeFunctions)
|
|
|
|
MPM.addPass(MergeFunctionsPass());
|
|
|
|
|
|
|
|
if (EnableMatrix)
|
|
|
|
MPM.addPass(
|
|
|
|
createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true)));
|
|
|
|
|
|
|
|
if (!CGSCCOptimizerLateEPCallbacks.empty()) {
|
|
|
|
CGSCCPassManager CGPM;
|
|
|
|
for (auto &C : CGSCCOptimizerLateEPCallbacks)
|
|
|
|
C(CGPM, Level);
|
|
|
|
if (!CGPM.isEmpty())
|
|
|
|
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
|
|
|
|
}
|
|
|
|
if (!LateLoopOptimizationsEPCallbacks.empty()) {
|
|
|
|
LoopPassManager LPM;
|
|
|
|
for (auto &C : LateLoopOptimizationsEPCallbacks)
|
|
|
|
C(LPM, Level);
|
|
|
|
if (!LPM.isEmpty()) {
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(
|
|
|
|
createFunctionToLoopPassAdaptor(std::move(LPM))));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!LoopOptimizerEndEPCallbacks.empty()) {
|
|
|
|
LoopPassManager LPM;
|
|
|
|
for (auto &C : LoopOptimizerEndEPCallbacks)
|
|
|
|
C(LPM, Level);
|
|
|
|
if (!LPM.isEmpty()) {
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(
|
|
|
|
createFunctionToLoopPassAdaptor(std::move(LPM))));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!ScalarOptimizerLateEPCallbacks.empty()) {
|
|
|
|
FunctionPassManager FPM;
|
|
|
|
for (auto &C : ScalarOptimizerLateEPCallbacks)
|
|
|
|
C(FPM, Level);
|
|
|
|
if (!FPM.isEmpty())
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
|
|
|
|
}
|
|
|
|
if (!VectorizerStartEPCallbacks.empty()) {
|
|
|
|
FunctionPassManager FPM;
|
|
|
|
for (auto &C : VectorizerStartEPCallbacks)
|
|
|
|
C(FPM, Level);
|
|
|
|
if (!FPM.isEmpty())
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
|
|
|
|
}
|
|
|
|
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(CoroEarlyPass()));
|
|
|
|
CGSCCPassManager CGPM;
|
|
|
|
CGPM.addPass(CoroSplitPass());
|
|
|
|
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
|
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
|
|
|
|
|
|
|
|
for (auto &C : OptimizerLastEPCallbacks)
|
|
|
|
C(MPM, Level);
|
|
|
|
|
|
|
|
if (LTOPreLink)
|
|
|
|
addRequiredLTOPreLinkPasses(MPM);
|
|
|
|
|
2021-12-02 22:50:14 +08:00
|
|
|
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
|
|
|
|
|
2021-09-15 07:44:29 +08:00
|
|
|
return MPM;
|
|
|
|
}
|
|
|
|
|
|
|
|
AAManager PassBuilder::buildDefaultAAPipeline() {
|
|
|
|
AAManager AA;
|
|
|
|
|
|
|
|
// The order in which these are registered determines their priority when
|
|
|
|
// being queried.
|
|
|
|
|
|
|
|
// First we register the basic alias analysis that provides the majority of
|
|
|
|
// per-function local AA logic. This is a stateless, on-demand local set of
|
|
|
|
// AA techniques.
|
|
|
|
AA.registerFunctionAnalysis<BasicAA>();
|
|
|
|
|
|
|
|
// Next we query fast, specialized alias analyses that wrap IR-embedded
|
|
|
|
// information about aliasing.
|
|
|
|
AA.registerFunctionAnalysis<ScopedNoAliasAA>();
|
|
|
|
AA.registerFunctionAnalysis<TypeBasedAA>();
|
|
|
|
|
|
|
|
// Add support for querying global aliasing information when available.
|
|
|
|
// Because the `AAManager` is a function analysis and `GlobalsAA` is a module
|
|
|
|
// analysis, all that the `AAManager` can do is query for any *cached*
|
|
|
|
// results from `GlobalsAA` through a readonly proxy.
|
|
|
|
AA.registerModuleAnalysis<GlobalsAA>();
|
|
|
|
|
|
|
|
// Add target-specific alias analyses.
|
|
|
|
if (TM)
|
|
|
|
TM->registerDefaultAliasAnalyses(AA);
|
|
|
|
|
|
|
|
return AA;
|
|
|
|
}
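// (For reference, roughly the same stack can be requested textually with
//  -aa-pipeline='basic-aa,scoped-noalias-aa,tbaa,globals-aa'; analysis names
//  assumed from the pass registry, and the target hook above may append more.)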
|