[PGO] Profile guided code size optimization.

Summary:
Enable some of the existing size optimizations for cold code under PGO.

This yields a ~5% code size saving in a big internal app under PGO.

The way it gets BFI/PSI is discussed in the RFC thread:

http://lists.llvm.org/pipermail/llvm-dev/2019-March/130894.html 

Note it doesn't currently touch loop passes.

Reviewers: davidxl, eraman

Reviewed By: eraman

Subscribers: mgorny, javed.absar, smeenai, mehdi_amini, eraman, zzheng, steven_wu, dexonsmith, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D59514

llvm-svn: 358422
This commit is contained in:
Hiroshi Yamauchi 2019-04-15 16:49:00 +00:00
parent 64041d7b90
commit 09e539fcae
28 changed files with 473 additions and 44 deletions

View File

@ -55,6 +55,7 @@ class DominatorTree;
class Function;
class GlobalVariable;
class Instruction;
class ProfileSummaryInfo;
class TargetTransformInfo;
/// A private "module" namespace for types and utilities used by
@ -124,7 +125,8 @@ public:
// Glue for old PM.
bool runImpl(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
BlockFrequencyInfo *BFI, BasicBlock &Entry);
BlockFrequencyInfo *BFI, BasicBlock &Entry,
ProfileSummaryInfo *PSI);
void cleanup() {
ClonedCastMap.clear();
@ -148,6 +150,7 @@ private:
LLVMContext *Ctx;
const DataLayout *DL;
BasicBlock *Entry;
ProfileSummaryInfo *PSI;
/// Keeps track of constant candidates found in the function.
using ConstCandVecType = std::vector<consthoist::ConstantCandidate>;

View File

@ -28,6 +28,8 @@ class TargetLibraryInfo;
class BasicBlock;
class Function;
class OptimizationRemarkEmitter;
class BlockFrequencyInfo;
class ProfileSummaryInfo;
/// This class implements simplifications for calls to fortified library
/// functions (__st*cpy_chk, __memcpy_chk, __memmove_chk, __memset_chk), to,
@ -74,6 +76,8 @@ private:
const DataLayout &DL;
const TargetLibraryInfo *TLI;
OptimizationRemarkEmitter &ORE;
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
bool UnsafeFPShrink;
function_ref<void(Instruction *, Value *)> Replacer;
function_ref<void(Instruction *)> Eraser;
@ -101,6 +105,7 @@ public:
LibCallSimplifier(
const DataLayout &DL, const TargetLibraryInfo *TLI,
OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
function_ref<void(Instruction *, Value *)> Replacer =
&replaceAllUsesWithDefault,
function_ref<void(Instruction *)> Eraser = &eraseFromParentDefault);

View File

@ -0,0 +1,34 @@
//===- llvm/Transforms/Utils/SizeOpts.h - size optimization -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains some shared code size optimization related code.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_UTILS_SIZEOPTS_H
#define LLVM_TRANSFORMS_UTILS_SIZEOPTS_H

namespace llvm {

// Forward declarations only: this header never needs the complete types.
class BasicBlock;
class BlockFrequencyInfo;
class Function;
class ProfileSummaryInfo;

/// Returns true if function \p F is suggested to be size-optimized based on
/// the profile.
///
/// \param F   The function to query; must not be null.
/// \param PSI Profile summary information; may be null.
/// \param BFI Block frequency information for \p F; may be null.
/// \returns false when \p PSI or \p BFI is null, or when no profile summary
///          is available.
bool shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI,
                           BlockFrequencyInfo *BFI);

/// Returns true if basic block \p BB is suggested to be size-optimized based
/// on the profile.
///
/// \param BB  The basic block to query; must not be null.
/// \param PSI Profile summary information; may be null.
/// \param BFI Block frequency information for the function containing \p BB;
///            may be null.
/// \returns false when \p PSI or \p BFI is null, or when no profile summary
///          is available.
bool shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI,
                           BlockFrequencyInfo *BFI);

} // end namespace llvm

#endif // LLVM_TRANSFORMS_UTILS_SIZEOPTS_H

View File

@ -24,11 +24,13 @@ namespace llvm {
class AssumptionCache;
class BasicBlock;
class BlockFrequencyInfo;
class DependenceInfo;
class DominatorTree;
class Loop;
class LoopInfo;
class MDNode;
class ProfileSummaryInfo;
class OptimizationRemarkEmitter;
class ScalarEvolution;
@ -120,7 +122,8 @@ void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling);

View File

@ -71,6 +71,7 @@ class Loop;
class LoopAccessInfo;
class LoopInfo;
class OptimizationRemarkEmitter;
class ProfileSummaryInfo;
class ScalarEvolution;
class TargetLibraryInfo;
class TargetTransformInfo;
@ -96,6 +97,7 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
AssumptionCache *AC;
std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
OptimizationRemarkEmitter *ORE;
ProfileSummaryInfo *PSI;
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@ -105,7 +107,7 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
OptimizationRemarkEmitter &ORE);
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_);
bool processLoop(Loop *L);
};

View File

@ -575,8 +575,12 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
Options.DoCounterPromotion = true;
Options.UseBFIInPromotion = IsCS;
MPM.addPass(InstrProfiling(Options, IsCS));
} else if (!ProfileFile.empty())
} else if (!ProfileFile.empty()) {
MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS));
// Cache ProfileSummaryAnalysis once to avoid the potential need to insert
// RequireAnalysisPass for PSI before subsequent non-module passes.
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
}
}
static InlineParams
@ -649,6 +653,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
PGOOpt->ProfileRemappingFile,
Phase == ThinLTOPhase::PreLink));
// Cache ProfileSummaryAnalysis once to avoid the potential need to insert
// RequireAnalysisPass for PSI before subsequent non-module passes.
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
// Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard
// for the profile annotation to be accurate in the ThinLTO backend.
if (Phase != ThinLTOPhase::PreLink)
@ -1065,6 +1072,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
PGOOpt->ProfileRemappingFile,
false /* ThinLTOPhase::PreLink */));
// Cache ProfileSummaryAnalysis once to avoid the potential need to insert
// RequireAnalysisPass for PSI before subsequent non-module passes.
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
}
// Remove unused virtual tables to improve the quality of code generated by

View File

@ -4178,7 +4178,7 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
auto InstCombineErase = [this](Instruction *I) {
eraseInstFromFunction(*I);
};
LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW,
LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
InstCombineErase);
if (Value *With = Simplifier.optimizeCall(CI)) {
++NumSimplified;

View File

@ -52,12 +52,14 @@ namespace llvm {
class APInt;
class AssumptionCache;
class BlockFrequencyInfo;
class DataLayout;
class DominatorTree;
class GEPOperator;
class GlobalVariable;
class LoopInfo;
class OptimizationRemarkEmitter;
class ProfileSummaryInfo;
class TargetLibraryInfo;
class User;
@ -304,6 +306,8 @@ private:
const DataLayout &DL;
const SimplifyQuery SQ;
OptimizationRemarkEmitter &ORE;
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
// Optional analyses. When non-null, these can both be used to do better
// combining and will be updated to reflect any changes.
@ -315,11 +319,11 @@ public:
InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder,
bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
OptimizationRemarkEmitter &ORE, const DataLayout &DL,
LoopInfo *LI)
OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), LI(LI) {}
DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {}
/// Run the combiner over the entire worklist until it is empty.
///

View File

@ -46,14 +46,17 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@ -3478,7 +3481,8 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
static bool combineInstructionsOverFunction(
Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
OptimizationRemarkEmitter &ORE, bool ExpensiveCombines = true,
OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, bool ExpensiveCombines = true,
LoopInfo *LI = nullptr) {
auto &DL = F.getParent()->getDataLayout();
ExpensiveCombines |= EnableExpensiveCombines;
@ -3509,7 +3513,7 @@ static bool combineInstructionsOverFunction(
MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, Builder, F.hasMinSize(), ExpensiveCombines, AA,
AC, TLI, DT, ORE, DL, LI);
AC, TLI, DT, ORE, BFI, PSI, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;
if (!IC.run())
@ -3529,8 +3533,15 @@ PreservedAnalyses InstCombinePass::run(Function &F,
auto *LI = AM.getCachedResult<LoopAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
const ModuleAnalysisManager &MAM =
AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
ProfileSummaryInfo *PSI =
MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE,
ExpensiveCombines, LI))
BFI, PSI, ExpensiveCombines, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
@ -3554,6 +3565,8 @@ void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
}
bool InstructionCombiningPass::runOnFunction(Function &F) {
@ -3570,9 +3583,15 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
// Optional analyses.
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
ProfileSummaryInfo *PSI =
&getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
BlockFrequencyInfo *BFI =
(PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
nullptr;
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE,
ExpensiveCombines, LI);
BFI, PSI, ExpensiveCombines, LI);
}
char InstructionCombiningPass::ID = 0;
@ -3585,6 +3604,8 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)

View File

@ -41,6 +41,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
@ -60,6 +61,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@ -111,6 +113,7 @@ public:
if (ConstHoistWithBlockFrequency)
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
@ -126,6 +129,7 @@ INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
"Constant Hoisting", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
"Constant Hoisting", false, false)
@ -148,7 +152,8 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
ConstHoistWithBlockFrequency
? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
: nullptr,
Fn.getEntryBlock());
Fn.getEntryBlock(),
&getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
if (MadeChange) {
LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
@ -548,7 +553,9 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
ConstCandVecType::iterator &MaxCostItr) {
unsigned NumUses = 0;
if(!Entry->getParent()->hasOptSize() || std::distance(S,E) > 100) {
bool OptForSize = Entry->getParent()->hasOptSize() ||
llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI);
if (!OptForSize || std::distance(S,E) > 100) {
for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
NumUses += ConstCand->Uses.size();
if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
@ -919,13 +926,14 @@ void ConstantHoistingPass::deleteDeadCastInst() const {
/// Optimize expensive integer constants in the given function.
bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
DominatorTree &DT, BlockFrequencyInfo *BFI,
BasicBlock &Entry) {
BasicBlock &Entry, ProfileSummaryInfo *PSI) {
this->TTI = &TTI;
this->DT = &DT;
this->BFI = BFI;
this->DL = &Fn.getParent()->getDataLayout();
this->Ctx = &Fn.getContext();
this->Entry = &Entry;
this->PSI = PSI;
// Collect all constant candidates.
collectConstantCandidates(Fn);
@ -962,7 +970,9 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F,
auto BFI = ConstHoistWithBlockFrequency
? &AM.getResult<BlockFrequencyAnalysis>(F)
: nullptr;
if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock()))
auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
auto *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI))
return PreservedAnalyses::all();
PreservedAnalyses PA;

View File

@ -29,11 +29,14 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@ -54,6 +57,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <cassert>
#include <forward_list>
@ -159,8 +163,9 @@ namespace {
class LoadEliminationForLoop {
public:
LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
DominatorTree *DT)
: L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
DominatorTree *DT, BlockFrequencyInfo *BFI,
ProfileSummaryInfo* PSI)
: L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {}
/// Look through the loop-carried and loop-independent dependences in
/// this loop and find store->load dependences.
@ -529,7 +534,11 @@ public:
}
if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
if (L->getHeader()->getParent()->hasOptSize()) {
auto *HeaderBB = L->getHeader();
auto *F = HeaderBB->getParent();
bool OptForSize = F->hasOptSize() ||
llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI);
if (OptForSize) {
LLVM_DEBUG(
dbgs() << "Versioning is needed but not allowed when optimizing "
"for size.\n");
@ -572,6 +581,8 @@ private:
LoopInfo *LI;
const LoopAccessInfo &LAI;
DominatorTree *DT;
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
PredicatedScalarEvolution PSE;
};
@ -579,6 +590,7 @@ private:
static bool
eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
// Build up a worklist of inner-loops to transform to avoid iterator
// invalidation.
@ -597,7 +609,7 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
bool Changed = false;
for (Loop *L : Worklist) {
// The actual work is performed by LoadEliminationForLoop.
LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI);
Changed |= LEL.processLoop();
}
return Changed;
@ -622,10 +634,14 @@ public:
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
nullptr;
// Process each loop nest in the function.
return eliminateLoadsAcrossLoops(
F, LI, DT,
F, LI, DT, BFI, PSI,
[&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
}
@ -638,6 +654,8 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
}
};
@ -653,6 +671,8 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
FunctionPass *llvm::createLoopLoadEliminationPass() {
@ -668,13 +688,17 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
auto *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
MemorySSA *MSSA = EnableMSSALoopDependency
? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
: nullptr;
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
bool Changed = eliminateLoadsAcrossLoops(
F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
F, LI, DT, BFI, PSI, [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
});

View File

@ -294,7 +294,8 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
return LoopUnrollResult::Unmodified;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, OptLevel, None, None, None, None, None, None);
L, SE, TTI, nullptr, nullptr, OptLevel,
None, None, None, None, None, None);
if (AllowUnrollAndJam.getNumOccurrences() > 0)
UP.UnrollAndJam = AllowUnrollAndJam;
if (UnrollAndJamThreshold.getNumOccurrences() > 0)

View File

@ -23,7 +23,9 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@ -55,6 +57,7 @@
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
#include <cassert>
@ -165,7 +168,8 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) {
@ -198,7 +202,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
TTI.getUnrollingPreferences(L, SE, UP);
// Apply size attributes
if (L->getHeader()->getParent()->hasOptSize()) {
bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI);
if (OptForSize) {
UP.Threshold = UP.OptSizeThreshold;
UP.PartialThreshold = UP.PartialOptSizeThreshold;
}
@ -963,7 +969,9 @@ bool llvm::computeUnrollCount(
static LoopUnrollResult tryToUnrollLoop(
Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
const TargetTransformInfo &TTI, AssumptionCache &AC,
OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel,
OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
bool PreserveLCSSA, int OptLevel,
bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
@ -989,7 +997,7 @@ static LoopUnrollResult tryToUnrollLoop(
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
ProvidedAllowPeeling);
// Exit early if unrolling is disabled.
@ -1176,7 +1184,8 @@ public:
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
LoopUnrollResult Result = tryToUnrollLoop(
L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, OnlyWhenForced,
L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr,
PreserveLCSSA, OptLevel, OnlyWhenForced,
ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial,
ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling);
@ -1257,6 +1266,7 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
bool Changed =
tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
/*BFI*/ nullptr, /*PSI*/ nullptr,
/*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced,
/*ForgetAllSCEV*/ false, /*Count*/ None,
/*Threshold*/ None, /*AllowPartial*/ false,
@ -1359,6 +1369,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
ProfileSummaryInfo *PSI =
MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
bool Changed = false;
@ -1394,7 +1406,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
// The API here is quite complex to call and we allow to select some
// flavors of unrolling during construction time (by setting UnrollOpts).
LoopUnrollResult Result = tryToUnrollLoop(
&L, DT, &LI, SE, TTI, AC, ORE,
&L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
/*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
/*ForgetAllSCEV*/ false, /*Count*/ None,
/*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,

View File

@ -51,6 +51,7 @@ add_llvm_library(LLVMTransformUtils
SimplifyCFG.cpp
SimplifyIndVar.cpp
SimplifyLibCalls.cpp
SizeOpts.cpp
SplitModule.cpp
StripNonLineTableDebugInfo.cpp
SymbolRewriter.cpp

View File

@ -16,8 +16,10 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
@ -34,6 +36,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
using namespace llvm;
using namespace PatternMatch;
@ -2375,7 +2378,9 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
// Don't rewrite fputs to fwrite when optimising for size because fwrite
// requires more arguments and thus extra MOVs are required.
if (CI->getFunction()->hasOptSize())
bool OptForSize = CI->getFunction()->hasOptSize() ||
llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
if (OptForSize)
return nullptr;
// Check if has any use
@ -2750,9 +2755,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
LibCallSimplifier::LibCallSimplifier(
const DataLayout &DL, const TargetLibraryInfo *TLI,
OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
function_ref<void(Instruction *, Value *)> Replacer,
function_ref<void(Instruction *)> Eraser)
: FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE),
: FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI),
UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}
void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {

View File

@ -0,0 +1,37 @@
//===-- SizeOpts.cpp - code size optimization related code ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains some shared code size optimization related code.
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
using namespace llvm;
// Hidden command-line switch "-pgso", enabled by default. The
// shouldOptimizeForSize() overloads in this file return false whenever it is
// turned off.
static cl::opt<bool> ProfileGuidedSizeOpt(
"pgso", cl::Hidden, cl::init(true),
cl::desc("Enable the profile guided size optimization. "));
/// Function-level query: should \p F be optimized for size according to the
/// profile?
///
/// Answers false unless both analyses are supplied, a profile summary exists
/// and the "-pgso" switch is on; otherwise the verdict is whatever
/// ProfileSummaryInfo::isFunctionColdInCallGraph reports for \p F.
bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI,
                                 BlockFrequencyInfo *BFI) {
  assert(F);
  // Without both analyses there is nothing to base a decision on.
  if (PSI == nullptr || BFI == nullptr)
    return false;
  if (!PSI->hasProfileSummary())
    return false;
  // Respect the command-line opt-out before consulting the profile.
  if (!ProfileGuidedSizeOpt)
    return false;
  return PSI->isFunctionColdInCallGraph(F, *BFI);
}
/// Block-level query: should \p BB be optimized for size according to the
/// profile?
///
/// Answers false unless both analyses are supplied, a profile summary exists
/// and the "-pgso" switch is on; otherwise the verdict is whatever
/// ProfileSummaryInfo::isColdBlock reports for \p BB.
bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI,
                                 BlockFrequencyInfo *BFI) {
  assert(BB);
  // Without both analyses there is nothing to base a decision on.
  if (PSI == nullptr || BFI == nullptr)
    return false;
  if (!PSI->hasProfileSummary())
    return false;
  // Respect the command-line opt-out before consulting the profile.
  if (!ProfileGuidedSizeOpt)
    return false;
  return PSI->isColdBlock(BB, BFI);
}

View File

@ -88,6 +88,7 @@
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@ -134,6 +135,7 @@
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
@ -1452,12 +1454,13 @@ struct LoopVectorize : public FunctionPass {
auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
GetLAA, *ORE);
GetLAA, *ORE, PSI);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@ -1483,6 +1486,7 @@ struct LoopVectorize : public FunctionPass {
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
};
@ -6054,6 +6058,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
@ -7147,7 +7152,8 @@ static bool processLoopInVPlanNativePath(
Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
@ -7162,10 +7168,12 @@ static bool processLoopInVPlanNativePath(
// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();
// Check the function attributes to find out if this function should be
// optimized for size.
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasOptSize();
Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
(F->hasOptSize() ||
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
// Plan how to best vectorize, return the best VF and its cost.
const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
@ -7245,10 +7253,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
// Check the function attributes to find out if this function should be
// optimized for size.
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasOptSize();
Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
(F->hasOptSize() ||
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
@ -7257,7 +7267,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// pipeline.
if (!L->empty())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
ORE, Hints);
ORE, BFI, PSI, Hints);
assert(L->empty() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
@ -7523,7 +7533,7 @@ bool LoopVectorizePass::runImpl(
DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
OptimizationRemarkEmitter &ORE_) {
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
LI = &LI_;
TTI = &TTI_;
@ -7535,6 +7545,7 @@ bool LoopVectorizePass::runImpl(
GetLAA = &GetLAA_;
DB = &DB_;
ORE = &ORE_;
PSI = PSI_;
// Don't attempt if
// 1. the target claims to have no vector registers, and
@ -7603,8 +7614,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
const ModuleAnalysisManager &MAM =
AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
ProfileSummaryInfo *PSI =
MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
bool Changed =
runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;

View File

@ -106,6 +106,7 @@
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
@ -245,7 +246,6 @@
; CHECK-O-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis

View File

@ -69,6 +69,7 @@
; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
; CHECK-O3-NEXT: Running pass: AggressiveInstCombinePass
; CHECK-O2-NEXT: Running pass: InstCombinePass
; CHECK-O2-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass>

View File

@ -88,6 +88,7 @@
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-PRELINK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
@ -219,7 +220,6 @@
; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-POSTLINK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis

View File

@ -214,6 +214,8 @@
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results

View File

@ -219,6 +219,8 @@
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results

View File

@ -201,6 +201,8 @@
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results

View File

@ -1,4 +1,6 @@
; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -S < %s | FileCheck %s
; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -pgso -S < %s | FileCheck %s -check-prefix=PGSO
; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -pgso=false -S < %s | FileCheck %s -check-prefix=NPGSO
; There are different candidates here for the base constant: 1073876992 and
; 1073876996. But we don't want to see the latter because it results in
@ -8,6 +10,7 @@ define void @foo() #0 {
entry:
; CHECK-LABEL: @foo
; CHECK-NOT: [[CONST1:%const_mat[0-9]*]] = add i32 %const, -4
; CHECK-LABEL: @foo_pgso
%0 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%or = or i32 %0, 1
store volatile i32 %or, i32* inttoptr (i32 1073876992 to i32*), align 4096
@ -40,3 +43,59 @@ entry:
}
attributes #0 = { minsize norecurse nounwind optsize readnone uwtable }
; Profile-guided variant of the constant-hoisting test.
; The function uses attribute group #1 and carries !prof !14 (a function entry
; count of 0), so presumably the profile summary classifies it as cold - TODO
; confirm against the ProfileSummaryInfo thresholds.
; Expectations encoded by the directives below: when run with -pgso the rebased
; constant "add i32 %const, -4" must not be materialized; when run with
; -pgso=false it is expected to appear.
define void @foo_pgso() #1 !prof !14 {
entry:
; PGSO-LABEL: @foo_pgso
; PGSO-NOT: [[CONST2:%const_mat[0-9]*]] = add i32 %const, -4
; NPGSO-LABEL: @foo_pgso
; NPGSO: [[CONST2:%const_mat[0-9]*]] = add i32 %const, -4
; Volatile read-modify-write sequences on the fixed addresses 1073876992 and
; 1073876996 (the two neighbouring base-constant candidates, 4 bytes apart).
%0 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%or = or i32 %0, 1
store volatile i32 %or, i32* inttoptr (i32 1073876992 to i32*), align 4096
%1 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4
%and = and i32 %1, -117506048
store volatile i32 %and, i32* inttoptr (i32 1073876996 to i32*), align 4
%2 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%and1 = and i32 %2, -17367041
store volatile i32 %and1, i32* inttoptr (i32 1073876996 to i32*), align 4096
%3 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%and2 = and i32 %3, -262145
store volatile i32 %and2, i32* inttoptr (i32 1073876992 to i32*), align 4096
%4 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4
%and3 = and i32 %4, -8323073
store volatile i32 %and3, i32* inttoptr (i32 1073876996 to i32*), align 4
; Plain constant store to a third nearby address (1073877000).
store volatile i32 10420224, i32* inttoptr (i32 1073877000 to i32*), align 8
%5 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4096
%or4 = or i32 %5, 65536
store volatile i32 %or4, i32* inttoptr (i32 1073876996 to i32*), align 4096
; Three read-modify-write sequences on a separate address, 1073881088.
%6 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
%or6.i.i = or i32 %6, 16
store volatile i32 %or6.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
%7 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
%and7.i.i = and i32 %7, -4
store volatile i32 %and7.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
%8 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
%or8.i.i = or i32 %8, 2
store volatile i32 %or8.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
ret void
}
attributes #1 = { norecurse nounwind readnone uwtable } ; no optsize or minsize
; Module-level profile summary, attached through the "ProfileSummary" module
; flag (!0 -> !1). !14 is the per-function entry count (0) referenced by the
; !prof attachment on @foo_pgso.
; NOTE(review): the DetailedSummary entries (!11-!13) presumably use the
; (cutoff, minimum count, number of counts) layout - confirm against the LLVM
; profile summary documentation before editing the values.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}

View File

@ -2,6 +2,8 @@
; because it requires more arguments and thus extra MOVs are required.
;
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -pgso -S | FileCheck %s -check-prefix=PGSO
; RUN: opt < %s -instcombine -pgso=false -S | FileCheck %s -check-prefix=NPGSO
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
@ -26,3 +28,34 @@ declare i32 @fputs(i8* nocapture readonly, %struct._IO_FILE* nocapture) local_un
attributes #0 = { nounwind optsize }
attributes #1 = { nounwind optsize }
; Profile-guided variant of the fputs test. The function has no optsize
; attribute but carries !prof !14 (a function entry count of 0), so presumably
; it is treated as cold under the module's profile summary - TODO confirm.
; Expectations encoded by the directives below: when run with -pgso the call to
; @fputs is kept and no @fwrite call is introduced; when run with -pgso=false
; the call is expected to be rewritten to @fwrite.
define i32 @main_pgso() local_unnamed_addr !prof !14 {
entry:
; PGSO-LABEL: @main_pgso(
; PGSO-NOT: call i64 @fwrite
; PGSO: call i32 @fputs
; NPGSO-LABEL: @main_pgso(
; NPGSO: call i64 @fwrite
; NPGSO-NOT: call i32 @fputs
; Open a stream using the @.str / @.str.1 constants, then write the constant
; string @.str.2 to it with fputs; the fputs result is unused.
%call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0)) #2
%call1 = tail call i32 @fputs(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.2, i32 0, i32 0), %struct._IO_FILE* %call) #2
ret i32 0
}
; Module-level profile summary, attached through the "ProfileSummary" module
; flag (!0 -> !1). !14 is the per-function entry count (0) referenced by the
; !prof attachment on @main_pgso.
; NOTE(review): the DetailedSummary entries (!11-!13) presumably use the
; (cutoff, minimum count, number of counts) layout - confirm against the LLVM
; profile summary documentation before editing the values.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}

View File

@ -1,4 +1,6 @@
; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s
; RUN: opt -basicaa -loop-load-elim -pgso -S < %s | FileCheck %s -check-prefix=PGSO
; RUN: opt -basicaa -loop-load-elim -pgso=false -S < %s | FileCheck %s -check-prefix=NPGSO
; When optimizing for size don't eliminate in this loop because the loop would
; have to be versioned first because A and C may alias.
@ -74,3 +76,54 @@ for.body: ; preds = %for.body, %entry
for.end: ; preds = %for.body
ret void
}
; PGSO-LABEL: @f_pgso(
; NPGSO-LABEL: @f_pgso(
; Profile-guided variant of the loop-load-elimination size test. The function
; has no size attribute but carries !prof !14 (a function entry count of 0),
; so presumably it is treated as cold under the profile summary - TODO confirm.
; Expectations encoded by the directives in the loop body: when run with -pgso
; the multiply still consumes the loaded value %a (the load is not eliminated);
; when run with -pgso=false that exact instruction must no longer appear.
define void @f_pgso(i32* %A, i32* %B, i32* %C, i64 %N) !prof !14 {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
%Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
%Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
%Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
; A[i+1] = B[i] + 2: this store feeds the load of A[i] on the next iteration.
%b = load i32, i32* %Bidx, align 4
%a_p1 = add i32 %b, 2
store i32 %a_p1, i32* %Aidx_next, align 4
; Load of A[i]: the candidate for store-to-load forwarding across iterations.
%a = load i32, i32* %Aidx, align 4
; PGSO: %c = mul i32 %a, 2
; NPGSO-NOT: %c = mul i32 %a, 2
%c = mul i32 %a, 2
; C[i] = A[i] * 2; the pointers %A and %C carry no noalias information here.
store i32 %c, i32* %Cidx, align 4
%exitcond = icmp eq i64 %indvars.iv.next, %N
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
; Module-level profile summary, attached through the "ProfileSummary" module
; flag (!0 -> !1). !14 is the per-function entry count (0) referenced by the
; !prof attachment on @f_pgso.
; NOTE(review): the DetailedSummary entries (!11-!13) presumably use the
; (cutoff, minimum count, number of counts) layout - confirm against the LLVM
; profile summary documentation before editing the values.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}

View File

@ -1,5 +1,7 @@
; RUN: opt < %s -S -loop-unroll -unroll-count=4 | FileCheck -check-prefix=CHECK_COUNT4 %s
; RUN: opt < %s -S -loop-unroll | FileCheck -check-prefix=CHECK_NOCOUNT %s
; RUN: opt < %s -S -passes='require<profile-summary>,function(unroll)' -pgso | FileCheck -check-prefix=PGSO %s
; RUN: opt < %s -S -passes='require<profile-summary>,function(unroll)' -pgso=false | FileCheck -check-prefix=NPGSO %s
;///////////////////// TEST 1 //////////////////////////////
@ -128,3 +130,47 @@ for.end: ; preds = %for.body
; CHECK_NOCOUNT-LABEL: @Test4
; CHECK_NOCOUNT: phi
; CHECK_NOCOUNT: icmp
;///////////////////// TEST 5 //////////////////////////////
; This test shows that with PGO, this loop is cold and not unrolled.
; Stores i into @tab[i] for i = 0..23 (the loop exits once the incremented
; index equals 24) and returns the constant 42.
; The function carries !prof !14 (a function entry count of 0); the directives
; that follow this function expect the loop's phi and icmp to survive when run
; with -pgso and to be absent when run with -pgso=false.
define i32 @Test5() !prof !14 {
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%i.05 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds [24 x i32], [24 x i32]* @tab, i32 0, i32 %i.05
store i32 %i.05, i32* %arrayidx, align 4
%inc = add nuw nsw i32 %i.05, 1
; Constant trip count of 24, matching the [24 x i32] array type.
%exitcond = icmp eq i32 %inc, 24
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret i32 42
}
; PGSO-LABEL: @Test5
; PGSO: phi
; PGSO: icmp
; NPGSO-LABEL: @Test5
; NPGSO-NOT: phi
; NPGSO-NOT: icmp
; Module-level profile summary, attached through the "ProfileSummary" module
; flag (!0 -> !1). !14 is the per-function entry count (0) referenced by the
; !prof attachment on @Test5.
; NOTE(review): the DetailedSummary entries (!11-!13) presumably use the
; (cutoff, minimum count, number of counts) layout - confirm against the LLVM
; profile summary documentation before editing the values.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}

View File

@ -2,6 +2,8 @@
; loop with the optimize for size or the minimize size attributes.
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -pgso -S | FileCheck %s -check-prefix=PGSO
; RUN: opt < %s -loop-vectorize -pgso=false -S | FileCheck %s -check-prefix=NPGSO
target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
@ -36,6 +38,7 @@ define i32 @foo_minsize() #1 {
; CHECK-LABEL: @foo_minsize(
; CHECK-NOT: <2 x i8>
; CHECK-NOT: <4 x i8>
; CHECK-LABEL: @foo_pgso(
entry:
br label %for.body
@ -57,3 +60,43 @@ for.end: ; preds = %for.body
attributes #1 = { minsize }
; Profile-guided variant of the small-size vectorization test. The function has
; no size attribute but carries !prof !14 (a function entry count of 0), so
; presumably it is treated as cold under the profile summary - TODO confirm.
; Expectations encoded by the directives below: when run with -pgso no i8
; vector type may appear in the output; when run with -pgso=false one must.
define i32 @foo_pgso() !prof !14 {
; PGSO-LABEL: @foo_pgso(
; PGSO-NOT: <{{[0-9]+}} x i8>
; NPGSO-LABEL: @foo_pgso(
; NPGSO: <{{[0-9]+}} x i8>
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
; tab[i] = (tab[i] == 0) ? 2 : 1
%0 = load i8, i8* %arrayidx, align 1
%cmp1 = icmp eq i8 %0, 0
%. = select i1 %cmp1, i8 2, i8 1
store i8 %., i8* %arrayidx, align 1
%inc = add nsw i32 %i.08, 1
; NOTE(review): the exit test compares the pre-increment index against 202
; while the GEP above indexes a [32 x i8] array; confirm this mirrors the
; sibling tests in this file and is intentional.
%exitcond = icmp eq i32 %i.08, 202
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret i32 0
}
; Module-level profile summary, attached through the "ProfileSummary" module
; flag (!0 -> !1). !14 is the per-function entry count (0) referenced by the
; !prof attachment on @foo_pgso.
; NOTE(review): the DetailedSummary entries (!11-!13) presumably use the
; (cutoff, minimum count, number of counts) layout - confirm against the LLVM
; profile summary documentation before editing the values.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}