forked from OSchip/llvm-project
[llvm-profgen] Compute and show profile density
AutoFDO performance is sensitive to profile density, i.e., the number of samples in the profile relative to the program size, because profiles with insufficient samples can be inaccurate due to statistical noise and thus hurt AutoFDO performance. A previous investigation showed that AutoFDO performed better on MySQL with an increased number of samples. Therefore, we implement a profile-density computation feature to give hints about profile density to users and the compiler. We define the density of a profile Prof as follows: - For each function A in the profile, density(A) = total_samples(A) / sizeof(A). - density(Prof) = min(density(A)) for all functions A that are warm (defined below). A function is considered warm if its total-samples is within the top N percent of the profile. For the implementation, we reuse `ProfileSummaryBuilder::getHotCountThreshold(..)` as the threshold, which can be set by percent (`--profile-summary-cutoff-hot`) or by value (`--profile-summary-hot-count`). We also introduce `--hot-function-density-threshold` to set the hot-function density threshold; a suggestion is given if the profile density is below it, which implies that more samples should be collected. This also applies to CS profiles, with all profiles merged into base. Reviewed By: hoy, wenlei Differential Revision: https://reviews.llvm.org/D113781
This commit is contained in:
parent
8cd782487f
commit
c2e08aba1a
|
@ -0,0 +1,154 @@
|
|||
[main]
|
||||
8
|
||||
810-82f:15
|
||||
834-85c:15
|
||||
870-870:1544
|
||||
875-8a1:11
|
||||
875-8bf:1223
|
||||
875-8c3:185
|
||||
893-8bf:176
|
||||
8a7-8c3:13
|
||||
5
|
||||
82f->790:15
|
||||
870->540:1546
|
||||
8a1->810:15
|
||||
8bf->870:2022
|
||||
8c3->893:276
|
||||
[partition_pivot_first]
|
||||
10
|
||||
710-72d:238
|
||||
740-753:1
|
||||
740-75b:739
|
||||
740-75f:267
|
||||
740-761:1164
|
||||
743-753:12
|
||||
743-75b:2414
|
||||
743-761:793
|
||||
755-75b:103
|
||||
755-75f:115
|
||||
3
|
||||
753->770:13
|
||||
75b->743:3327
|
||||
75f->740:385
|
||||
[partition_pivot_first:4.2 @ swap]
|
||||
1
|
||||
764-76e:2904
|
||||
1
|
||||
76e->740:2999
|
||||
[partition_pivot_first:5 @ swap]
|
||||
2
|
||||
770-770:619
|
||||
77a-783:619
|
||||
0
|
||||
[partition_pivot_last]
|
||||
15
|
||||
650-66d:206
|
||||
650-675:182
|
||||
682-689:164
|
||||
686-689:193
|
||||
6b0-6b7:18
|
||||
6b0-6bf:2082
|
||||
6b0-6c8:1180
|
||||
6b0-6ca:683
|
||||
6b9-6bf:170
|
||||
6b9-6c8:92
|
||||
6b9-6ca:62
|
||||
6d0-6d3:2230
|
||||
6e3-6ea:712
|
||||
6e3-6ef:1518
|
||||
6ec-6ef:667
|
||||
8
|
||||
66d->686:206
|
||||
675->682:79
|
||||
689->6b9:359
|
||||
6b7->68b:18
|
||||
6bf->6d0:2307
|
||||
6c8->6b0:1300
|
||||
6ca->6ec:755
|
||||
6ea->6b0:724
|
||||
[partition_pivot_last:5 @ swap]
|
||||
3
|
||||
677-67d:292
|
||||
6d6-6df:3621
|
||||
6f2-700:3528
|
||||
1
|
||||
700->6b0:3619
|
||||
[partition_pivot_last:6 @ swap]
|
||||
2
|
||||
68b-68b:1124
|
||||
695-69e:1124
|
||||
0
|
||||
[quick_sort]
|
||||
4
|
||||
790-79c:1273
|
||||
7a6-7a6:1273
|
||||
7a8-7b8:941
|
||||
7bd-7ca:791
|
||||
4
|
||||
7a6->650:817
|
||||
7a6->710:489
|
||||
7b8->790:961
|
||||
7ca->790:805
|
||||
[quick_sort:2 @ partition_pivot_first]
|
||||
12
|
||||
710-72d:408
|
||||
740-753:208
|
||||
740-75b:463
|
||||
740-75f:262
|
||||
740-761:496
|
||||
743-753:386
|
||||
743-75b:1300
|
||||
743-761:451
|
||||
755-75b:283
|
||||
755-75f:144
|
||||
774-777:619
|
||||
787-788:619
|
||||
4
|
||||
753->770:619
|
||||
75b->743:2137
|
||||
75f->740:427
|
||||
788->7a8:646
|
||||
[quick_sort:2 @ partition_pivot_last]
|
||||
17
|
||||
650-66d:295
|
||||
650-675:517
|
||||
682-689:528
|
||||
686-689:307
|
||||
68f-692:1124
|
||||
6a2-6a2:1124
|
||||
6b0-6b7:806
|
||||
6b0-6bf:1093
|
||||
6b0-6c8:935
|
||||
6b0-6ca:351
|
||||
6b9-6bf:226
|
||||
6b9-6c8:273
|
||||
6b9-6ca:81
|
||||
6d0-6d3:1391
|
||||
6e3-6ea:500
|
||||
6e3-6ef:891
|
||||
6ec-6ef:452
|
||||
9
|
||||
66d->686:307
|
||||
675->682:340
|
||||
689->6b9:580
|
||||
6a2->7a8:1167
|
||||
6b7->68b:834
|
||||
6bf->6d0:1391
|
||||
6c8->6b0:1263
|
||||
6ca->6ec:452
|
||||
6ea->6b0:518
|
||||
[quick_sort:4 @ quick_sort]
|
||||
6
|
||||
790-792:831
|
||||
790-79c:331
|
||||
7a6-7a6:331
|
||||
7a8-7b8:441
|
||||
7bd-7ca:632
|
||||
7d7-7d7:2029
|
||||
6
|
||||
792->7d7:853
|
||||
7a6->650:248
|
||||
7a6->710:103
|
||||
7b8->790:462
|
||||
7ca->790:661
|
||||
7d7->7cf:2097
|
|
@ -0,0 +1,29 @@
|
|||
27
|
||||
400540-400540:10
|
||||
400650-40066d:31
|
||||
400686-400689:3
|
||||
40068b-4006a2:3
|
||||
4006b0-4006b7:3
|
||||
4006b0-4006bf:6
|
||||
4006b0-4006c8:6
|
||||
4006d0-4006ea:51
|
||||
4006d0-400700:4
|
||||
4006ec-400700:30
|
||||
400710-40072f:5
|
||||
400740-400753:3
|
||||
400740-40075b:9
|
||||
400740-40076e:14
|
||||
400743-400753:3
|
||||
400743-40075b:43
|
||||
400743-40076e:11
|
||||
400755-40075b:4
|
||||
400770-400788:6
|
||||
400790-400792:12
|
||||
400790-4007a6:12
|
||||
4007a8-4007b8:11
|
||||
4007bd-4007ca:12
|
||||
4007cf-4007d7:12
|
||||
4007d7-4007d7:12
|
||||
400870-400870:12
|
||||
400875-4008bf:10
|
||||
0
|
|
@ -0,0 +1,64 @@
|
|||
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -hot-function-density-threshold=1 &> %t2
|
||||
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY
|
||||
|
||||
; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4
|
||||
; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
|
||||
|
||||
;CHECK-DENSITY: AutoFDO is estimated to optimize better with 4.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
|
||||
;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 0.2
|
||||
|
||||
;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 31.4
|
||||
|
||||
; original code:
|
||||
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Exchanges the two ints that a and b point to, using a temporary.
void swap(int *a, int *b) {
|
||||
int t = *a;
|
||||
*a = *b;
|
||||
*b = t;
|
||||
}
|
||||
|
||||
// Partitions array[low..high] (both bounds inclusive) around the pivot value
// array[high]: every element smaller than the pivot is swapped towards the
// front, then the pivot is swapped into the slot right after them.
// Returns the index where the pivot ends up.
int partition_pivot_last(int* array, int low, int high) {
|
||||
int pivot = array[high];
|
||||
int i = low - 1;
|
||||
for (int j = low; j < high; j++)
|
||||
if (array[j] < pivot)
|
||||
swap(&array[++i], &array[j]);
|
||||
swap(&array[i + 1], &array[high]);
|
||||
return (i + 1);
|
||||
}
|
||||
|
||||
// Partitions array[low..high] (both bounds inclusive) around the pivot value
// array[low]: elements smaller than the pivot are gathered right after the
// pivot slot (skipping the swap when an element is already in place), then
// the pivot is swapped behind them. Returns the index where the pivot ends up.
// NOTE(review): the profile input of this test names call sites such as
// `partition_pivot_first:4.2 @ swap`; presumably these are line offsets into
// this listing, so its line layout should stay unchanged - confirm.
int partition_pivot_first(int* array, int low, int high) {
|
||||
int pivot = array[low];
|
||||
int i = low + 1;
|
||||
for (int j = low + 1; j <= high; j++)
|
||||
if (array[j] < pivot) { if (j != i) swap(&array[i], &array[j]); i++;}
|
||||
swap(&array[i - 1], &array[low]);
|
||||
return i - 1;
|
||||
}
|
||||
|
||||
// Recursive quicksort over array[low..high] (both bounds inclusive).
// partition_func is called through a function pointer to split the range and
// returns the pivot index; the two sub-ranges on either side of that index
// are then sorted recursively. Does nothing when low >= high.
void quick_sort(int* array, int low, int high, int (*partition_func)(int *, int, int)) {
|
||||
if (low < high) {
|
||||
int pi = (*partition_func)(array, low, high);
|
||||
quick_sort(array, low, pi - 1, partition_func);
|
||||
quick_sort(array, pi + 1, high, partition_func);
|
||||
}
|
||||
}
|
||||
|
||||
// Driver: runs 100 * 1000 rounds over a 200-element heap array. Each round
// refills the array (every 10th slot gets its own index, the others get
// rand() % size), sorts it with quick_sort using partition_pivot_last when
// i % 3 is non-zero and partition_pivot_first otherwise, and adds
// array[i % size] to sum. Prints the final sum and returns 0.
// The malloc result is neither checked nor freed in this listing.
int main() {
|
||||
const int size = 200;
|
||||
int sum = 0;
|
||||
int *array = malloc(size * sizeof(int));
|
||||
for(int i = 0; i < 100 * 1000; i++) {
|
||||
for(int j = 0; j < size; j++)
|
||||
array[j] = j % 10 ? rand() % size: j;
|
||||
int (*fptr)(int *, int, int) = i % 3 ? partition_pivot_last : partition_pivot_first;
|
||||
quick_sort(array, 0, size - 1, fptr);
|
||||
sum += array[i % size];
|
||||
}
|
||||
printf("sum=%d\n", sum);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -9,6 +9,7 @@
|
|||
#include "ProfileGenerator.h"
|
||||
#include "ProfiledBinary.h"
|
||||
#include "llvm/ProfileData/ProfileCommon.h"
|
||||
#include <float.h>
|
||||
#include <unordered_set>
|
||||
|
||||
cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
|
||||
|
@ -70,7 +71,16 @@ static cl::opt<int, true> CSProfMaxContextDepth(
|
|||
"depth limit."),
|
||||
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
|
||||
|
||||
extern cl::opt<int> ProfileSummaryCutoffCold;
|
||||
// Density threshold for hot functions (initial value 1000). A profile whose
// computed density is below this value gets the "more samples" warning.
static cl::opt<double> HotFunctionDensityThreshold(
    "hot-function-density-threshold", cl::Optional, cl::init(1000),
    cl::desc("specify density threshold for hot functions (default: 1000)"));
|
||||
// When set, the computed profile density is printed; off by default.
static cl::opt<bool> ShowDensity("show-density", cl::Optional,
                                 cl::init(false),
                                 cl::desc("show profile density details"));
|
||||
|
||||
extern cl::opt<int> ProfileSummaryCutoffHot;
|
||||
|
||||
using namespace llvm;
|
||||
using namespace sampleprof;
|
||||
|
@ -127,6 +137,51 @@ void ProfileGeneratorBase::write() {
|
|||
write(std::move(WriterOrErr.get()), ProfileMap);
|
||||
}
|
||||
|
||||
void ProfileGeneratorBase::showDensitySuggestion(double Density) {
|
||||
if (Density == 0.0)
|
||||
WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
|
||||
"set too low. Please check your command.\n";
|
||||
else if (Density < HotFunctionDensityThreshold)
|
||||
WithColor::warning()
|
||||
<< "AutoFDO is estimated to optimize better with "
|
||||
<< format("%.1f", HotFunctionDensityThreshold / Density)
|
||||
<< "x more samples. Please consider increasing sampling rate or "
|
||||
"profiling for longer duration to get more samples.\n";
|
||||
|
||||
if (ShowDensity)
|
||||
outs() << "Minimum profile density for hot functions with top "
|
||||
<< format("%.2f",
|
||||
static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
|
||||
10000)
|
||||
<< "% total samples: " << format("%.1f", Density) << "\n";
|
||||
}
|
||||
|
||||
double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
|
||||
uint64_t HotCntThreshold) {
|
||||
double Density = DBL_MAX;
|
||||
std::vector<const FunctionSamples *> HotFuncs;
|
||||
for (auto &I : Profiles) {
|
||||
auto &FuncSamples = I.second;
|
||||
if (FuncSamples.getTotalSamples() < HotCntThreshold)
|
||||
continue;
|
||||
HotFuncs.emplace_back(&FuncSamples);
|
||||
}
|
||||
|
||||
for (auto *FuncSamples : HotFuncs) {
|
||||
auto *Func = Binary->getBinaryFunction(FuncSamples->getName());
|
||||
if (!Func)
|
||||
continue;
|
||||
uint64_t FuncSize = Func->getFuncSize();
|
||||
if (FuncSize == 0)
|
||||
continue;
|
||||
Density =
|
||||
std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
|
||||
FuncSize);
|
||||
}
|
||||
|
||||
return Density == DBL_MAX ? 0.0 : Density;
|
||||
}
|
||||
|
||||
void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
|
||||
const RangeSample &Ranges) {
|
||||
|
||||
|
@ -311,6 +366,12 @@ void ProfileGenerator::generateProfile() {
|
|||
} else {
|
||||
generateLineNumBasedProfile();
|
||||
}
|
||||
postProcessProfiles();
|
||||
}
|
||||
|
||||
// Post-processing step for the generated profile: computes the profile
// summary and threshold, then calculates and reports the density of
// ProfileMap.
void ProfileGenerator::postProcessProfiles() {
|
||||
// Must run first: the density calculation reads HotCountThreshold.
computeSummaryAndThreshold();
|
||||
calculateAndShowDensity(ProfileMap);
|
||||
}
|
||||
|
||||
void ProfileGenerator::generateLineNumBasedProfile() {
|
||||
|
@ -440,6 +501,12 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
|
|||
}
|
||||
}
|
||||
|
||||
void ProfileGeneratorBase::calculateAndShowDensity(
|
||||
const SampleProfileMap &Profiles) {
|
||||
double Density = calculateDensity(Profiles, HotCountThreshold);
|
||||
showDensitySuggestion(Density);
|
||||
}
|
||||
|
||||
FunctionSamples &CSProfileGenerator::getFunctionProfileForContext(
|
||||
const SampleContextFrameVector &Context, bool WasLeafInlined) {
|
||||
auto I = ProfileMap.find(SampleContext(Context));
|
||||
|
@ -664,9 +731,17 @@ void CSProfileGenerator::postProcessProfiles() {
|
|||
HotCountThreshold, CSProfTrimColdContext, CSProfMergeColdContext,
|
||||
CSProfMaxColdContextDepth, EnableCSPreInliner);
|
||||
}
|
||||
|
||||
// Merge function samples of CS profile to calculate profile density.
|
||||
sampleprof::SampleProfileMap ContextLessProfiles;
|
||||
for (const auto &I : ProfileMap) {
|
||||
ContextLessProfiles[I.second.getName()].merge(I.second);
|
||||
}
|
||||
|
||||
calculateAndShowDensity(ContextLessProfiles);
|
||||
}
|
||||
|
||||
void CSProfileGenerator::computeSummaryAndThreshold() {
|
||||
void ProfileGeneratorBase::computeSummaryAndThreshold() {
|
||||
SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
|
||||
auto Summary = Builder.computeSummaryForProfiles(ProfileMap);
|
||||
HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
|
||||
|
|
|
@ -75,7 +75,23 @@ protected:
|
|||
const SampleContextFrame &LeafLoc,
|
||||
uint64_t Count);
|
||||
void updateTotalSamples();
|
||||
|
||||
StringRef getCalleeNameForOffset(uint64_t TargetOffset);
|
||||
|
||||
void computeSummaryAndThreshold();
|
||||
|
||||
void calculateAndShowDensity(const SampleProfileMap &Profiles);
|
||||
|
||||
double calculateDensity(const SampleProfileMap &Profiles,
|
||||
uint64_t HotCntThreshold);
|
||||
|
||||
void showDensitySuggestion(double Density);
|
||||
|
||||
// Thresholds from profile summary to answer isHotCount/isColdCount queries.
|
||||
uint64_t HotCountThreshold;
|
||||
|
||||
uint64_t ColdCountThreshold;
|
||||
|
||||
// Used by SampleProfileWriter
|
||||
SampleProfileMap ProfileMap;
|
||||
|
||||
|
@ -104,6 +120,7 @@ private:
|
|||
void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter);
|
||||
void
|
||||
populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters);
|
||||
void postProcessProfiles();
|
||||
};
|
||||
|
||||
using ProbeCounterMap =
|
||||
|
@ -245,8 +262,6 @@ private:
|
|||
// and trimming cold profiles, running preinliner on profiles.
|
||||
void postProcessProfiles();
|
||||
|
||||
void computeSummaryAndThreshold();
|
||||
|
||||
void populateBodySamplesForFunction(FunctionSamples &FunctionProfile,
|
||||
const RangeSample &RangeCounters);
|
||||
void populateBoundarySamplesForFunction(SampleContextFrames ContextId,
|
||||
|
@ -269,9 +284,6 @@ private:
|
|||
FunctionSamples &
|
||||
getFunctionProfileForLeafProbe(SampleContextFrames ContextStack,
|
||||
const MCDecodedPseudoProbe *LeafProbe);
|
||||
// Thresholds from profile summary to answer isHotCount/isColdCount queries.
|
||||
uint64_t HotCountThreshold;
|
||||
uint64_t ColdCountThreshold;
|
||||
|
||||
// Underlying context table serves for sample profile writer.
|
||||
std::unordered_set<SampleContextFrameVector, SampleContextFrameHash> Contexts;
|
||||
|
|
|
@ -76,6 +76,14 @@ struct BinaryFunction {
|
|||
StringRef FuncName;
|
||||
// End of range is an exclusive bound.
|
||||
RangesTy Ranges;
|
||||
|
||||
uint64_t getFuncSize() {
|
||||
uint64_t Sum = 0;
|
||||
for (auto &R : Ranges) {
|
||||
Sum += R.second - R.first;
|
||||
}
|
||||
return Sum;
|
||||
}
|
||||
};
|
||||
|
||||
// Info about function range. A function can be split into multiple
|
||||
|
@ -406,6 +414,13 @@ public:
|
|||
return BinaryFunctions;
|
||||
}
|
||||
|
||||
// Looks up the function named \p FName in BinaryFunctions.
// Returns a pointer to the stored entry, or nullptr when there is none.
BinaryFunction *getBinaryFunction(StringRef FName) {
  auto Found = BinaryFunctions.find(FName.str());
  return Found != BinaryFunctions.end() ? &Found->second : nullptr;
}
|
||||
|
||||
// Returns the value FuncSizeTracker.getFuncSizeForContext reports for
// \p Context.
uint32_t getFuncSizeForContext(SampleContext &Context) {
  const uint32_t ContextSize = FuncSizeTracker.getFuncSizeForContext(Context);
  return ContextSize;
}
|
||||
|
|
Loading…
Reference in New Issue