llvm-project/compiler-rt/lib/profile/InstrProfilingValue.c

372 lines
13 KiB
C

/*===- InstrProfilingValue.c - Support library for PGO instrumentation ----===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "InstrProfiling.h"
#include "InstrProfilingInternal.h"
#include "InstrProfilingUtil.h"
#define INSTR_PROF_VALUE_PROF_DATA
#define INSTR_PROF_COMMON_API_IMPL
#include "InstrProfData.inc"
static int hasStaticCounters = 1;
static int OutOfNodesWarnings = 0;
static int hasNonDefaultValsPerSite = 0;
#define INSTR_PROF_MAX_VP_WARNS 10
#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 16
#define INSTR_PROF_VNODE_POOL_SIZE 1024
#ifndef _MSC_VER
/* A shared static pool in addition to the vnodes statically
* allocated by the compiler. */
COMPILER_RT_VISIBILITY ValueProfNode
lprofValueProfNodes[INSTR_PROF_VNODE_POOL_SIZE] COMPILER_RT_SECTION(
COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME);
#endif
COMPILER_RT_VISIBILITY uint32_t VPMaxNumValsPerSite =
INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE;
COMPILER_RT_VISIBILITY void lprofSetupValueProfiler() {
const char *Str = 0;
Str = getenv("LLVM_VP_MAX_NUM_VALS_PER_SITE");
if (Str && Str[0]) {
VPMaxNumValsPerSite = atoi(Str);
hasNonDefaultValsPerSite = 1;
}
if (VPMaxNumValsPerSite > INSTR_PROF_MAX_NUM_VAL_PER_SITE)
VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
}
COMPILER_RT_VISIBILITY void lprofSetMaxValsPerSite(uint32_t MaxVals) {
VPMaxNumValsPerSite = MaxVals;
hasNonDefaultValsPerSite = 1;
}
/* This method is only used in value profiler mock testing. */
COMPILER_RT_VISIBILITY void
__llvm_profile_set_num_value_sites(__llvm_profile_data *Data,
uint32_t ValueKind, uint16_t NumValueSites) {
*((uint16_t *)&Data->NumValueSites[ValueKind]) = NumValueSites;
}
/* This method is only used in value profiler mock testing. */
COMPILER_RT_VISIBILITY const __llvm_profile_data *
__llvm_profile_iterate_data(const __llvm_profile_data *Data) {
return Data + 1;
}
/* This method is only used in value profiler mock testing. */
COMPILER_RT_VISIBILITY void *
__llvm_get_function_addr(const __llvm_profile_data *Data) {
return Data->FunctionPointer;
}
/* Allocate an array that holds the pointers to the linked lists of
* value profile counter nodes. The number of element of the array
* is the total number of value profile sites instrumented. Returns
* 0 if allocation fails.
*/
static int allocateValueProfileCounters(__llvm_profile_data *Data) {
uint64_t NumVSites = 0;
uint32_t VKI;
/* This function will never be called when value site array is allocated
statically at compile time. */
hasStaticCounters = 0;
/* When dynamic allocation is enabled, allow tracking the max number of
* values allowd. */
if (!hasNonDefaultValsPerSite)
VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
for (VKI = IPVK_First; VKI <= IPVK_Last; ++VKI)
NumVSites += Data->NumValueSites[VKI];
ValueProfNode **Mem =
(ValueProfNode **)calloc(NumVSites, sizeof(ValueProfNode *));
if (!Mem)
return 0;
if (!COMPILER_RT_BOOL_CMPXCHG(&Data->Values, 0, Mem)) {
free(Mem);
return 0;
}
return 1;
}
static ValueProfNode *allocateOneNode(void) {
ValueProfNode *Node;
if (!hasStaticCounters)
return (ValueProfNode *)calloc(1, sizeof(ValueProfNode));
/* Early check to avoid value wrapping around. */
if (CurrentVNode + 1 > EndVNode) {
if (OutOfNodesWarnings++ < INSTR_PROF_MAX_VP_WARNS) {
PROF_WARN("Unable to track new values: %s. "
" Consider using option -mllvm -vp-counters-per-site=<n> to "
"allocate more"
" value profile counters at compile time. \n",
"Running out of static counters");
}
return 0;
}
Node = COMPILER_RT_PTR_FETCH_ADD(ValueProfNode, CurrentVNode, 1);
/* Due to section padding, EndVNode point to a byte which is one pass
* an incomplete VNode, so we need to skip the last incomplete node. */
if (Node + 1 > EndVNode)
return 0;
return Node;
}
static COMPILER_RT_ALWAYS_INLINE void
instrumentTargetValueImpl(uint64_t TargetValue, void *Data,
uint32_t CounterIndex, uint64_t CountValue) {
__llvm_profile_data *PData = (__llvm_profile_data *)Data;
if (!PData)
return;
if (!CountValue)
return;
if (!PData->Values) {
if (!allocateValueProfileCounters(PData))
return;
}
ValueProfNode **ValueCounters = (ValueProfNode **)PData->Values;
ValueProfNode *PrevVNode = NULL;
ValueProfNode *MinCountVNode = NULL;
ValueProfNode *CurVNode = ValueCounters[CounterIndex];
uint64_t MinCount = UINT64_MAX;
uint8_t VDataCount = 0;
while (CurVNode) {
if (TargetValue == CurVNode->Value) {
CurVNode->Count += CountValue;
return;
}
if (CurVNode->Count < MinCount) {
MinCount = CurVNode->Count;
MinCountVNode = CurVNode;
}
PrevVNode = CurVNode;
CurVNode = CurVNode->Next;
++VDataCount;
}
if (VDataCount >= VPMaxNumValsPerSite) {
/* Bump down the min count node's count. If it reaches 0,
* evict it. This eviction/replacement policy makes hot
* targets more sticky while cold targets less so. In other
* words, it makes it less likely for the hot targets to be
* prematurally evicted during warmup/establishment period,
* when their counts are still low. In a special case when
* the number of values tracked is reduced to only one, this
* policy will guarantee that the dominating target with >50%
* total count will survive in the end. Note that this scheme
* allows the runtime to track the min count node in an adaptive
* manner. It can correct previous mistakes and eventually
* lock on a cold target that is alread in stable state.
*
* In very rare cases, this replacement scheme may still lead
* to target loss. For instance, out of \c N value slots, \c N-1
* slots are occupied by luke warm targets during the warmup
* period and the remaining one slot is competed by two or more
* very hot targets. If those hot targets occur in an interleaved
* way, none of them will survive (gain enough weight to throw out
* other established entries) due to the ping-pong effect.
* To handle this situation, user can choose to increase the max
* number of tracked values per value site. Alternatively, a more
* expensive eviction mechanism can be implemented. It requires
* the runtime to track the total number of evictions per-site.
* When the total number of evictions reaches certain threshold,
* the runtime can wipe out more than one lowest count entries
* to give space for hot targets.
*/
if (MinCountVNode->Count <= CountValue) {
CurVNode = MinCountVNode;
CurVNode->Value = TargetValue;
CurVNode->Count = CountValue;
} else
MinCountVNode->Count -= CountValue;
return;
}
CurVNode = allocateOneNode();
if (!CurVNode)
return;
CurVNode->Value = TargetValue;
CurVNode->Count += CountValue;
uint32_t Success = 0;
if (!ValueCounters[CounterIndex])
Success =
COMPILER_RT_BOOL_CMPXCHG(&ValueCounters[CounterIndex], 0, CurVNode);
else if (PrevVNode && !PrevVNode->Next)
Success = COMPILER_RT_BOOL_CMPXCHG(&(PrevVNode->Next), 0, CurVNode);
if (!Success && !hasStaticCounters) {
free(CurVNode);
return;
}
}
COMPILER_RT_VISIBILITY void
__llvm_profile_instrument_target(uint64_t TargetValue, void *Data,
uint32_t CounterIndex) {
instrumentTargetValueImpl(TargetValue, Data, CounterIndex, 1);
}
COMPILER_RT_VISIBILITY void
__llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data,
uint32_t CounterIndex,
uint64_t CountValue) {
instrumentTargetValueImpl(TargetValue, Data, CounterIndex, CountValue);
}
/*
* The target values are partitioned into multiple regions/ranges. There is one
* contiguous region which is precise -- every value in the range is tracked
* individually. A value outside the precise region will be collapsed into one
* value depending on the region it falls in.
*
* There are three regions:
* 1. (-inf, PreciseRangeStart) and (PreciseRangeLast, LargeRangeValue) belong
* to one region -- all values here should be mapped to one value of
* "PreciseRangeLast + 1".
* 2. [PreciseRangeStart, PreciseRangeLast]
* 3. Large values: [LargeValue, +inf) maps to one value of LargeValue.
*
* The range for large values is optional. The default value of INT64_MIN
* indicates it is not specified.
*/
COMPILER_RT_VISIBILITY void __llvm_profile_instrument_range(
uint64_t TargetValue, void *Data, uint32_t CounterIndex,
int64_t PreciseRangeStart, int64_t PreciseRangeLast, int64_t LargeValue) {
if (LargeValue != INT64_MIN && (int64_t)TargetValue >= LargeValue)
TargetValue = LargeValue;
else if ((int64_t)TargetValue < PreciseRangeStart ||
(int64_t)TargetValue > PreciseRangeLast)
TargetValue = PreciseRangeLast + 1;
__llvm_profile_instrument_target(TargetValue, Data, CounterIndex);
}
/*
* A wrapper struct that represents value profile runtime data.
* Like InstrProfRecord class which is used by profiling host tools,
* ValueProfRuntimeRecord also implements the abstract intefaces defined in
* ValueProfRecordClosure so that the runtime data can be serialized using
* shared C implementation.
*/
typedef struct ValueProfRuntimeRecord {
const __llvm_profile_data *Data;
ValueProfNode **NodesKind[IPVK_Last + 1];
uint8_t **SiteCountArray;
} ValueProfRuntimeRecord;
/* ValueProfRecordClosure Interface implementation. */
static uint32_t getNumValueSitesRT(const void *R, uint32_t VK) {
return ((const ValueProfRuntimeRecord *)R)->Data->NumValueSites[VK];
}
static uint32_t getNumValueDataRT(const void *R, uint32_t VK) {
uint32_t S = 0, I;
const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
if (Record->SiteCountArray[VK] == INSTR_PROF_NULLPTR)
return 0;
for (I = 0; I < Record->Data->NumValueSites[VK]; I++)
S += Record->SiteCountArray[VK][I];
return S;
}
static uint32_t getNumValueDataForSiteRT(const void *R, uint32_t VK,
uint32_t S) {
const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
return Record->SiteCountArray[VK][S];
}
static ValueProfRuntimeRecord RTRecord;
static ValueProfRecordClosure RTRecordClosure = {
&RTRecord, INSTR_PROF_NULLPTR, /* GetNumValueKinds */
getNumValueSitesRT, getNumValueDataRT, getNumValueDataForSiteRT,
INSTR_PROF_NULLPTR, /* RemapValueData */
INSTR_PROF_NULLPTR, /* GetValueForSite, */
INSTR_PROF_NULLPTR /* AllocValueProfData */
};
static uint32_t
initializeValueProfRuntimeRecord(const __llvm_profile_data *Data,
uint8_t *SiteCountArray[]) {
unsigned I, J, S = 0, NumValueKinds = 0;
ValueProfNode **Nodes = (ValueProfNode **)Data->Values;
RTRecord.Data = Data;
RTRecord.SiteCountArray = SiteCountArray;
for (I = 0; I <= IPVK_Last; I++) {
uint16_t N = Data->NumValueSites[I];
if (!N)
continue;
NumValueKinds++;
RTRecord.NodesKind[I] = Nodes ? &Nodes[S] : INSTR_PROF_NULLPTR;
for (J = 0; J < N; J++) {
/* Compute value count for each site. */
uint32_t C = 0;
ValueProfNode *Site =
Nodes ? RTRecord.NodesKind[I][J] : INSTR_PROF_NULLPTR;
while (Site) {
C++;
Site = Site->Next;
}
if (C > UCHAR_MAX)
C = UCHAR_MAX;
RTRecord.SiteCountArray[I][J] = C;
}
S += N;
}
return NumValueKinds;
}
static ValueProfNode *getNextNValueData(uint32_t VK, uint32_t Site,
InstrProfValueData *Dst,
ValueProfNode *StartNode, uint32_t N) {
unsigned I;
ValueProfNode *VNode = StartNode ? StartNode : RTRecord.NodesKind[VK][Site];
for (I = 0; I < N; I++) {
Dst[I].Value = VNode->Value;
Dst[I].Count = VNode->Count;
VNode = VNode->Next;
}
return VNode;
}
static uint32_t getValueProfDataSizeWrapper(void) {
return getValueProfDataSize(&RTRecordClosure);
}
static uint32_t getNumValueDataForSiteWrapper(uint32_t VK, uint32_t S) {
return getNumValueDataForSiteRT(&RTRecord, VK, S);
}
static VPDataReaderType TheVPDataReader = {
initializeValueProfRuntimeRecord, getValueProfRecordHeaderSize,
getFirstValueProfRecord, getNumValueDataForSiteWrapper,
getValueProfDataSizeWrapper, getNextNValueData};
COMPILER_RT_VISIBILITY VPDataReaderType *lprofGetVPDataReader() {
return &TheVPDataReader;
}