//===-- working_set.cpp ---------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of EfficiencySanitizer, a family of performance tuners.
//
// This file contains working-set-specific code.
//===----------------------------------------------------------------------===//

#include "working_set.h"
#include "esan.h"
#include "esan_circular_buffer.h"
#include "esan_flags.h"
#include "esan_shadow.h"
#include "esan_sideline.h"
#include "sanitizer_common/sanitizer_procmaps.h"

// We shadow every cache line of app memory with one shadow byte.
// - The highest bit of each shadow byte indicates whether the corresponding
//   cache line has ever been accessed.
// - The lowest bit of each shadow byte indicates whether the corresponding
//   cache line was accessed since the last sample.
// - The other bits are used for working set snapshots at successively
//   lower frequencies, each bit to the left from the lowest bit stepping
//   down the frequency by 2 to the power of getFlags()->snapshot_step.
// Thus we have something like this:
// Bit 0: Since last sample
// Bit 1: Since last 2^2 samples
// Bit 2: Since last 2^4 samples
// Bit 3: ...
// Bit 7: Ever accessed.
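// For example, the instrumentation below writes 0x81 (bits 7 and 0) into a
// shadow byte: bit 7 records that the cache line has been accessed at some
// point during execution, and bit 0 that it has been accessed since the last
// sample.  When takeSample() processes bit N it copies it one position to the
// left (when snapshots are enabled) before clearing it, which is how the
// longer-window bits get filled in.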
// We live with races in accessing each shadow byte.
typedef unsigned char byte;

namespace __esan {

// Our shadow memory assumes that the line size is 64.
static const u32 CacheLineSize = 64;

// See the shadow byte layout description above.
static const u32 TotalWorkingSetBitIdx = 7;
// We accumulate to the left until we hit this bit.
// We don't need to accumulate to the final bit as it's set on each ref
// by the compiler instrumentation.
static const u32 MaxAccumBitIdx = 6;
static const u32 CurWorkingSetBitIdx = 0;
static const byte ShadowAccessedVal =
  (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);

static SidelineThread Thread;
// If we use real-time-based timer samples this won't overflow in any realistic
// scenario, but if we switch to some other unit (such as memory accesses) we
// may want to consider a 64-bit int.
static u32 SnapshotNum;

// We store the wset size for each of 8 different sampling frequencies.
static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
// We cannot use static objects as the global destructor is called
// prior to our finalize routine.
// These are each circular buffers, sized up front.
CircularBuffer<u32> SizePerFreq[NumFreq];
// We cannot rely on static initializers (they may run too late) but
// we record the size here for clarity:
u32 CircularBufferSizes[NumFreq] = {
  // These are each mmap-ed so our minimum is one page.
  32*1024,
  16*1024,
  8*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
};

void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
                                  bool IsWrite) {
  if (Size == 0)
    return;
  SIZE_T I = 0;
  uptr LineSize = getFlags()->cache_line_size;
  // As Addr+Size could overflow at the top of a 32-bit address space,
  // we avoid the simpler formula that rounds the start and end.
  SIZE_T NumLines = Size / LineSize +
    // Add any extra at the start or end adding on an extra line:
    (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
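  // For example, with LineSize 64: an access with Addr % LineSize == 60 and
  // Size == 8 touches bytes 60..67 and straddles a line boundary, giving
  // 8/64 + (63 + 60 + 8)/64 == 0 + 2 == 2 lines, while an aligned 64-byte
  // access gives 64/64 + 63/64 == 1 line.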
  byte *Shadow = (byte *)appToShadow(Addr);
  // Write shadow bytes until we're word-aligned.
  while (I < NumLines && (uptr)Shadow % 4 != 0) {
    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
      *Shadow |= ShadowAccessedVal;
    ++Shadow;
    ++I;
  }
  // Write whole shadow words at a time.
  // Using a word-stride loop improves the runtime of a microbenchmark of
  // memset calls by 10%.
  u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
      ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
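  // WordValue is ShadowAccessedVal (0x81) replicated into each byte of the
  // word, i.e. 0x81818181, so four shadow bytes are updated per store.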
  while (I + 4 <= NumLines) {
    if ((*(u32*)Shadow & WordValue) != WordValue)
      *(u32*)Shadow |= WordValue;
    Shadow += 4;
    I += 4;
  }
  // Write any trailing shadow bytes.
  while (I < NumLines) {
    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
      *Shadow |= ShadowAccessedVal;
    ++Shadow;
    ++I;
  }
}

// This routine will word-align ShadowStart and ShadowEnd prior to scanning.
// It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
// measures the access during the entire execution and should never be cleared.
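// For example, a call with BitIdx == 0 counts every shadow byte with bit 0
// set, copies that bit into bit 1 (when accumulating), and then clears bit 0
// across the whole word.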
static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
                                     uptr ShadowEnd) {
  u32 WorkingSetSize = 0;
  u32 ByteValue = 0x1 << BitIdx;
  u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
      ByteValue << 24;
  // Get word aligned start.
  ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
  bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
  // Do not clear the bit that measures access during the entire execution.
  bool Clear = BitIdx < TotalWorkingSetBitIdx;
  for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
    if ((*Ptr & WordValue) != 0) {
      byte *BytePtr = (byte *)Ptr;
      for (u32 j = 0; j < sizeof(u32); ++j) {
        if (BytePtr[j] & ByteValue) {
          ++WorkingSetSize;
          if (Accum) {
            // Accumulate to the lower-frequency bit to the left.
            BytePtr[j] |= (ByteValue << 1);
          }
        }
      }
      if (Clear) {
        // Clear this bit from every shadow byte.
        *Ptr &= ~WordValue;
      }
    }
  }
  return WorkingSetSize;
}

// Scan shadow memory to calculate the number of cache lines being accessed,
// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
// We also clear the lowest bits (most recent working set snapshot).
// We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
// measures the access during the entire execution and should never be cleared.
static u32 computeWorkingSizeAndReset(u32 BitIdx) {
  u32 WorkingSetSize = 0;
  MemoryMappingLayout MemIter(true/*cache*/);
  MemoryMappedSegment Segment;
  while (MemIter.Next(&Segment)) {
    VPrintf(4, "%s: considering %p-%p prot=%u app=%d shadow=%d\n",
            __FUNCTION__, Segment.start, Segment.end, Segment.protection,
            isAppMem(Segment.start), isShadowMem(Segment.start));
    if (isShadowMem(Segment.start) && Segment.IsWritable()) {
      VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Segment.start,
              Segment.end);
      WorkingSetSize +=
          countAndClearShadowValues(BitIdx, Segment.start, Segment.end);
    }
  }
  return WorkingSetSize;
}

// This is invoked from a signal handler but in a sideline thread doing nothing
// else so it is a little less fragile than a typical signal handler.
static void takeSample(void *Arg) {
  u32 BitIdx = CurWorkingSetBitIdx;
  u32 Freq = 1;
  ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
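  // Each successive bit is processed 2^snapshot_step times less often; with
  // snapshot_step == 2 (as in the bit layout comment above), sample #16
  // processes bit 0 (every sample), bit 1 (every 4th) and bit 2 (every 16th).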
  while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
    u32 NumLines = computeWorkingSizeAndReset(BitIdx);
    VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
            SnapshotNum, BitIdx, Freq, NumLines);
    SizePerFreq[BitIdx].push_back(NumLines);
    Freq = Freq << getFlags()->snapshot_step;
    BitIdx++;
  }
}

unsigned int getSampleCountWorkingSet() {
  return SnapshotNum;
}

// Initialization that must be done before any instrumented code is executed.
void initializeShadowWorkingSet() {
  CHECK(getFlags()->cache_line_size == CacheLineSize);
  registerMemoryFaultHandler();
}

void initializeWorkingSet() {
  if (getFlags()->record_snapshots) {
    for (u32 i = 0; i < NumFreq; ++i)
      SizePerFreq[i].initialize(CircularBufferSizes[i]);
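    // sample_freq is treated as a period in milliseconds; getPeriodForPrinting
    // below makes the same assumption when the sampling periods are reported.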
    Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
  }
}

static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
  if (MilliSec > 600000) {
    Unit = "min";
    return MilliSec / 60000;
  } else if (MilliSec > 10000) {
    Unit = "sec";
    return MilliSec / 1000;
  } else {
    Unit = "ms";
    return MilliSec;
  }
}

static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
  // We need a constant to avoid software divide support:
  static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
  static const u32 MegabyteCachelines = KilobyteCachelines << 10;
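  // With CacheLineSize == 64 these are 16 and 16384, i.e. the number of cache
  // lines per KB and per MB.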

  if (NumOfCachelines > 10 * MegabyteCachelines) {
    Unit = "MB";
    return NumOfCachelines / MegabyteCachelines;
  } else if (NumOfCachelines > 10 * KilobyteCachelines) {
    Unit = "KB";
    return NumOfCachelines / KilobyteCachelines;
  } else {
    Unit = "Bytes";
    return NumOfCachelines * CacheLineSize;
  }
}

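// The report produced below looks roughly like this (values are illustrative
// only, derived from the Report() format strings in this function):
//   Total number of samples: 17
//   Samples array #0 at period 20 ms
// #   0:      480 KB (     7680 cache lines)
//   ...
//  EfficiencySanitizer: the total working set size: 20 MB (327680 cache lines)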
void reportWorkingSet() {
  const char *Unit;
  if (getFlags()->record_snapshots) {
    u32 Freq = 1;
    Report(" Total number of samples: %u\n", SnapshotNum);
    for (u32 i = 0; i < NumFreq; ++i) {
      u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
      Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
      // FIXME: report whether we wrapped around and thus whether we
      // have data on the whole run or just the last N samples.
      for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
        u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
        Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
               SizePerFreq[i][j]);
      }
      Freq = Freq << getFlags()->snapshot_step;
    }
  }

  // Get the working set size for the entire execution.
  u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
  u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
  Report(" %s: the total working set size: %u %s (%u cache lines)\n",
         SanitizerToolName, Size, Unit, NumOfCachelines);
}

int finalizeWorkingSet() {
  if (getFlags()->record_snapshots)
    Thread.joinThread();
  reportWorkingSet();
  if (getFlags()->record_snapshots) {
    for (u32 i = 0; i < NumFreq; ++i)
      SizePerFreq[i].free();
  }
  return 0;
}

} // namespace __esan