[sanitizer] Introduce a new SizeClassMap with minimal amount of cached entries

Summary:
_Note_: I am not attached to the name `DenseSizeClassMap`, so if someone has a
better idea, feel free to suggest it.

The currently pre-defined `SizeClassMap`s hold a sizable amount of cached
entries, either in sheer number of entries or in amount of memory cached.

Empirical testing shows that more compact per-class arrays (whose sizes are
directly correlated to the number of cached entries) are beneficial to
performance, particularly in highly threaded environments.

The new proposed `SizeClassMap` has the following properties:
```
c00 => s: 0 diff: +0 00% l 0 cached: 0 0; id 0
c01 => s: 16 diff: +16 00% l 4 cached: 8 128; id 1
c02 => s: 32 diff: +16 100% l 5 cached: 8 256; id 2
c03 => s: 48 diff: +16 50% l 5 cached: 8 384; id 3
c04 => s: 64 diff: +16 33% l 6 cached: 8 512; id 4
c05 => s: 80 diff: +16 25% l 6 cached: 8 640; id 5
c06 => s: 96 diff: +16 20% l 6 cached: 8 768; id 6
c07 => s: 112 diff: +16 16% l 6 cached: 8 896; id 7

c08 => s: 128 diff: +16 14% l 7 cached: 8 1024; id 8
c09 => s: 144 diff: +16 12% l 7 cached: 7 1008; id 9
c10 => s: 160 diff: +16 11% l 7 cached: 6 960; id 10
c11 => s: 176 diff: +16 10% l 7 cached: 5 880; id 11
c12 => s: 192 diff: +16 09% l 7 cached: 5 960; id 12
c13 => s: 208 diff: +16 08% l 7 cached: 4 832; id 13
c14 => s: 224 diff: +16 07% l 7 cached: 4 896; id 14
c15 => s: 240 diff: +16 07% l 7 cached: 4 960; id 15

c16 => s: 256 diff: +16 06% l 8 cached: 4 1024; id 16
c17 => s: 320 diff: +64 25% l 8 cached: 3 960; id 49
c18 => s: 384 diff: +64 20% l 8 cached: 2 768; id 50
c19 => s: 448 diff: +64 16% l 8 cached: 2 896; id 51

c20 => s: 512 diff: +64 14% l 9 cached: 2 1024; id 48
c21 => s: 640 diff: +128 25% l 9 cached: 1 640; id 49
c22 => s: 768 diff: +128 20% l 9 cached: 1 768; id 50
c23 => s: 896 diff: +128 16% l 9 cached: 1 896; id 51

c24 => s: 1024 diff: +128 14% l 10 cached: 1 1024; id 48
c25 => s: 1280 diff: +256 25% l 10 cached: 1 1280; id 49
c26 => s: 1536 diff: +256 20% l 10 cached: 1 1536; id 50
c27 => s: 1792 diff: +256 16% l 10 cached: 1 1792; id 51

c28 => s: 2048 diff: +256 14% l 11 cached: 1 2048; id 48
c29 => s: 2560 diff: +512 25% l 11 cached: 1 2560; id 49
c30 => s: 3072 diff: +512 20% l 11 cached: 1 3072; id 50
c31 => s: 3584 diff: +512 16% l 11 cached: 1 3584; id 51

c32 => s: 4096 diff: +512 14% l 12 cached: 1 4096; id 48
c33 => s: 5120 diff: +1024 25% l 12 cached: 1 5120; id 49
c34 => s: 6144 diff: +1024 20% l 12 cached: 1 6144; id 50
c35 => s: 7168 diff: +1024 16% l 12 cached: 1 7168; id 51

c36 => s: 8192 diff: +1024 14% l 13 cached: 1 8192; id 48
c37 => s: 10240 diff: +2048 25% l 13 cached: 1 10240; id 49
c38 => s: 12288 diff: +2048 20% l 13 cached: 1 12288; id 50
c39 => s: 14336 diff: +2048 16% l 13 cached: 1 14336; id 51

c40 => s: 16384 diff: +2048 14% l 14 cached: 1 16384; id 48
c41 => s: 20480 diff: +4096 25% l 14 cached: 1 20480; id 49
c42 => s: 24576 diff: +4096 20% l 14 cached: 1 24576; id 50
c43 => s: 28672 diff: +4096 16% l 14 cached: 1 28672; id 51

c44 => s: 32768 diff: +4096 14% l 15 cached: 1 32768; id 48
c45 => s: 40960 diff: +8192 25% l 15 cached: 1 40960; id 49
c46 => s: 49152 diff: +8192 20% l 15 cached: 1 49152; id 50
c47 => s: 57344 diff: +8192 16% l 15 cached: 1 57344; id 51

c48 => s: 65536 diff: +8192 14% l 16 cached: 1 65536; id 48
c49 => s: 81920 diff: +16384 25% l 16 cached: 1 81920; id 49
c50 => s: 98304 diff: +16384 20% l 16 cached: 1 98304; id 50
c51 => s: 114688 diff: +16384 16% l 16 cached: 1 114688; id 51

c52 => s: 131072 diff: +16384 14% l 17 cached: 1 131072; id 48
c53 => s: 64 diff: +0 00% l 0 cached: 8 512; id 4
Total cached: 864928 (152/432)
```

It holds a bit less than 1MB of cached entries at most, and the cache fits in
a page.

The plan is to use this map by default for Scudo once we make sure that there
is no unforeseen impact on any of the current use cases.

Benchmarks show the largest performance increase (with Scudo) in highly
threaded/contended environments. For example, rcp2-benchmark
experiences a 10K QPS increase (~3%), and a decrease of 50MB for the max RSS
(~10%). On platforms like Android where we only have a couple of caches,
performance remains similar.

Reviewers: eugenis, kcc

Reviewed By: eugenis

Subscribers: kubamracek, delcypher, #sanitizers, llvm-commits

Differential Revision: https://reviews.llvm.org/D52371

llvm-svn: 343246
This commit is contained in:
Kostya Kortchinsky 2018-09-27 18:20:42 +00:00
parent 777bc9f924
commit 7685301d79
2 changed files with 22 additions and 0 deletions

View File

@ -232,3 +232,8 @@ class SizeClassMap {
// Pre-defined size class maps.
using DefaultSizeClassMap = SizeClassMap<3, 4, 8, 17, 128, 16>;
using CompactSizeClassMap = SizeClassMap<3, 4, 8, 17, 64, 14>;
using VeryCompactSizeClassMap = SizeClassMap<2, 5, 9, 16, 64, 14>;
// DenseSizeClassMap keeps only a very small number of cached entries per
// class. This allows for denser per-class arrays, a smaller memory footprint
// and usually better performance in threaded environments.
using DenseSizeClassMap = SizeClassMap<3, 4, 8, 17, 8, 10>;

View File

@ -89,11 +89,20 @@ struct AP64VeryCompact {
static const uptr kFlags = 0;
};
// Allocator parameters for the 64-bit size class allocator instantiated with
// DenseSizeClassMap (see Allocator64Dense).
struct AP64Dense {
  // Size class map and map/unmap callback used by the allocator.
  using SizeClassMap = DenseSizeClassMap;
  using MapUnmapCallback = NoOpMapUnmapCallback;
  // Address space region, taken from the test-wide allocator constants.
  static const uptr kSpaceBeg = kAllocatorSpace;
  static const uptr kSpaceSize = kAllocatorSize;
  static const uptr kMetadataSize = 16;
  // No optional allocator flags are set.
  static const uptr kFlags = 0;
};
// One SizeClassAllocator64 instantiation per parameter struct; each alias is
// what the tests below refer to.
using Allocator64 = SizeClassAllocator64<AP64>;
using Allocator64Dynamic = SizeClassAllocator64<AP64Dyn>;
using Allocator64Compact = SizeClassAllocator64<AP64Compact>;
using Allocator64VeryCompact = SizeClassAllocator64<AP64VeryCompact>;
using Allocator64Dense = SizeClassAllocator64<AP64Dense>;
#elif defined(__mips64)
static const u64 kAddressSpaceSize = 1ULL << 40;
#elif defined(__aarch64__)
@ -144,6 +153,10 @@ TEST(SanitizerCommon, InternalSizeClassMap) {
TestSizeClassMap<InternalSizeClassMap>();
}
// Runs the generic TestSizeClassMap checks against DenseSizeClassMap.
// The map under test must match the test name: instantiating the helper with
// VeryCompactSizeClassMap here would leave DenseSizeClassMap untested.
TEST(SanitizerCommon, DenseSizeClassMap) {
  TestSizeClassMap<DenseSizeClassMap>();
}
template <class Allocator>
void TestSizeClassAllocator() {
Allocator *a = new Allocator;
@ -234,6 +247,10 @@ TEST(SanitizerCommon, SizeClassAllocator64Compact) {
// Runs the generic TestSizeClassAllocator checks against the 64-bit allocator
// instantiated with the AP64VeryCompact parameters.
TEST(SanitizerCommon, SizeClassAllocator64VeryCompact) {
  TestSizeClassAllocator<Allocator64VeryCompact>();
}
// Runs the generic TestSizeClassAllocator checks against the 64-bit allocator
// instantiated with the AP64Dense parameters.
TEST(SanitizerCommon, SizeClassAllocator64Dense) {
  TestSizeClassAllocator<Allocator64Dense>();
}
#endif
#endif