tsan: optimize DenseSlabAlloc

If lots of threads do lots of malloc/free and overflow the per-pthread
DenseSlabAlloc cache, it causes lots of contention on the shared freelist mutex:

  31.97%  race.old  race.old            [.] __sanitizer::StaticSpinMutex::LockSlow
  17.61%  race.old  race.old            [.] __tsan_read4
  10.77%  race.old  race.old            [.] __tsan::SlotLock

Optimize DenseSlabAlloc to use a lock-free stack of batches of nodes.
This way we don't take any locks in steady state at all and do only
1 push/pop per Refill/Drain.
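
For illustration, a minimal standalone sketch of the same idea (not the tsan code
itself; names such as NodePool are made up): a Treiber-style stack of node batches
addressed by 32-bit indices, with an ABA counter packed into the upper 32 bits of a
single 64-bit atomic word. The real implementation in the diff below works on
DenseSlabAlloc's index space and falls back to a mutex-protected superblock
allocation when the stack is empty.

  // Sketch only: illustrative names, std::atomic instead of sanitizer atomics.
  #include <atomic>
  #include <cstdint>
  #include <vector>

  struct Node {
    uint32_t next;   // link between stack entries (batch heads)
    uint32_t batch;  // link between nodes inside one batch
  };

  class NodePool {
    std::vector<Node> nodes_;            // index 0 reserved as "null"
    std::atomic<uint64_t> freelist_{0};  // low 32 bits: head index, high 32 bits: ABA counter

   public:
    explicit NodePool(size_t n) : nodes_(n) {}

    // Push a pre-linked batch whose head is head_idx (linked via Node::batch).
    void PushBatch(uint32_t head_idx) {
      uint64_t cmp = freelist_.load(std::memory_order_acquire);
      uint64_t xchg;
      do {
        nodes_[head_idx].next = static_cast<uint32_t>(cmp);  // old stack head
        uint64_t counter = (cmp >> 32) + 1;                   // bump ABA counter
        xchg = (counter << 32) | head_idx;
      } while (!freelist_.compare_exchange_weak(cmp, xchg, std::memory_order_acq_rel));
    }

    // Pop one whole batch; returns its head index, or 0 if the stack is empty.
    uint32_t PopBatch() {
      uint64_t cmp = freelist_.load(std::memory_order_acquire);
      uint64_t xchg;
      uint32_t idx;
      do {
        idx = static_cast<uint32_t>(cmp);
        if (idx == 0)
          return 0;  // empty: caller falls back to a locked slow path
        xchg = (cmp & 0xffffffff00000000ull) | nodes_[idx].next;
      } while (!freelist_.compare_exchange_weak(cmp, xchg, std::memory_order_acq_rel));
      return idx;
    }
  };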

Effect on the added benchmark (columns: elapsed seconds, user CPU seconds,
system CPU seconds, max RSS in KB):

$ TIME="%e %U %S %M" time ./test.old 36 5 2000000
34.51 978.22 175.67 5833592
32.53 891.73 167.03 5790036
36.17 1005.54 201.24 5802828
36.94 1004.76 226.58 5803188

$ TIME="%e %U %S %M" time ./test.new 36 5 2000000
26.44 720.99 13.45 5750704
25.92 721.98 13.58 5767764
26.33 725.15 13.41 5777936
25.93 713.49 13.41 5791796

Reviewed By: melver

Differential Revision: https://reviews.llvm.org/D130002
Author: Dmitry Vyukov
Date:   2022-07-16 11:48:18 +02:00
Parent: 71c502cbca
Commit: 6d1f86095d
2 changed files with 95 additions and 42 deletions


@@ -85,14 +85,7 @@ class DenseSlabAlloc {
   }

   void FlushCache(Cache *c) {
-    if (!c->pos)
-      return;
-    SpinMutexLock lock(&mtx_);
-    while (c->pos) {
-      IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
-    }
+    while (c->pos) Drain(c);
   }

   void InitCache(Cache *c) {
@@ -106,7 +99,7 @@ class DenseSlabAlloc {
   template <typename Func>
   void ForEach(Func func) {
-    SpinMutexLock lock(&mtx_);
+    Lock lock(&mtx_);
     uptr fillpos = atomic_load_relaxed(&fillpos_);
     for (uptr l1 = 0; l1 < fillpos; l1++) {
       for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
     }
@@ -115,48 +108,86 @@ class DenseSlabAlloc {
  private:
   T *map_[kL1Size];
-  SpinMutex mtx_;
-  IndexT freelist_ = {0};
+  Mutex mtx_;
+  // The freelist is organized as a lock-free stack of batches of nodes.
+  // The stack itself uses Block::next links, while the batch within each
+  // stack node uses Block::batch links.
+  // Low 32-bits of freelist_ is the node index, top 32-bits is ABA-counter.
+  atomic_uint64_t freelist_ = {0};
   atomic_uintptr_t fillpos_ = {0};
   const char *const name_;

-  void Refill(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    if (freelist_ == 0) {
-      uptr fillpos = atomic_load_relaxed(&fillpos_);
-      if (fillpos == kL1Size) {
-        Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n",
-            name_, kL1Size, kL2Size);
-        Die();
-      }
-      VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
-          fillpos, kL1Size, kL2Size);
-      T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_);
-      // Reserve 0 as invalid index.
-      IndexT start = fillpos == 0 ? 1 : 0;
-      for (IndexT i = start; i < kL2Size; i++) {
-        new(batch + i) T;
-        *(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size;
-      }
-      *(IndexT*)(batch + kL2Size - 1) = 0;
-      freelist_ = fillpos * kL2Size + start;
-      map_[fillpos] = batch;
-      atomic_store_relaxed(&fillpos_, fillpos + 1);
-    }
-    for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
-      IndexT idx = freelist_;
+  struct Block {
+    IndexT next;
+    IndexT batch;
+  };
+
+  Block *MapBlock(IndexT idx) { return reinterpret_cast<Block *>(Map(idx)); }
+
+  static constexpr u64 kCounterInc = 1ull << 32;
+  static constexpr u64 kCounterMask = ~(kCounterInc - 1);
+
+  NOINLINE void Refill(Cache *c) {
+    // Pop 1 batch of nodes from the freelist.
+    IndexT idx;
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      idx = static_cast<IndexT>(cmp);
+      if (!idx)
+        return AllocSuperBlock(c);
+      Block *ptr = MapBlock(idx);
+      xchg = ptr->next | (cmp & kCounterMask);
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+    // Unpack it into c->cache.
+    while (idx) {
       c->cache[c->pos++] = idx;
-      freelist_ = *(IndexT*)Map(idx);
+      idx = MapBlock(idx)->batch;
     }
   }

-  void Drain(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    for (uptr i = 0; i < Cache::kSize / 2; i++) {
+  NOINLINE void Drain(Cache *c) {
+    // Build a batch of at most Cache::kSize / 2 nodes linked by Block::batch.
+    IndexT head_idx = 0;
+    for (uptr i = 0; i < Cache::kSize / 2 && c->pos; i++) {
       IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
+      Block *ptr = MapBlock(idx);
+      ptr->batch = head_idx;
+      head_idx = idx;
     }
+    // Push it onto the freelist stack.
+    Block *head = MapBlock(head_idx);
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      head->next = static_cast<IndexT>(cmp);
+      xchg = head_idx | ((cmp & kCounterMask) + kCounterInc);
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+  }
+
+  NOINLINE void AllocSuperBlock(Cache *c) {
+    Lock lock(&mtx_);
+    uptr fillpos = atomic_load_relaxed(&fillpos_);
+    if (fillpos == kL1Size) {
+      Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size,
+             kL2Size);
+      Die();
+    }
+    VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
+            fillpos, kL1Size, kL2Size);
+    T *batch = (T *)MmapOrDie(kL2Size * sizeof(T), name_);
+    map_[fillpos] = batch;
+    // Reserve 0 as invalid index.
+    for (IndexT i = fillpos ? 0 : 1; i < kL2Size; i++) {
+      new (batch + i) T;
+      c->cache[c->pos++] = i + fillpos * kL2Size;
+      if (c->pos == Cache::kSize)
+        Drain(c);
+    }
+    atomic_store_relaxed(&fillpos_, fillpos + 1);
+    CHECK(c->pos);
   }
 };
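
For context, here is a hedged sketch of the fast path these helpers serve. Alloc/Free
are not touched by this diff, so treat the exact shape below as an assumption about the
surrounding DenseSlabAlloc code, not a quote; the point is that in steady state both
calls touch only the per-thread cache, and Refill/Drain (one lock-free pop/push each)
run only when the cache runs empty or full.

  // Assumed shape of the per-thread cache fast path (illustrative, not from this diff).
  IndexT Alloc(Cache *c) {
    if (c->pos == 0)
      Refill(c);               // pops one batch from the lock-free freelist
    return c->cache[--c->pos];
  }

  void Free(Cache *c, IndexT idx) {
    if (c->pos == Cache::kSize)
      Drain(c);                // pushes half of the cache back as one batch
    c->cache[c->pos++] = idx;
  }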


@@ -0,0 +1,22 @@
+// RUN: %clangxx_tsan %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// bench.h needs pthread barriers which are not available on OS X
+// UNSUPPORTED: darwin
+
+#include "bench.h"
+
+void thread(int tid) {
+  void **blocks = new void *[bench_mode];
+  for (int i = 0; i < bench_niter; i++) {
+    for (int j = 0; j < bench_mode; j++)
+      blocks[j] = malloc(8);
+    for (int j = 0; j < bench_mode; j++)
+      free(blocks[j]);
+  }
+  delete[] blocks;
+}
+
+void bench() { start_thread_group(bench_nthread, thread); }
+
+// CHECK: DONE