tsan: optimize DenseSlabAlloc
If lots of threads do lots of malloc/free and they overflow the per-pthread
DenseSlabAlloc cache, it causes lots of contention:

31.97% race.old race.old [.] __sanitizer::StaticSpinMutex::LockSlow
17.61% race.old race.old [.] __tsan_read4
10.77% race.old race.old [.] __tsan::SlotLock

Optimize DenseSlabAlloc to use a lock-free stack of batches of nodes.
This way we don't take any locks in steady state at all and do only
1 push/pop per Refill/Drain.

Effect on the added benchmark:

$ TIME="%e %U %S %M" time ./test.old 36 5 2000000
34.51 978.22 175.67 5833592
32.53 891.73 167.03 5790036
36.17 1005.54 201.24 5802828
36.94 1004.76 226.58 5803188

$ TIME="%e %U %S %M" time ./test.new 36 5 2000000
26.44 720.99 13.45 5750704
25.92 721.98 13.58 5767764
26.33 725.15 13.41 5777936
25.93 713.49 13.41 5791796

Reviewed By: melver

Differential Revision: https://reviews.llvm.org/D130002
commit 6d1f86095d (parent 71c502cbca)
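Before the diff itself, here is a minimal standalone sketch (not the patch, and not the sanitizer's own atomics) of the core trick it relies on: a lock-free Treiber stack whose 64-bit head packs a 32-bit node index in the low bits and a 32-bit ABA counter in the high bits, manipulated with a compare-and-swap loop. The names (Pool, Push, Pop) and the use of std::atomic are illustrative assumptions; in the actual patch each stack element is additionally the head of a batch of nodes linked through Block::batch, so a single push in Drain or pop in Refill transfers up to Cache::kSize / 2 nodes at once.

// Standalone illustration only; invented names, std::atomic instead of the
// sanitizer's atomic wrappers.
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Node {
  uint32_t next = 0;  // index of the next stack element; 0 means "end of stack"
};

class Pool {
  static constexpr uint64_t kCounterInc = 1ull << 32;
  static constexpr uint64_t kCounterMask = ~(kCounterInc - 1);

  std::vector<Node> nodes_;        // slot 0 is reserved as the invalid index
  std::atomic<uint64_t> head_{0};  // low 32 bits: node index, high 32 bits: ABA counter

 public:
  explicit Pool(uint32_t n) : nodes_(n + 1) {}

  void Push(uint32_t idx) {
    uint64_t cmp = head_.load(std::memory_order_acquire);
    uint64_t xchg;
    do {
      // Link the new element to the current head and bump the ABA counter.
      nodes_[idx].next = static_cast<uint32_t>(cmp);
      xchg = idx | ((cmp & kCounterMask) + kCounterInc);
    } while (!head_.compare_exchange_weak(cmp, xchg, std::memory_order_acq_rel));
  }

  uint32_t Pop() {
    uint64_t cmp = head_.load(std::memory_order_acquire);
    uint64_t xchg;
    uint32_t idx;
    do {
      idx = static_cast<uint32_t>(cmp);
      if (!idx)
        return 0;  // stack is empty
      // The counter is left unchanged on pop; only pushes bump it, which is
      // enough to catch a concurrent pop/reuse/re-push of the same index.
      xchg = nodes_[idx].next | (cmp & kCounterMask);
    } while (!head_.compare_exchange_weak(cmp, xchg, std::memory_order_acq_rel));
    return idx;
  }
};

int main() {
  Pool pool(4);
  for (uint32_t i = 1; i <= 4; i++) pool.Push(i);
  for (uint32_t idx; (idx = pool.Pop()) != 0;) printf("popped %u\n", idx);
}

The counter in the high bits is what lets the compare-exchange detect that the head was popped and re-pushed between the load and the swap, which a bare 32-bit index could not.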
@@ -85,14 +85,7 @@ class DenseSlabAlloc {
   }
 
   void FlushCache(Cache *c) {
-    if (!c->pos)
-      return;
-    SpinMutexLock lock(&mtx_);
-    while (c->pos) {
-      IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
-    }
+    while (c->pos) Drain(c);
   }
 
   void InitCache(Cache *c) {
@@ -106,7 +99,7 @@ class DenseSlabAlloc {
   template <typename Func>
   void ForEach(Func func) {
-    SpinMutexLock lock(&mtx_);
+    Lock lock(&mtx_);
     uptr fillpos = atomic_load_relaxed(&fillpos_);
     for (uptr l1 = 0; l1 < fillpos; l1++) {
       for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
@@ -115,48 +108,86 @@ class DenseSlabAlloc {
 private:
   T *map_[kL1Size];
-  SpinMutex mtx_;
-  IndexT freelist_ = {0};
+  Mutex mtx_;
+  // The freelist is organized as a lock-free stack of batches of nodes.
+  // The stack itself uses Block::next links, while the batch within each
+  // stack node uses Block::batch links.
+  // Low 32-bits of freelist_ is the node index, top 32-bits is ABA-counter.
+  atomic_uint64_t freelist_ = {0};
   atomic_uintptr_t fillpos_ = {0};
   const char *const name_;
 
-  void Refill(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    if (freelist_ == 0) {
-      uptr fillpos = atomic_load_relaxed(&fillpos_);
-      if (fillpos == kL1Size) {
-        Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n",
-            name_, kL1Size, kL2Size);
-        Die();
-      }
-      VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
-          fillpos, kL1Size, kL2Size);
-      T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_);
-      // Reserve 0 as invalid index.
-      IndexT start = fillpos == 0 ? 1 : 0;
-      for (IndexT i = start; i < kL2Size; i++) {
-        new(batch + i) T;
-        *(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size;
-      }
-      *(IndexT*)(batch + kL2Size - 1) = 0;
-      freelist_ = fillpos * kL2Size + start;
-      map_[fillpos] = batch;
-      atomic_store_relaxed(&fillpos_, fillpos + 1);
-    }
-    for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
-      IndexT idx = freelist_;
+  struct Block {
+    IndexT next;
+    IndexT batch;
+  };
+
+  Block *MapBlock(IndexT idx) { return reinterpret_cast<Block *>(Map(idx)); }
+
+  static constexpr u64 kCounterInc = 1ull << 32;
+  static constexpr u64 kCounterMask = ~(kCounterInc - 1);
+
+  NOINLINE void Refill(Cache *c) {
+    // Pop 1 batch of nodes from the freelist.
+    IndexT idx;
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      idx = static_cast<IndexT>(cmp);
+      if (!idx)
+        return AllocSuperBlock(c);
+      Block *ptr = MapBlock(idx);
+      xchg = ptr->next | (cmp & kCounterMask);
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+    // Unpack it into c->cache.
+    while (idx) {
       c->cache[c->pos++] = idx;
-      freelist_ = *(IndexT*)Map(idx);
+      idx = MapBlock(idx)->batch;
     }
   }
 
-  void Drain(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    for (uptr i = 0; i < Cache::kSize / 2; i++) {
+  NOINLINE void Drain(Cache *c) {
+    // Build a batch of at most Cache::kSize / 2 nodes linked by Block::batch.
+    IndexT head_idx = 0;
+    for (uptr i = 0; i < Cache::kSize / 2 && c->pos; i++) {
       IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
+      Block *ptr = MapBlock(idx);
+      ptr->batch = head_idx;
+      head_idx = idx;
     }
+    // Push it onto the freelist stack.
+    Block *head = MapBlock(head_idx);
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      head->next = static_cast<IndexT>(cmp);
+      xchg = head_idx | (cmp & kCounterMask) + kCounterInc;
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
   }
+
+  NOINLINE void AllocSuperBlock(Cache *c) {
+    Lock lock(&mtx_);
+    uptr fillpos = atomic_load_relaxed(&fillpos_);
+    if (fillpos == kL1Size) {
+      Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size,
+             kL2Size);
+      Die();
+    }
+    VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
+            fillpos, kL1Size, kL2Size);
+    T *batch = (T *)MmapOrDie(kL2Size * sizeof(T), name_);
+    map_[fillpos] = batch;
+    // Reserve 0 as invalid index.
+    for (IndexT i = fillpos ? 0 : 1; i < kL2Size; i++) {
+      new (batch + i) T;
+      c->cache[c->pos++] = i + fillpos * kL2Size;
+      if (c->pos == Cache::kSize)
+        Drain(c);
+    }
+    atomic_store_relaxed(&fillpos_, fillpos + 1);
+    CHECK(c->pos);
+  }
 };
@@ -0,0 +1,22 @@
+// RUN: %clangxx_tsan %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// bench.h needs pthread barriers which are not available on OS X
+// UNSUPPORTED: darwin
+
+#include "bench.h"
+
+void thread(int tid) {
+  void **blocks = new void *[bench_mode];
+  for (int i = 0; i < bench_niter; i++) {
+    for (int j = 0; j < bench_mode; j++)
+      blocks[j] = malloc(8);
+    for (int j = 0; j < bench_mode; j++)
+      free(blocks[j]);
+  }
+  delete[] blocks;
+}
+
+void bench() { start_thread_group(bench_nthread, thread); }
+
+// CHECK: DONE