tsan: optimize DenseSlabAlloc

If lots of threads do lots of malloc/free and overflow the per-pthread
DenseSlabAlloc cache, it causes lots of contention on the shared freelist mutex:

  31.97%  race.old  race.old            [.] __sanitizer::StaticSpinMutex::LockSlow
  17.61%  race.old  race.old            [.] __tsan_read4
  10.77%  race.old  race.old            [.] __tsan::SlotLock

Optimize DenseSlabAlloc to use a lock-free stack of batches of nodes.
This way we don't take any locks in steady state at all and do only
1 push/pop per Refill/Drain.
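
For illustration, a minimal standalone sketch of the same idea (not the tsan code
itself; names such as NodePool are made up): a Treiber-style stack of node batches
addressed by 32-bit indices, with an ABA counter packed into the upper 32 bits of a
single 64-bit atomic word. The real implementation in the diff below works on
DenseSlabAlloc's index space and falls back to a mutex-protected superblock
allocation when the stack is empty.

  // Sketch only: illustrative names, std::atomic instead of sanitizer atomics.
  #include <atomic>
  #include <cstdint>
  #include <vector>

  struct Node {
    uint32_t next;   // link between stack entries (batch heads)
    uint32_t batch;  // link between nodes inside one batch
  };

  class NodePool {
    std::vector<Node> nodes_;            // index 0 reserved as "null"
    std::atomic<uint64_t> freelist_{0};  // low 32 bits: head index, high 32 bits: ABA counter

   public:
    explicit NodePool(size_t n) : nodes_(n) {}

    // Push a pre-linked batch whose head is head_idx (linked via Node::batch).
    void PushBatch(uint32_t head_idx) {
      uint64_t cmp = freelist_.load(std::memory_order_acquire);
      uint64_t xchg;
      do {
        nodes_[head_idx].next = static_cast<uint32_t>(cmp);  // old stack head
        uint64_t counter = (cmp >> 32) + 1;                   // bump ABA counter
        xchg = (counter << 32) | head_idx;
      } while (!freelist_.compare_exchange_weak(cmp, xchg, std::memory_order_acq_rel));
    }

    // Pop one whole batch; returns its head index, or 0 if the stack is empty.
    uint32_t PopBatch() {
      uint64_t cmp = freelist_.load(std::memory_order_acquire);
      uint64_t xchg;
      uint32_t idx;
      do {
        idx = static_cast<uint32_t>(cmp);
        if (idx == 0)
          return 0;  // empty: caller falls back to a locked slow path
        xchg = (cmp & 0xffffffff00000000ull) | nodes_[idx].next;
      } while (!freelist_.compare_exchange_weak(cmp, xchg, std::memory_order_acq_rel));
      return idx;
    }
  };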

Effect on the added benchmark (columns: elapsed seconds, user CPU seconds,
system CPU seconds, max RSS in KB):

$ TIME="%e %U %S %M" time ./test.old 36 5 2000000
34.51 978.22 175.67 5833592
32.53 891.73 167.03 5790036
36.17 1005.54 201.24 5802828
36.94 1004.76 226.58 5803188

$ TIME="%e %U %S %M" time ./test.new 36 5 2000000
26.44 720.99 13.45 5750704
25.92 721.98 13.58 5767764
26.33 725.15 13.41 5777936
25.93 713.49 13.41 5791796

Reviewed By: melver

Differential Revision: https://reviews.llvm.org/D130002
Author: Dmitry Vyukov
Date:   2022-07-16 11:48:18 +02:00
Parent: 71c502cbca
Commit: 6d1f86095d
2 changed files with 95 additions and 42 deletions


@@ -85,14 +85,7 @@ class DenseSlabAlloc {
   }

   void FlushCache(Cache *c) {
-    if (!c->pos)
-      return;
-    SpinMutexLock lock(&mtx_);
-    while (c->pos) {
-      IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
-    }
+    while (c->pos) Drain(c);
   }

   void InitCache(Cache *c) {
@@ -106,7 +99,7 @@ class DenseSlabAlloc {
   template <typename Func>
   void ForEach(Func func) {
-    SpinMutexLock lock(&mtx_);
+    Lock lock(&mtx_);
     uptr fillpos = atomic_load_relaxed(&fillpos_);
     for (uptr l1 = 0; l1 < fillpos; l1++) {
       for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
     }
@@ -115,48 +108,86 @@ class DenseSlabAlloc {
  private:
   T *map_[kL1Size];
-  SpinMutex mtx_;
-  IndexT freelist_ = {0};
+  Mutex mtx_;
+  // The freelist is organized as a lock-free stack of batches of nodes.
+  // The stack itself uses Block::next links, while the batch within each
+  // stack node uses Block::batch links.
+  // Low 32-bits of freelist_ is the node index, top 32-bits is ABA-counter.
+  atomic_uint64_t freelist_ = {0};
   atomic_uintptr_t fillpos_ = {0};
   const char *const name_;

-  void Refill(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    if (freelist_ == 0) {
-      uptr fillpos = atomic_load_relaxed(&fillpos_);
-      if (fillpos == kL1Size) {
-        Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n",
-            name_, kL1Size, kL2Size);
-        Die();
-      }
-      VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
-          fillpos, kL1Size, kL2Size);
-      T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_);
-      // Reserve 0 as invalid index.
-      IndexT start = fillpos == 0 ? 1 : 0;
-      for (IndexT i = start; i < kL2Size; i++) {
-        new(batch + i) T;
-        *(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size;
-      }
-      *(IndexT*)(batch + kL2Size - 1) = 0;
-      freelist_ = fillpos * kL2Size + start;
-      map_[fillpos] = batch;
-      atomic_store_relaxed(&fillpos_, fillpos + 1);
-    }
-    for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
-      IndexT idx = freelist_;
+  struct Block {
+    IndexT next;
+    IndexT batch;
+  };
+
+  Block *MapBlock(IndexT idx) { return reinterpret_cast<Block *>(Map(idx)); }
+
+  static constexpr u64 kCounterInc = 1ull << 32;
+  static constexpr u64 kCounterMask = ~(kCounterInc - 1);
+
+  NOINLINE void Refill(Cache *c) {
+    // Pop 1 batch of nodes from the freelist.
+    IndexT idx;
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      idx = static_cast<IndexT>(cmp);
+      if (!idx)
+        return AllocSuperBlock(c);
+      Block *ptr = MapBlock(idx);
+      xchg = ptr->next | (cmp & kCounterMask);
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+    // Unpack it into c->cache.
+    while (idx) {
       c->cache[c->pos++] = idx;
-      freelist_ = *(IndexT*)Map(idx);
+      idx = MapBlock(idx)->batch;
     }
   }

-  void Drain(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    for (uptr i = 0; i < Cache::kSize / 2; i++) {
+  NOINLINE void Drain(Cache *c) {
+    // Build a batch of at most Cache::kSize / 2 nodes linked by Block::batch.
+    IndexT head_idx = 0;
+    for (uptr i = 0; i < Cache::kSize / 2 && c->pos; i++) {
       IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
+      Block *ptr = MapBlock(idx);
+      ptr->batch = head_idx;
+      head_idx = idx;
     }
+    // Push it onto the freelist stack.
+    Block *head = MapBlock(head_idx);
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      head->next = static_cast<IndexT>(cmp);
+      xchg = head_idx | ((cmp & kCounterMask) + kCounterInc);
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+  }
+
+  NOINLINE void AllocSuperBlock(Cache *c) {
+    Lock lock(&mtx_);
+    uptr fillpos = atomic_load_relaxed(&fillpos_);
+    if (fillpos == kL1Size) {
+      Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size,
+             kL2Size);
+      Die();
+    }
+    VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
+            fillpos, kL1Size, kL2Size);
+    T *batch = (T *)MmapOrDie(kL2Size * sizeof(T), name_);
+    map_[fillpos] = batch;
+    // Reserve 0 as invalid index.
+    for (IndexT i = fillpos ? 0 : 1; i < kL2Size; i++) {
+      new (batch + i) T;
+      c->cache[c->pos++] = i + fillpos * kL2Size;
+      if (c->pos == Cache::kSize)
+        Drain(c);
+    }
+    atomic_store_relaxed(&fillpos_, fillpos + 1);
+    CHECK(c->pos);
   }
 };
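
For context, here is a hedged sketch of the fast path these helpers serve. Alloc/Free
are not touched by this diff, so treat the exact shape below as an assumption about the
surrounding DenseSlabAlloc code, not a quote; the point is that in steady state both
calls touch only the per-thread cache, and Refill/Drain (one lock-free pop/push each)
run only when the cache runs empty or full.

  // Assumed shape of the per-thread cache fast path (illustrative, not from this diff).
  IndexT Alloc(Cache *c) {
    if (c->pos == 0)
      Refill(c);               // pops one batch from the lock-free freelist
    return c->cache[--c->pos];
  }

  void Free(Cache *c, IndexT idx) {
    if (c->pos == Cache::kSize)
      Drain(c);                // pushes half of the cache back as one batch
    c->cache[c->pos++] = idx;
  }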


@@ -0,0 +1,22 @@
+// RUN: %clangxx_tsan %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// bench.h needs pthread barriers which are not available on OS X
+// UNSUPPORTED: darwin
+
+#include "bench.h"
+
+void thread(int tid) {
+  void **blocks = new void *[bench_mode];
+  for (int i = 0; i < bench_niter; i++) {
+    for (int j = 0; j < bench_mode; j++)
+      blocks[j] = malloc(8);
+    for (int j = 0; j < bench_mode; j++)
+      free(blocks[j]);
+  }
+  delete[] blocks;
+}
+
+void bench() { start_thread_group(bench_nthread, thread); }
+
+// CHECK: DONE