forked from OSchip/llvm-project
tsan: optimize sync clock memory consumption
This change implements 2 optimizations of sync clocks that reduce memory consumption: Use previously unused first level block space to store clock elements. Currently a clock for 100 threads consumes 3 512-byte blocks: 2 64-bit second level blocks to store clock elements +1 32-bit first level block to store indices to second level blocks Only 8 bytes of the first level block are actually used. With this change such clock consumes only 2 blocks. Share similar clocks differing only by a single clock entry for the current thread. When a thread does several release operations on fresh sync objects without intervening acquire operations in between (e.g. initialization of several fields in ctor), the resulting clocks differ only by a single entry for the current thread. This change reuses a single clock for such release operations. The current thread time (which is different for different clocks) is stored in dirty entries. We are experiencing issues with a large program that eats all 64M clock blocks (32GB of non-flushable memory) and crashes with dense allocator overflow. Max number of threads in the program is ~170 which is currently quite unfortunate (consume 4 blocks per clock). Currently it crashes after consuming 60+ GB of memory. The first optimization brings clock block consumption down to ~40M and allows the program to work. The second optimization further reduces block consumption to "modest" 16M blocks (~8GB of RAM) and reduces overall RAM consumption to ~30GB. Measurements on another real world C++ RPC benchmark show RSS reduction from 3.491G to 3.186G and a modest speedup of ~5%. Go parallel client/server HTTP benchmark: https://github.com/golang/benchmarks/blob/master/http/http.go shows RSS reduction from 320MB to 240MB and a few percent speedup. Reviewed in https://reviews.llvm.org/D35323 llvm-svn: 308018
This commit is contained in:
parent
0e03935182
commit
9f2c6207d5
|
@ -61,20 +61,13 @@
|
||||||
// an exclusive lock; ThreadClock's are private to respective threads and so
|
// an exclusive lock; ThreadClock's are private to respective threads and so
|
||||||
// do not need any protection.
|
// do not need any protection.
|
||||||
//
|
//
|
||||||
// Description of ThreadClock state:
|
|
||||||
// clk_ - fixed size vector clock.
|
|
||||||
// nclk_ - effective size of the vector clock (the rest is zeros).
|
|
||||||
// tid_ - index of the thread associated with he clock ("current thread").
|
|
||||||
// last_acquire_ - current thread time when it acquired something from
|
|
||||||
// other threads.
|
|
||||||
//
|
|
||||||
// Description of SyncClock state:
|
// Description of SyncClock state:
|
||||||
// clk_ - variable size vector clock, low kClkBits hold timestamp,
|
// clk_ - variable size vector clock, low kClkBits hold timestamp,
|
||||||
// the remaining bits hold "acquired" flag (the actual value is thread's
|
// the remaining bits hold "acquired" flag (the actual value is thread's
|
||||||
// reused counter);
|
// reused counter);
|
||||||
// if acquried == thr->reused_, then the respective thread has already
|
// if acquried == thr->reused_, then the respective thread has already
|
||||||
// acquired this clock (except possibly dirty_tids_).
|
// acquired this clock (except possibly for dirty elements).
|
||||||
// dirty_tids_ - holds up to two indeces in the vector clock that other threads
|
// dirty_ - holds up to two indeces in the vector clock that other threads
|
||||||
// need to acquire regardless of "acquired" flag value;
|
// need to acquire regardless of "acquired" flag value;
|
||||||
// release_store_tid_ - denotes that the clock state is a result of
|
// release_store_tid_ - denotes that the clock state is a result of
|
||||||
// release-store operation by the thread with release_store_tid_ index.
|
// release-store operation by the thread with release_store_tid_ index.
|
||||||
|
@ -90,21 +83,51 @@
|
||||||
|
|
||||||
namespace __tsan {
|
namespace __tsan {
|
||||||
|
|
||||||
|
static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
|
||||||
|
return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drop reference to the first level block idx.
|
||||||
|
static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
|
||||||
|
ClockBlock *cb = ctx->clock_alloc.Map(idx);
|
||||||
|
atomic_uint32_t *ref = ref_ptr(cb);
|
||||||
|
u32 v = atomic_load(ref, memory_order_acquire);
|
||||||
|
for (;;) {
|
||||||
|
CHECK_GT(v, 0);
|
||||||
|
if (v == 1)
|
||||||
|
break;
|
||||||
|
if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// First level block owns second level blocks, so them as well.
|
||||||
|
for (uptr i = 0; i < blocks; i++)
|
||||||
|
ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
|
||||||
|
ctx->clock_alloc.Free(c, idx);
|
||||||
|
}
|
||||||
|
|
||||||
ThreadClock::ThreadClock(unsigned tid, unsigned reused)
|
ThreadClock::ThreadClock(unsigned tid, unsigned reused)
|
||||||
: tid_(tid)
|
: tid_(tid)
|
||||||
, reused_(reused + 1) { // 0 has special meaning
|
, reused_(reused + 1) // 0 has special meaning
|
||||||
|
, cached_idx_()
|
||||||
|
, cached_size_()
|
||||||
|
, cached_blocks_() {
|
||||||
CHECK_LT(tid, kMaxTidInClock);
|
CHECK_LT(tid, kMaxTidInClock);
|
||||||
CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
|
CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
|
||||||
nclk_ = tid_ + 1;
|
nclk_ = tid_ + 1;
|
||||||
last_acquire_ = 0;
|
last_acquire_ = 0;
|
||||||
internal_memset(clk_, 0, sizeof(clk_));
|
internal_memset(clk_, 0, sizeof(clk_));
|
||||||
clk_[tid_].reused = reused_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ThreadClock::ResetCached(ClockCache *c) {
|
void ThreadClock::ResetCached(ClockCache *c) {
|
||||||
|
if (cached_idx_) {
|
||||||
|
UnrefClockBlock(c, cached_idx_, cached_blocks_);
|
||||||
|
cached_idx_ = 0;
|
||||||
|
cached_size_ = 0;
|
||||||
|
cached_blocks_ = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ThreadClock::acquire(ClockCache *c, const SyncClock *src) {
|
void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
|
||||||
DCHECK_LE(nclk_, kMaxTid);
|
DCHECK_LE(nclk_, kMaxTid);
|
||||||
DCHECK_LE(src->size_, kMaxTid);
|
DCHECK_LE(src->size_, kMaxTid);
|
||||||
CPP_STAT_INC(StatClockAcquire);
|
CPP_STAT_INC(StatClockAcquire);
|
||||||
|
@ -116,50 +139,46 @@ void ThreadClock::acquire(ClockCache *c, const SyncClock *src) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if we've already acquired src after the last release operation on src
|
|
||||||
bool acquired = false;
|
bool acquired = false;
|
||||||
if (nclk > tid_) {
|
|
||||||
if (src->elem(tid_).reused == reused_) {
|
|
||||||
for (unsigned i = 0; i < kDirtyTids; i++) {
|
for (unsigned i = 0; i < kDirtyTids; i++) {
|
||||||
unsigned tid = src->dirty_tids_[i];
|
SyncClock::Dirty dirty = src->dirty_[i];
|
||||||
|
unsigned tid = dirty.tid;
|
||||||
if (tid != kInvalidTid) {
|
if (tid != kInvalidTid) {
|
||||||
u64 epoch = src->elem(tid).epoch;
|
if (clk_[tid] < dirty.epoch) {
|
||||||
if (clk_[tid].epoch < epoch) {
|
clk_[tid] = dirty.epoch;
|
||||||
clk_[tid].epoch = epoch;
|
|
||||||
acquired = true;
|
acquired = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (acquired) {
|
|
||||||
CPP_STAT_INC(StatClockAcquiredSomething);
|
|
||||||
last_acquire_ = clk_[tid_].epoch;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Check if we've already acquired src after the last release operation on src
|
||||||
|
if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
|
||||||
// O(N) acquire.
|
// O(N) acquire.
|
||||||
CPP_STAT_INC(StatClockAcquireFull);
|
CPP_STAT_INC(StatClockAcquireFull);
|
||||||
nclk_ = max(nclk_, nclk);
|
nclk_ = max(nclk_, nclk);
|
||||||
for (uptr i = 0; i < nclk; i++) {
|
u64 *dst_pos = &clk_[0];
|
||||||
u64 epoch = src->elem(i).epoch;
|
for (ClockElem &src_elem : *src) {
|
||||||
if (clk_[i].epoch < epoch) {
|
u64 epoch = src_elem.epoch;
|
||||||
clk_[i].epoch = epoch;
|
if (*dst_pos < epoch) {
|
||||||
|
*dst_pos = epoch;
|
||||||
acquired = true;
|
acquired = true;
|
||||||
}
|
}
|
||||||
|
dst_pos++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remember that this thread has acquired this clock.
|
// Remember that this thread has acquired this clock.
|
||||||
if (nclk > tid_)
|
if (nclk > tid_)
|
||||||
src->elem(tid_).reused = reused_;
|
src->elem(tid_).reused = reused_;
|
||||||
|
}
|
||||||
|
|
||||||
if (acquired) {
|
if (acquired) {
|
||||||
CPP_STAT_INC(StatClockAcquiredSomething);
|
CPP_STAT_INC(StatClockAcquiredSomething);
|
||||||
last_acquire_ = clk_[tid_].epoch;
|
last_acquire_ = clk_[tid_];
|
||||||
|
ResetCached(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
|
void ThreadClock::release(ClockCache *c, SyncClock *dst) {
|
||||||
DCHECK_LE(nclk_, kMaxTid);
|
DCHECK_LE(nclk_, kMaxTid);
|
||||||
DCHECK_LE(dst->size_, kMaxTid);
|
DCHECK_LE(dst->size_, kMaxTid);
|
||||||
|
|
||||||
|
@ -179,7 +198,7 @@ void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
|
||||||
// since the last release on dst. If so, we need to update
|
// since the last release on dst. If so, we need to update
|
||||||
// only dst->elem(tid_).
|
// only dst->elem(tid_).
|
||||||
if (dst->elem(tid_).epoch > last_acquire_) {
|
if (dst->elem(tid_).epoch > last_acquire_) {
|
||||||
UpdateCurrentThread(dst);
|
UpdateCurrentThread(c, dst);
|
||||||
if (dst->release_store_tid_ != tid_ ||
|
if (dst->release_store_tid_ != tid_ ||
|
||||||
dst->release_store_reused_ != reused_)
|
dst->release_store_reused_ != reused_)
|
||||||
dst->release_store_tid_ = kInvalidTid;
|
dst->release_store_tid_ = kInvalidTid;
|
||||||
|
@ -188,23 +207,24 @@ void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
|
||||||
|
|
||||||
// O(N) release.
|
// O(N) release.
|
||||||
CPP_STAT_INC(StatClockReleaseFull);
|
CPP_STAT_INC(StatClockReleaseFull);
|
||||||
|
dst->Unshare(c);
|
||||||
// First, remember whether we've acquired dst.
|
// First, remember whether we've acquired dst.
|
||||||
bool acquired = IsAlreadyAcquired(dst);
|
bool acquired = IsAlreadyAcquired(dst);
|
||||||
if (acquired)
|
if (acquired)
|
||||||
CPP_STAT_INC(StatClockReleaseAcquired);
|
CPP_STAT_INC(StatClockReleaseAcquired);
|
||||||
// Update dst->clk_.
|
// Update dst->clk_.
|
||||||
for (uptr i = 0; i < nclk_; i++) {
|
dst->FlushDirty();
|
||||||
ClockElem &ce = dst->elem(i);
|
uptr i = 0;
|
||||||
ce.epoch = max(ce.epoch, clk_[i].epoch);
|
for (ClockElem &ce : *dst) {
|
||||||
|
ce.epoch = max(ce.epoch, clk_[i]);
|
||||||
ce.reused = 0;
|
ce.reused = 0;
|
||||||
|
i++;
|
||||||
}
|
}
|
||||||
// Clear 'acquired' flag in the remaining elements.
|
// Clear 'acquired' flag in the remaining elements.
|
||||||
if (nclk_ < dst->size_)
|
if (nclk_ < dst->size_)
|
||||||
CPP_STAT_INC(StatClockReleaseClearTail);
|
CPP_STAT_INC(StatClockReleaseClearTail);
|
||||||
for (uptr i = nclk_; i < dst->size_; i++)
|
for (uptr i = nclk_; i < dst->size_; i++)
|
||||||
dst->elem(i).reused = 0;
|
dst->elem(i).reused = 0;
|
||||||
for (unsigned i = 0; i < kDirtyTids; i++)
|
|
||||||
dst->dirty_tids_[i] = kInvalidTid;
|
|
||||||
dst->release_store_tid_ = kInvalidTid;
|
dst->release_store_tid_ = kInvalidTid;
|
||||||
dst->release_store_reused_ = 0;
|
dst->release_store_reused_ = 0;
|
||||||
// If we've acquired dst, remember this fact,
|
// If we've acquired dst, remember this fact,
|
||||||
|
@ -213,11 +233,37 @@ void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
|
||||||
dst->elem(tid_).reused = reused_;
|
dst->elem(tid_).reused = reused_;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const {
|
void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
|
||||||
DCHECK_LE(nclk_, kMaxTid);
|
DCHECK_LE(nclk_, kMaxTid);
|
||||||
DCHECK_LE(dst->size_, kMaxTid);
|
DCHECK_LE(dst->size_, kMaxTid);
|
||||||
CPP_STAT_INC(StatClockStore);
|
CPP_STAT_INC(StatClockStore);
|
||||||
|
|
||||||
|
if (dst->size_ == 0 && cached_idx_ != 0) {
|
||||||
|
// Reuse the cached clock.
|
||||||
|
// Note: we could reuse/cache the cached clock in more cases:
|
||||||
|
// we could update the existing clock and cache it, or replace it with the
|
||||||
|
// currently cached clock and release the old one. And for a shared
|
||||||
|
// existing clock, we could replace it with the currently cached;
|
||||||
|
// or unshare, update and cache. But, for simplicity, we currnetly reuse
|
||||||
|
// cached clock only when the target clock is empty.
|
||||||
|
dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
|
||||||
|
dst->tab_idx_ = cached_idx_;
|
||||||
|
dst->size_ = cached_size_;
|
||||||
|
dst->blocks_ = cached_blocks_;
|
||||||
|
CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
|
||||||
|
// The cached clock is shared (immutable),
|
||||||
|
// so this is where we store the current clock.
|
||||||
|
dst->dirty_[0].tid = tid_;
|
||||||
|
dst->dirty_[0].epoch = clk_[tid_];
|
||||||
|
dst->release_store_tid_ = tid_;
|
||||||
|
dst->release_store_reused_ = reused_;
|
||||||
|
// Rememeber that we don't need to acquire it in future.
|
||||||
|
dst->elem(tid_).reused = reused_;
|
||||||
|
// Grab a reference.
|
||||||
|
atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Check if we need to resize dst.
|
// Check if we need to resize dst.
|
||||||
if (dst->size_ < nclk_)
|
if (dst->size_ < nclk_)
|
||||||
dst->Resize(c, nclk_);
|
dst->Resize(c, nclk_);
|
||||||
|
@ -226,32 +272,41 @@ void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const {
|
||||||
dst->release_store_reused_ == reused_ &&
|
dst->release_store_reused_ == reused_ &&
|
||||||
dst->elem(tid_).epoch > last_acquire_) {
|
dst->elem(tid_).epoch > last_acquire_) {
|
||||||
CPP_STAT_INC(StatClockStoreFast);
|
CPP_STAT_INC(StatClockStoreFast);
|
||||||
UpdateCurrentThread(dst);
|
UpdateCurrentThread(c, dst);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// O(N) release-store.
|
// O(N) release-store.
|
||||||
CPP_STAT_INC(StatClockStoreFull);
|
CPP_STAT_INC(StatClockStoreFull);
|
||||||
for (uptr i = 0; i < nclk_; i++) {
|
dst->Unshare(c);
|
||||||
ClockElem &ce = dst->elem(i);
|
// Note: dst can be larger than this ThreadClock.
|
||||||
ce.epoch = clk_[i].epoch;
|
// This is fine since clk_ beyond size is all zeros.
|
||||||
|
uptr i = 0;
|
||||||
|
for (ClockElem &ce : *dst) {
|
||||||
|
ce.epoch = clk_[i];
|
||||||
ce.reused = 0;
|
ce.reused = 0;
|
||||||
|
i++;
|
||||||
}
|
}
|
||||||
// Clear the tail of dst->clk_.
|
for (uptr i = 0; i < kDirtyTids; i++)
|
||||||
if (nclk_ < dst->size_) {
|
dst->dirty_[i].tid = kInvalidTid;
|
||||||
for (uptr i = nclk_; i < dst->size_; i++) {
|
|
||||||
ClockElem &ce = dst->elem(i);
|
|
||||||
ce.epoch = 0;
|
|
||||||
ce.reused = 0;
|
|
||||||
}
|
|
||||||
CPP_STAT_INC(StatClockStoreTail);
|
|
||||||
}
|
|
||||||
for (unsigned i = 0; i < kDirtyTids; i++)
|
|
||||||
dst->dirty_tids_[i] = kInvalidTid;
|
|
||||||
dst->release_store_tid_ = tid_;
|
dst->release_store_tid_ = tid_;
|
||||||
dst->release_store_reused_ = reused_;
|
dst->release_store_reused_ = reused_;
|
||||||
// Rememeber that we don't need to acquire it in future.
|
// Rememeber that we don't need to acquire it in future.
|
||||||
dst->elem(tid_).reused = reused_;
|
dst->elem(tid_).reused = reused_;
|
||||||
|
|
||||||
|
// If the resulting clock is cachable, cache it for future release operations.
|
||||||
|
// The clock is always cachable if we released to an empty sync object.
|
||||||
|
if (cached_idx_ == 0 && dst->Cachable()) {
|
||||||
|
// Grab a reference to the ClockBlock.
|
||||||
|
atomic_uint32_t *ref = ref_ptr(dst->tab_);
|
||||||
|
if (atomic_load(ref, memory_order_acquire) == 1)
|
||||||
|
atomic_store_relaxed(ref, 2);
|
||||||
|
else
|
||||||
|
atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
|
||||||
|
cached_idx_ = dst->tab_idx_;
|
||||||
|
cached_size_ = dst->size_;
|
||||||
|
cached_blocks_ = dst->blocks_;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
|
void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
|
||||||
|
@ -261,37 +316,36 @@ void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Updates only single element related to the current thread in dst->clk_.
|
// Updates only single element related to the current thread in dst->clk_.
|
||||||
void ThreadClock::UpdateCurrentThread(SyncClock *dst) const {
|
void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
|
||||||
// Update the threads time, but preserve 'acquired' flag.
|
// Update the threads time, but preserve 'acquired' flag.
|
||||||
dst->elem(tid_).epoch = clk_[tid_].epoch;
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < kDirtyTids; i++) {
|
for (unsigned i = 0; i < kDirtyTids; i++) {
|
||||||
if (dst->dirty_tids_[i] == tid_) {
|
SyncClock::Dirty *dirty = &dst->dirty_[i];
|
||||||
|
const unsigned tid = dirty->tid;
|
||||||
|
if (tid == tid_ || tid == kInvalidTid) {
|
||||||
CPP_STAT_INC(StatClockReleaseFast);
|
CPP_STAT_INC(StatClockReleaseFast);
|
||||||
return;
|
dirty->tid = tid_;
|
||||||
}
|
dirty->epoch = clk_[tid_];
|
||||||
if (dst->dirty_tids_[i] == kInvalidTid) {
|
|
||||||
CPP_STAT_INC(StatClockReleaseFast);
|
|
||||||
dst->dirty_tids_[i] = tid_;
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Reset all 'acquired' flags, O(N).
|
// Reset all 'acquired' flags, O(N).
|
||||||
|
// We are going to touch dst elements, so we need to unshare it.
|
||||||
|
dst->Unshare(c);
|
||||||
CPP_STAT_INC(StatClockReleaseSlow);
|
CPP_STAT_INC(StatClockReleaseSlow);
|
||||||
|
dst->elem(tid_).epoch = clk_[tid_];
|
||||||
for (uptr i = 0; i < dst->size_; i++)
|
for (uptr i = 0; i < dst->size_; i++)
|
||||||
dst->elem(i).reused = 0;
|
dst->elem(i).reused = 0;
|
||||||
for (unsigned i = 0; i < kDirtyTids; i++)
|
dst->FlushDirty();
|
||||||
dst->dirty_tids_[i] = kInvalidTid;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Checks whether the current threads has already acquired src.
|
// Checks whether the current thread has already acquired src.
|
||||||
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
|
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
|
||||||
if (src->elem(tid_).reused != reused_)
|
if (src->elem(tid_).reused != reused_)
|
||||||
return false;
|
return false;
|
||||||
for (unsigned i = 0; i < kDirtyTids; i++) {
|
for (unsigned i = 0; i < kDirtyTids; i++) {
|
||||||
unsigned tid = src->dirty_tids_[i];
|
SyncClock::Dirty dirty = src->dirty_[i];
|
||||||
if (tid != kInvalidTid) {
|
if (dirty.tid != kInvalidTid) {
|
||||||
if (clk_[tid].epoch < src->elem(tid).epoch)
|
if (clk_[dirty.tid] < dirty.epoch)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -302,22 +356,19 @@ bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
|
||||||
// This function is called only from weird places like AcquireGlobal.
|
// This function is called only from weird places like AcquireGlobal.
|
||||||
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
|
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
|
||||||
DCHECK_LT(tid, kMaxTid);
|
DCHECK_LT(tid, kMaxTid);
|
||||||
DCHECK_GE(v, clk_[tid].epoch);
|
DCHECK_GE(v, clk_[tid]);
|
||||||
clk_[tid].epoch = v;
|
clk_[tid] = v;
|
||||||
if (nclk_ <= tid)
|
if (nclk_ <= tid)
|
||||||
nclk_ = tid + 1;
|
nclk_ = tid + 1;
|
||||||
last_acquire_ = clk_[tid_].epoch;
|
last_acquire_ = clk_[tid_];
|
||||||
|
ResetCached(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
|
void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
|
||||||
printf("clock=[");
|
printf("clock=[");
|
||||||
for (uptr i = 0; i < nclk_; i++)
|
for (uptr i = 0; i < nclk_; i++)
|
||||||
printf("%s%llu", i == 0 ? "" : ",", clk_[i].epoch);
|
printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
|
||||||
printf("] reused=[");
|
printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
|
||||||
for (uptr i = 0; i < nclk_; i++)
|
|
||||||
printf("%s%llu", i == 0 ? "" : ",", clk_[i].reused);
|
|
||||||
printf("] tid=%u/%u last_acq=%llu",
|
|
||||||
tid_, reused_, last_acquire_);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SyncClock::SyncClock() {
|
SyncClock::SyncClock() {
|
||||||
|
@ -327,22 +378,14 @@ SyncClock::SyncClock() {
|
||||||
SyncClock::~SyncClock() {
|
SyncClock::~SyncClock() {
|
||||||
// Reset must be called before dtor.
|
// Reset must be called before dtor.
|
||||||
CHECK_EQ(size_, 0);
|
CHECK_EQ(size_, 0);
|
||||||
|
CHECK_EQ(blocks_, 0);
|
||||||
CHECK_EQ(tab_, 0);
|
CHECK_EQ(tab_, 0);
|
||||||
CHECK_EQ(tab_idx_, 0);
|
CHECK_EQ(tab_idx_, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SyncClock::Reset(ClockCache *c) {
|
void SyncClock::Reset(ClockCache *c) {
|
||||||
if (size_ == 0) {
|
if (size_)
|
||||||
// nothing
|
UnrefClockBlock(c, tab_idx_, blocks_);
|
||||||
} else if (size_ <= ClockBlock::kClockCount) {
|
|
||||||
// One-level table.
|
|
||||||
ctx->clock_alloc.Free(c, tab_idx_);
|
|
||||||
} else {
|
|
||||||
// Two-level table.
|
|
||||||
for (uptr i = 0; i < size_; i += ClockBlock::kClockCount)
|
|
||||||
ctx->clock_alloc.Free(c, tab_->table[i / ClockBlock::kClockCount]);
|
|
||||||
ctx->clock_alloc.Free(c, tab_idx_);
|
|
||||||
}
|
|
||||||
ResetImpl();
|
ResetImpl();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -350,66 +393,171 @@ void SyncClock::ResetImpl() {
|
||||||
tab_ = 0;
|
tab_ = 0;
|
||||||
tab_idx_ = 0;
|
tab_idx_ = 0;
|
||||||
size_ = 0;
|
size_ = 0;
|
||||||
|
blocks_ = 0;
|
||||||
release_store_tid_ = kInvalidTid;
|
release_store_tid_ = kInvalidTid;
|
||||||
release_store_reused_ = 0;
|
release_store_reused_ = 0;
|
||||||
for (uptr i = 0; i < kDirtyTids; i++)
|
for (uptr i = 0; i < kDirtyTids; i++)
|
||||||
dirty_tids_[i] = kInvalidTid;
|
dirty_[i].tid = kInvalidTid;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SyncClock::Resize(ClockCache *c, uptr nclk) {
|
void SyncClock::Resize(ClockCache *c, uptr nclk) {
|
||||||
CPP_STAT_INC(StatClockReleaseResize);
|
CPP_STAT_INC(StatClockReleaseResize);
|
||||||
if (RoundUpTo(nclk, ClockBlock::kClockCount) <=
|
Unshare(c);
|
||||||
RoundUpTo(size_, ClockBlock::kClockCount)) {
|
if (nclk <= capacity()) {
|
||||||
// Growing within the same block.
|
|
||||||
// Memory is already allocated, just increase the size.
|
// Memory is already allocated, just increase the size.
|
||||||
size_ = nclk;
|
size_ = nclk;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (nclk <= ClockBlock::kClockCount) {
|
if (size_ == 0) {
|
||||||
// Grow from 0 to one-level table.
|
// Grow from 0 to one-level table.
|
||||||
CHECK_EQ(size_, 0);
|
CHECK_EQ(size_, 0);
|
||||||
|
CHECK_EQ(blocks_, 0);
|
||||||
CHECK_EQ(tab_, 0);
|
CHECK_EQ(tab_, 0);
|
||||||
CHECK_EQ(tab_idx_, 0);
|
CHECK_EQ(tab_idx_, 0);
|
||||||
size_ = nclk;
|
|
||||||
tab_idx_ = ctx->clock_alloc.Alloc(c);
|
tab_idx_ = ctx->clock_alloc.Alloc(c);
|
||||||
tab_ = ctx->clock_alloc.Map(tab_idx_);
|
tab_ = ctx->clock_alloc.Map(tab_idx_);
|
||||||
internal_memset(tab_, 0, sizeof(*tab_));
|
internal_memset(tab_, 0, sizeof(*tab_));
|
||||||
return;
|
atomic_store_relaxed(ref_ptr(tab_), 1);
|
||||||
|
size_ = 1;
|
||||||
|
} else if (size_ > blocks_ * ClockBlock::kClockCount) {
|
||||||
|
u32 idx = ctx->clock_alloc.Alloc(c);
|
||||||
|
ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
|
||||||
|
uptr top = size_ - blocks_ * ClockBlock::kClockCount;
|
||||||
|
CHECK_LT(top, ClockBlock::kClockCount);
|
||||||
|
const uptr move = top * sizeof(tab_->clock[0]);
|
||||||
|
internal_memcpy(&new_cb->clock[0], tab_->clock, move);
|
||||||
|
internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
|
||||||
|
internal_memset(tab_->clock, 0, move);
|
||||||
|
append_block(idx);
|
||||||
}
|
}
|
||||||
// Growing two-level table.
|
// At this point we have first level table allocated and all clock elements
|
||||||
if (size_ == 0) {
|
// are evacuated from it to a second level block.
|
||||||
// Allocate first level table.
|
|
||||||
tab_idx_ = ctx->clock_alloc.Alloc(c);
|
|
||||||
tab_ = ctx->clock_alloc.Map(tab_idx_);
|
|
||||||
internal_memset(tab_, 0, sizeof(*tab_));
|
|
||||||
} else if (size_ <= ClockBlock::kClockCount) {
|
|
||||||
// Transform one-level table to two-level table.
|
|
||||||
u32 old = tab_idx_;
|
|
||||||
tab_idx_ = ctx->clock_alloc.Alloc(c);
|
|
||||||
tab_ = ctx->clock_alloc.Map(tab_idx_);
|
|
||||||
internal_memset(tab_, 0, sizeof(*tab_));
|
|
||||||
tab_->table[0] = old;
|
|
||||||
}
|
|
||||||
// At this point we have first level table allocated.
|
|
||||||
// Add second level tables as necessary.
|
// Add second level tables as necessary.
|
||||||
for (uptr i = RoundUpTo(size_, ClockBlock::kClockCount);
|
while (nclk > capacity()) {
|
||||||
i < nclk; i += ClockBlock::kClockCount) {
|
|
||||||
u32 idx = ctx->clock_alloc.Alloc(c);
|
u32 idx = ctx->clock_alloc.Alloc(c);
|
||||||
ClockBlock *cb = ctx->clock_alloc.Map(idx);
|
ClockBlock *cb = ctx->clock_alloc.Map(idx);
|
||||||
internal_memset(cb, 0, sizeof(*cb));
|
internal_memset(cb, 0, sizeof(*cb));
|
||||||
CHECK_EQ(tab_->table[i/ClockBlock::kClockCount], 0);
|
append_block(idx);
|
||||||
tab_->table[i/ClockBlock::kClockCount] = idx;
|
|
||||||
}
|
}
|
||||||
size_ = nclk;
|
size_ = nclk;
|
||||||
}
|
}
|
||||||
|
|
||||||
ClockElem &SyncClock::elem(unsigned tid) const {
|
// Flushes all dirty elements into the main clock array.
|
||||||
|
void SyncClock::FlushDirty() {
|
||||||
|
for (unsigned i = 0; i < kDirtyTids; i++) {
|
||||||
|
Dirty *dirty = &dirty_[i];
|
||||||
|
if (dirty->tid != kInvalidTid) {
|
||||||
|
CHECK_LT(dirty->tid, size_);
|
||||||
|
elem(dirty->tid).epoch = dirty->epoch;
|
||||||
|
dirty->tid = kInvalidTid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SyncClock::IsShared() const {
|
||||||
|
if (size_ == 0)
|
||||||
|
return false;
|
||||||
|
atomic_uint32_t *ref = ref_ptr(tab_);
|
||||||
|
u32 v = atomic_load(ref, memory_order_acquire);
|
||||||
|
CHECK_GT(v, 0);
|
||||||
|
return v > 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unshares the current clock if it's shared.
|
||||||
|
// Shared clocks are immutable, so they need to be unshared before any updates.
|
||||||
|
// Note: this does not apply to dirty entries as they are not shared.
|
||||||
|
void SyncClock::Unshare(ClockCache *c) {
|
||||||
|
if (!IsShared())
|
||||||
|
return;
|
||||||
|
// First, copy current state into old.
|
||||||
|
SyncClock old;
|
||||||
|
old.tab_ = tab_;
|
||||||
|
old.tab_idx_ = tab_idx_;
|
||||||
|
old.size_ = size_;
|
||||||
|
old.blocks_ = blocks_;
|
||||||
|
old.release_store_tid_ = release_store_tid_;
|
||||||
|
old.release_store_reused_ = release_store_reused_;
|
||||||
|
for (unsigned i = 0; i < kDirtyTids; i++)
|
||||||
|
old.dirty_[i] = dirty_[i];
|
||||||
|
// Then, clear current object.
|
||||||
|
ResetImpl();
|
||||||
|
// Allocate brand new clock in the current object.
|
||||||
|
Resize(c, old.size_);
|
||||||
|
// Now copy state back into this object.
|
||||||
|
Iter old_iter(&old);
|
||||||
|
for (ClockElem &ce : *this) {
|
||||||
|
ce = *old_iter;
|
||||||
|
++old_iter;
|
||||||
|
}
|
||||||
|
release_store_tid_ = old.release_store_tid_;
|
||||||
|
release_store_reused_ = old.release_store_reused_;
|
||||||
|
for (unsigned i = 0; i < kDirtyTids; i++)
|
||||||
|
dirty_[i] = old.dirty_[i];
|
||||||
|
// Drop reference to old and delete if necessary.
|
||||||
|
old.Reset(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Can we cache this clock for future release operations?
|
||||||
|
ALWAYS_INLINE bool SyncClock::Cachable() const {
|
||||||
|
if (size_ == 0)
|
||||||
|
return false;
|
||||||
|
for (unsigned i = 0; i < kDirtyTids; i++) {
|
||||||
|
if (dirty_[i].tid != kInvalidTid)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return atomic_load_relaxed(ref_ptr(tab_)) == 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// elem linearizes the two-level structure into linear array.
|
||||||
|
// Note: this is used only for one time accesses, vector operations use
|
||||||
|
// the iterator as it is much faster.
|
||||||
|
ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
|
||||||
DCHECK_LT(tid, size_);
|
DCHECK_LT(tid, size_);
|
||||||
if (size_ <= ClockBlock::kClockCount)
|
const uptr block = tid / ClockBlock::kClockCount;
|
||||||
|
DCHECK_LE(block, blocks_);
|
||||||
|
tid %= ClockBlock::kClockCount;
|
||||||
|
if (block == blocks_)
|
||||||
return tab_->clock[tid];
|
return tab_->clock[tid];
|
||||||
u32 idx = tab_->table[tid / ClockBlock::kClockCount];
|
u32 idx = get_block(block);
|
||||||
ClockBlock *cb = ctx->clock_alloc.Map(idx);
|
ClockBlock *cb = ctx->clock_alloc.Map(idx);
|
||||||
return cb->clock[tid % ClockBlock::kClockCount];
|
return cb->clock[tid];
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE uptr SyncClock::capacity() const {
|
||||||
|
if (size_ == 0)
|
||||||
|
return 0;
|
||||||
|
uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
|
||||||
|
// How many clock elements we can fit into the first level block.
|
||||||
|
// +1 for ref counter.
|
||||||
|
uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
|
||||||
|
return blocks_ * ClockBlock::kClockCount + top;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
|
||||||
|
DCHECK(size_);
|
||||||
|
DCHECK_LT(bi, blocks_);
|
||||||
|
return tab_->table[ClockBlock::kBlockIdx - bi];
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
|
||||||
|
uptr bi = blocks_++;
|
||||||
|
CHECK_EQ(get_block(bi), 0);
|
||||||
|
tab_->table[ClockBlock::kBlockIdx - bi] = idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Used only by tests.
|
||||||
|
u64 SyncClock::get(unsigned tid) const {
|
||||||
|
for (unsigned i = 0; i < kDirtyTids; i++) {
|
||||||
|
Dirty dirty = dirty_[i];
|
||||||
|
if (dirty.tid == tid)
|
||||||
|
return dirty.epoch;
|
||||||
|
}
|
||||||
|
return elem(tid).epoch;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Used only by Iter test.
|
||||||
|
u64 SyncClock::get_clean(unsigned tid) const {
|
||||||
|
return elem(tid).epoch;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
|
void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
|
||||||
|
@ -419,8 +567,32 @@ void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
|
||||||
printf("] reused=[");
|
printf("] reused=[");
|
||||||
for (uptr i = 0; i < size_; i++)
|
for (uptr i = 0; i < size_; i++)
|
||||||
printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
|
printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
|
||||||
printf("] release_store_tid=%d/%d dirty_tids=%d/%d",
|
printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
|
||||||
release_store_tid_, release_store_reused_,
|
release_store_tid_, release_store_reused_,
|
||||||
dirty_tids_[0], dirty_tids_[1]);
|
dirty_[0].tid, dirty_[0].epoch,
|
||||||
|
dirty_[1].tid, dirty_[1].epoch);
|
||||||
|
}
|
||||||
|
|
||||||
|
void SyncClock::Iter::Next() {
|
||||||
|
// Finished with the current block, move on to the next one.
|
||||||
|
block_++;
|
||||||
|
if (block_ < parent_->blocks_) {
|
||||||
|
// Iterate over the next second level block.
|
||||||
|
u32 idx = parent_->get_block(block_);
|
||||||
|
ClockBlock *cb = ctx->clock_alloc.Map(idx);
|
||||||
|
pos_ = &cb->clock[0];
|
||||||
|
end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
|
||||||
|
ClockBlock::kClockCount);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (block_ == parent_->blocks_ &&
|
||||||
|
parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
|
||||||
|
// Iterate over elements in the first level block.
|
||||||
|
pos_ = &parent_->tab_->clock[0];
|
||||||
|
end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
|
||||||
|
ClockBlock::kClockCount);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
parent_ = nullptr; // denotes end
|
||||||
}
|
}
|
||||||
} // namespace __tsan
|
} // namespace __tsan
|
||||||
|
|
|
@ -18,25 +18,6 @@
|
||||||
|
|
||||||
namespace __tsan {
|
namespace __tsan {
|
||||||
|
|
||||||
struct ClockElem {
|
|
||||||
u64 epoch : kClkBits;
|
|
||||||
u64 reused : 64 - kClkBits;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ClockBlock {
|
|
||||||
static const uptr kSize = 512;
|
|
||||||
static const uptr kTableSize = kSize / sizeof(u32);
|
|
||||||
static const uptr kClockCount = kSize / sizeof(ClockElem);
|
|
||||||
|
|
||||||
union {
|
|
||||||
u32 table[kTableSize];
|
|
||||||
ClockElem clock[kClockCount];
|
|
||||||
};
|
|
||||||
|
|
||||||
ClockBlock() {
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef DenseSlabAlloc<ClockBlock, 1<<16, 1<<10> ClockAlloc;
|
typedef DenseSlabAlloc<ClockBlock, 1<<16, 1<<10> ClockAlloc;
|
||||||
typedef DenseSlabAllocCache ClockCache;
|
typedef DenseSlabAllocCache ClockCache;
|
||||||
|
|
||||||
|
@ -46,69 +27,117 @@ class SyncClock {
|
||||||
SyncClock();
|
SyncClock();
|
||||||
~SyncClock();
|
~SyncClock();
|
||||||
|
|
||||||
uptr size() const {
|
uptr size() const;
|
||||||
return size_;
|
|
||||||
}
|
|
||||||
|
|
||||||
u64 get(unsigned tid) const {
|
// These are used only in tests.
|
||||||
return elem(tid).epoch;
|
u64 get(unsigned tid) const;
|
||||||
}
|
u64 get_clean(unsigned tid) const;
|
||||||
|
|
||||||
void Resize(ClockCache *c, uptr nclk);
|
void Resize(ClockCache *c, uptr nclk);
|
||||||
void Reset(ClockCache *c);
|
void Reset(ClockCache *c);
|
||||||
|
|
||||||
void DebugDump(int(*printf)(const char *s, ...));
|
void DebugDump(int(*printf)(const char *s, ...));
|
||||||
|
|
||||||
|
// Clock element iterator.
|
||||||
|
// Note: it iterates only over the table without regard to dirty entries.
|
||||||
|
class Iter {
|
||||||
|
public:
|
||||||
|
explicit Iter(SyncClock* parent);
|
||||||
|
Iter& operator++();
|
||||||
|
bool operator!=(const Iter& other);
|
||||||
|
ClockElem &operator*();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend struct ThreadClock;
|
SyncClock *parent_;
|
||||||
|
// [pos_, end_) is the current continuous range of clock elements.
|
||||||
|
ClockElem *pos_;
|
||||||
|
ClockElem *end_;
|
||||||
|
int block_; // Current number of second level block.
|
||||||
|
|
||||||
|
NOINLINE void Next();
|
||||||
|
};
|
||||||
|
|
||||||
|
Iter begin();
|
||||||
|
Iter end();
|
||||||
|
|
||||||
|
private:
|
||||||
|
friend class ThreadClock;
|
||||||
|
friend class Iter;
|
||||||
static const uptr kDirtyTids = 2;
|
static const uptr kDirtyTids = 2;
|
||||||
|
|
||||||
|
struct Dirty {
|
||||||
|
u64 epoch : kClkBits;
|
||||||
|
u64 tid : 64 - kClkBits; // kInvalidId if not active
|
||||||
|
};
|
||||||
|
|
||||||
unsigned release_store_tid_;
|
unsigned release_store_tid_;
|
||||||
unsigned release_store_reused_;
|
unsigned release_store_reused_;
|
||||||
unsigned dirty_tids_[kDirtyTids];
|
Dirty dirty_[kDirtyTids];
|
||||||
// tab_ contains indirect pointer to a 512b block using DenseSlabAlloc.
|
// If size_ is 0, tab_ is nullptr.
|
||||||
// If size_ <= 64, then tab_ points to an array with 64 ClockElem's.
|
// If size <= 64 (kClockCount), tab_ contains pointer to an array with
|
||||||
// Otherwise, tab_ points to an array with 128 u32 elements,
|
// 64 ClockElem's (ClockBlock::clock).
|
||||||
|
// Otherwise, tab_ points to an array with up to 127 u32 elements,
|
||||||
// each pointing to the second-level 512b block with 64 ClockElem's.
|
// each pointing to the second-level 512b block with 64 ClockElem's.
|
||||||
|
// Unused space in the first level ClockBlock is used to store additional
|
||||||
|
// clock elements.
|
||||||
|
// The last u32 element in the first level ClockBlock is always used as
|
||||||
|
// reference counter.
|
||||||
|
//
|
||||||
|
// See the following scheme for details.
|
||||||
|
// All memory blocks are 512 bytes (allocated from ClockAlloc).
|
||||||
|
// Clock (clk) elements are 64 bits.
|
||||||
|
// Idx and ref are 32 bits.
|
||||||
|
//
|
||||||
|
// tab_
|
||||||
|
// |
|
||||||
|
// \/
|
||||||
|
// +----------------------------------------------------+
|
||||||
|
// | clk128 | clk129 | ...unused... | idx1 | idx0 | ref |
|
||||||
|
// +----------------------------------------------------+
|
||||||
|
// | |
|
||||||
|
// | \/
|
||||||
|
// | +----------------+
|
||||||
|
// | | clk0 ... clk63 |
|
||||||
|
// | +----------------+
|
||||||
|
// \/
|
||||||
|
// +------------------+
|
||||||
|
// | clk64 ... clk127 |
|
||||||
|
// +------------------+
|
||||||
|
//
|
||||||
|
// Note: dirty entries, if active, always override what's stored in the clock.
|
||||||
ClockBlock *tab_;
|
ClockBlock *tab_;
|
||||||
u32 tab_idx_;
|
u32 tab_idx_;
|
||||||
u32 size_;
|
u16 size_;
|
||||||
|
u16 blocks_; // Number of second level blocks.
|
||||||
|
|
||||||
|
void Unshare(ClockCache *c);
|
||||||
|
bool IsShared() const;
|
||||||
|
bool Cachable() const;
|
||||||
void ResetImpl();
|
void ResetImpl();
|
||||||
|
void FlushDirty();
|
||||||
|
uptr capacity() const;
|
||||||
|
u32 get_block(uptr bi) const;
|
||||||
|
void append_block(u32 idx);
|
||||||
ClockElem &elem(unsigned tid) const;
|
ClockElem &elem(unsigned tid) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
// The clock that lives in threads.
|
// The clock that lives in threads.
|
||||||
struct ThreadClock {
|
class ThreadClock {
|
||||||
public:
|
public:
|
||||||
typedef DenseSlabAllocCache Cache;
|
typedef DenseSlabAllocCache Cache;
|
||||||
|
|
||||||
explicit ThreadClock(unsigned tid, unsigned reused = 0);
|
explicit ThreadClock(unsigned tid, unsigned reused = 0);
|
||||||
|
|
||||||
u64 get(unsigned tid) const {
|
u64 get(unsigned tid) const;
|
||||||
DCHECK_LT(tid, kMaxTidInClock);
|
|
||||||
return clk_[tid].epoch;
|
|
||||||
}
|
|
||||||
|
|
||||||
void set(ClockCache *c, unsigned tid, u64 v);
|
void set(ClockCache *c, unsigned tid, u64 v);
|
||||||
|
void set(u64 v);
|
||||||
|
void tick();
|
||||||
|
uptr size() const;
|
||||||
|
|
||||||
void set(u64 v) {
|
void acquire(ClockCache *c, SyncClock *src);
|
||||||
DCHECK_GE(v, clk_[tid_].epoch);
|
void release(ClockCache *c, SyncClock *dst);
|
||||||
clk_[tid_].epoch = v;
|
|
||||||
}
|
|
||||||
|
|
||||||
void tick() {
|
|
||||||
clk_[tid_].epoch++;
|
|
||||||
}
|
|
||||||
|
|
||||||
uptr size() const {
|
|
||||||
return nclk_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void acquire(ClockCache *c, const SyncClock *src);
|
|
||||||
void release(ClockCache *c, SyncClock *dst) const;
|
|
||||||
void acq_rel(ClockCache *c, SyncClock *dst);
|
void acq_rel(ClockCache *c, SyncClock *dst);
|
||||||
void ReleaseStore(ClockCache *c, SyncClock *dst) const;
|
void ReleaseStore(ClockCache *c, SyncClock *dst);
|
||||||
void ResetCached(ClockCache *c);
|
void ResetCached(ClockCache *c);
|
||||||
|
|
||||||
void DebugReset();
|
void DebugReset();
|
||||||
|
@ -116,16 +145,82 @@ struct ThreadClock {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static const uptr kDirtyTids = SyncClock::kDirtyTids;
|
static const uptr kDirtyTids = SyncClock::kDirtyTids;
|
||||||
|
// Index of the thread associated with he clock ("current thread").
|
||||||
const unsigned tid_;
|
const unsigned tid_;
|
||||||
const unsigned reused_;
|
const unsigned reused_; // tid_ reuse count.
|
||||||
|
// Current thread time when it acquired something from other threads.
|
||||||
u64 last_acquire_;
|
u64 last_acquire_;
|
||||||
|
|
||||||
|
// Cached SyncClock (without dirty entries and release_store_tid_).
|
||||||
|
// We reuse it for subsequent store-release operations without intervening
|
||||||
|
// acquire operations. Since it is shared (and thus constant), clock value
|
||||||
|
// for the current thread is then stored in dirty entries in the SyncClock.
|
||||||
|
// We host a refernece to the table while it is cached here.
|
||||||
|
u32 cached_idx_;
|
||||||
|
u16 cached_size_;
|
||||||
|
u16 cached_blocks_;
|
||||||
|
|
||||||
|
// Number of active elements in the clk_ table (the rest is zeros).
|
||||||
uptr nclk_;
|
uptr nclk_;
|
||||||
ClockElem clk_[kMaxTidInClock];
|
u64 clk_[kMaxTidInClock]; // Fixed size vector clock.
|
||||||
|
|
||||||
bool IsAlreadyAcquired(const SyncClock *src) const;
|
bool IsAlreadyAcquired(const SyncClock *src) const;
|
||||||
void UpdateCurrentThread(SyncClock *dst) const;
|
void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ALWAYS_INLINE u64 ThreadClock::get(unsigned tid) const {
|
||||||
|
DCHECK_LT(tid, kMaxTidInClock);
|
||||||
|
return clk_[tid];
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE void ThreadClock::set(u64 v) {
|
||||||
|
DCHECK_GE(v, clk_[tid_]);
|
||||||
|
clk_[tid_] = v;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE void ThreadClock::tick() {
|
||||||
|
clk_[tid_]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE uptr ThreadClock::size() const {
|
||||||
|
return nclk_;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE SyncClock::Iter SyncClock::begin() {
|
||||||
|
return Iter(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE SyncClock::Iter SyncClock::end() {
|
||||||
|
return Iter(nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE uptr SyncClock::size() const {
|
||||||
|
return size_;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE SyncClock::Iter::Iter(SyncClock* parent)
|
||||||
|
: parent_(parent)
|
||||||
|
, pos_(nullptr)
|
||||||
|
, end_(nullptr)
|
||||||
|
, block_(-1) {
|
||||||
|
if (parent)
|
||||||
|
Next();
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE SyncClock::Iter& SyncClock::Iter::operator++() {
|
||||||
|
pos_++;
|
||||||
|
if (UNLIKELY(pos_ >= end_))
|
||||||
|
Next();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE bool SyncClock::Iter::operator!=(const SyncClock::Iter& other) {
|
||||||
|
return parent_ != other.parent_;
|
||||||
|
}
|
||||||
|
|
||||||
|
ALWAYS_INLINE ClockElem &SyncClock::Iter::operator*() {
|
||||||
|
return *pos_;
|
||||||
|
}
|
||||||
} // namespace __tsan
|
} // namespace __tsan
|
||||||
|
|
||||||
#endif // TSAN_CLOCK_H
|
#endif // TSAN_CLOCK_H
|
||||||
|
|
|
@ -38,15 +38,40 @@
|
||||||
|
|
||||||
namespace __tsan {
|
namespace __tsan {
|
||||||
|
|
||||||
|
const int kClkBits = 42;
|
||||||
|
const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1;
|
||||||
|
|
||||||
|
struct ClockElem {
|
||||||
|
u64 epoch : kClkBits;
|
||||||
|
u64 reused : 64 - kClkBits; // tid reuse count
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ClockBlock {
|
||||||
|
static const uptr kSize = 512;
|
||||||
|
static const uptr kTableSize = kSize / sizeof(u32);
|
||||||
|
static const uptr kClockCount = kSize / sizeof(ClockElem);
|
||||||
|
static const uptr kRefIdx = kTableSize - 1;
|
||||||
|
static const uptr kBlockIdx = kTableSize - 2;
|
||||||
|
|
||||||
|
union {
|
||||||
|
u32 table[kTableSize];
|
||||||
|
ClockElem clock[kClockCount];
|
||||||
|
};
|
||||||
|
|
||||||
|
ClockBlock() {
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
const int kTidBits = 13;
|
const int kTidBits = 13;
|
||||||
const unsigned kMaxTid = 1 << kTidBits;
|
// Reduce kMaxTid by kClockCount because one slot in ClockBlock table is
|
||||||
|
// occupied by reference counter, so total number of elements we can store
|
||||||
|
// in SyncClock is kClockCount * (kTableSize - 1).
|
||||||
|
const unsigned kMaxTid = (1 << kTidBits) - ClockBlock::kClockCount;
|
||||||
#if !SANITIZER_GO
|
#if !SANITIZER_GO
|
||||||
const unsigned kMaxTidInClock = kMaxTid * 2; // This includes msb 'freed' bit.
|
const unsigned kMaxTidInClock = kMaxTid * 2; // This includes msb 'freed' bit.
|
||||||
#else
|
#else
|
||||||
const unsigned kMaxTidInClock = kMaxTid; // Go does not track freed memory.
|
const unsigned kMaxTidInClock = kMaxTid; // Go does not track freed memory.
|
||||||
#endif
|
#endif
|
||||||
const int kClkBits = 42;
|
|
||||||
const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1;
|
|
||||||
const uptr kShadowStackSize = 64 * 1024;
|
const uptr kShadowStackSize = 64 * 1024;
|
||||||
|
|
||||||
// Count of shadow values in a shadow cell.
|
// Count of shadow values in a shadow cell.
|
||||||
|
@ -74,7 +99,7 @@ const bool kCollectHistory = false;
|
||||||
const bool kCollectHistory = true;
|
const bool kCollectHistory = true;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const unsigned kInvalidTid = (unsigned)-1;
|
const u16 kInvalidTid = kMaxTid + 1;
|
||||||
|
|
||||||
// The following "build consistency" machinery ensures that all source files
|
// The following "build consistency" machinery ensures that all source files
|
||||||
// are built in the same configuration. Inconsistent builds lead to
|
// are built in the same configuration. Inconsistent builds lead to
|
||||||
|
|
|
@ -53,6 +53,31 @@ TEST(Clock, ChunkedBasic) {
|
||||||
chunked.Reset(&cache);
|
chunked.Reset(&cache);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const uptr interesting_sizes[] = {0, 1, 2, 30, 61, 62, 63, 64, 65, 66,
|
||||||
|
100, 124, 125, 126, 127, 128, 129, 130, 188, 189, 190, 191, 192, 193, 254,
|
||||||
|
255};
|
||||||
|
|
||||||
|
TEST(Clock, Iter) {
|
||||||
|
const uptr n = ARRAY_SIZE(interesting_sizes);
|
||||||
|
for (uptr fi = 0; fi < n; fi++) {
|
||||||
|
const uptr size = interesting_sizes[fi];
|
||||||
|
SyncClock sync;
|
||||||
|
ThreadClock vector(0);
|
||||||
|
for (uptr i = 0; i < size; i++)
|
||||||
|
vector.set(&cache, i, i + 1);
|
||||||
|
if (size != 0)
|
||||||
|
vector.release(&cache, &sync);
|
||||||
|
uptr i = 0;
|
||||||
|
for (ClockElem &ce : sync) {
|
||||||
|
ASSERT_LT(i, size);
|
||||||
|
ASSERT_EQ(sync.get_clean(i), ce.epoch);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
ASSERT_EQ(i, size);
|
||||||
|
sync.Reset(&cache);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST(Clock, AcquireRelease) {
|
TEST(Clock, AcquireRelease) {
|
||||||
ThreadClock vector1(100);
|
ThreadClock vector1(100);
|
||||||
vector1.tick();
|
vector1.tick();
|
||||||
|
@ -216,13 +241,11 @@ TEST(Clock, Growth) {
|
||||||
|
|
||||||
TEST(Clock, Growth2) {
|
TEST(Clock, Growth2) {
|
||||||
// Test clock growth for every pair of sizes:
|
// Test clock growth for every pair of sizes:
|
||||||
const uptr sizes[] = {0, 1, 2, 30, 61, 62, 63, 64, 65, 66, 100, 124, 125, 126,
|
const uptr n = ARRAY_SIZE(interesting_sizes);
|
||||||
127, 128, 129, 130, 188, 189, 190, 191, 192, 193, 254, 255};
|
|
||||||
const uptr n = sizeof(sizes) / sizeof(sizes[0]);
|
|
||||||
for (uptr fi = 0; fi < n; fi++) {
|
for (uptr fi = 0; fi < n; fi++) {
|
||||||
for (uptr ti = fi + 1; ti < n; ti++) {
|
for (uptr ti = fi + 1; ti < n; ti++) {
|
||||||
const uptr from = sizes[fi];
|
const uptr from = interesting_sizes[fi];
|
||||||
const uptr to = sizes[ti];
|
const uptr to = interesting_sizes[ti];
|
||||||
SyncClock sync;
|
SyncClock sync;
|
||||||
ThreadClock vector(0);
|
ThreadClock vector(0);
|
||||||
for (uptr i = 0; i < from; i++)
|
for (uptr i = 0; i < from; i++)
|
||||||
|
|
Loading…
Reference in New Issue