llvm-project/compiler-rt/lib/memprof/memprof_allocator.cpp

//===-- memprof_allocator.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of MemProfiler, a memory profiler.
//
// Implementation of MemProf's memory allocator, which uses the allocator
// from sanitizer_common.
//
//===----------------------------------------------------------------------===//

#include "memprof_allocator.h"
#include "memprof_mapping.h"
#include "memprof_stack.h"
#include "memprof_thread.h"
#include "sanitizer_common/sanitizer_allocator_checks.h"
#include "sanitizer_common/sanitizer_allocator_interface.h"
#include "sanitizer_common/sanitizer_allocator_report.h"
#include "sanitizer_common/sanitizer_errno.h"
#include "sanitizer_common/sanitizer_file.h"
#include "sanitizer_common/sanitizer_flags.h"
#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_list.h"
#include "sanitizer_common/sanitizer_stackdepot.h"

#include <sched.h>
#include <stdlib.h>
#include <time.h>

namespace __memprof {

// Returns the id of the CPU the caller is running on (via sched_getcpu), or
// -1 while memprof initialization has not finished.
static int GetCpuId(void) {
  // _memprof_preinit runs from the preinit_array and ends up calling malloc.
  // That happens before _dl_init calls VDSO_SETUP, so the address of
  // __vdso_getcpu is still null and sched_getcpu would seg fault.
  return memprof_init_done ? sched_getcpu() : -1;
}

// Returns a millisecond timestamp: whole seconds are taken relative to
// memprof_init_timestamp_s, the sub-second part comes from the current
// CLOCK_REALTIME reading.
static int GetTimestamp(void) {
  if (memprof_timestamp_inited) {
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    const auto elapsed_s = now.tv_sec - memprof_init_timestamp_s;
    return elapsed_s * 1000 + now.tv_nsec / 1000000;
  }
  // Reading the clock this early (when called from dl_init) is reported to
  // segfault. Returning 0 means the event is effectively treated as being
  // timestamped at memprof init time (when memprof_init_timestamp_s is
  // initialized).
  return 0;
}

// Forward declaration: MemprofChunk::AllocBeg() needs access to the underlying
// allocator before the Allocator instance is defined further down this file.
static MemprofAllocator &get_allocator();

// The memory chunk allocated from the underlying allocator looks like this:
// H H U U U U U U
//   H -- ChunkHeader (32 bytes)
//   U -- user memory.

// If there is left padding before the ChunkHeader (due to use of memalign),
// we store a magic value in the first uptr word of the memory block and
// store the address of ChunkHeader in the next uptr.
// M B L L L L L L L L L  H H U U U U U U
//   |                    ^
//   ---------------------|
//   M -- magic value kAllocBegMagic
//   B -- address of ChunkHeader pointing to the first 'H'

// log2 of the largest single allocation the allocator accepts; see
// Allocator::kMaxAllowedMallocSize, which is 1 << kMaxAllowedMallocBits.
constexpr uptr kMaxAllowedMallocBits = 40;

// Per-allocation metadata stored immediately in front of the user memory.
// Should be no more than 32-bytes (a COMPILER_CHECK in this file pins the size
// to exactly 32).
struct ChunkHeader {
  // 1-st 4 bytes.
  // Id returned by StackDepotPut for the allocation call stack.
  u32 alloc_context_id;
  // 2-nd 4 bytes
  // Result of GetCpuId() at allocation time.
  u32 cpu_id;
  // 3-rd 4 bytes
  // Result of GetTimestamp() at allocation time.
  u32 timestamp_ms;
  // 4-th 4 bytes
  // Non-zero when this header is not at the very start of the block returned
  // by the underlying allocator (i.e. there is left padding before it).
  // Note only 1 bit is needed for this flag if we need space in the future for
  // more fields.
  u32 from_memalign;
  // 5-th and 6-th 4 bytes
  // Size requested by the user; stored on allocation and exchanged with 0 on
  // deallocation, so 0 identifies a chunk that is not live.
  // The max size of an allocation is 2^40 (kMaxAllowedMallocSize), so this
  // could be shrunk to kMaxAllowedMallocBits if we need space in the future for
  // more fields.
  atomic_uint64_t user_requested_size;
  // 23 bits available
  // NOTE(review): with kMaxAllowedMallocBits == 40 a 64-bit size field would
  // leave 24 spare bits; confirm which figure is intended.
  // 7-th and 8-th 4 bytes
  u64 data_type_id; // TODO: hash of type name
};

// Size of the header placed in front of every user allocation. The chunk
// address arithmetic in this file (user pointer - kChunkHeaderSize) relies on
// it, so the expected value is enforced at compile time.
static const uptr kChunkHeaderSize = sizeof(ChunkHeader);
COMPILER_CHECK(kChunkHeaderSize == 32);

// A ChunkHeader viewed in place, with helpers to locate the memory around it.
struct MemprofChunk : ChunkHeader {
  // Address of the first user byte, which directly follows the header.
  uptr Beg() { return kChunkHeaderSize + reinterpret_cast<uptr>(this); }

  // Size the user asked for (0 once the chunk has been deallocated).
  uptr UsedSize() {
    return atomic_load(&user_requested_size, memory_order_relaxed);
  }

  // Start of the block handed out by the underlying allocator. Without left
  // padding that is the header itself; otherwise the allocator is asked.
  void *AllocBeg() {
    void *self = reinterpret_cast<void *>(this);
    return from_memalign ? get_allocator().GetBlockBegin(self) : self;
  }
};

class LargeChunkHeader {
  static constexpr uptr kAllocBegMagic =
      FIRST_32_SECOND_64(0xCC6E96B9, 0xCC6E96B9CC6E96B9ULL);
  atomic_uintptr_t magic;
  MemprofChunk *chunk_header;

public:
  MemprofChunk *Get() const {
    return atomic_load(&magic, memory_order_acquire) == kAllocBegMagic
               ? chunk_header
               : nullptr;
  }

  void Set(MemprofChunk *p) {
    if (p) {
      chunk_header = p;
      atomic_store(&magic, kAllocBegMagic, memory_order_release);
      return;
    }

    uptr old = kAllocBegMagic;
    if (!atomic_compare_exchange_strong(&magic, &old, 0,
                                        memory_order_release)) {
      CHECK_EQ(old, kAllocBegMagic);
    }
  }
};

// Hands the shadow memory backing [p, p + size) back to the OS.
void FlushUnneededMemProfShadowMemory(uptr p, uptr size) {
  // Since memprof's mapping is compacting, the shadow chunk may be
  // not page-aligned, so we only flush the page-aligned portion.
  const uptr shadow_first = MemToShadow(p);
  const uptr shadow_last = MemToShadow(p + size);
  ReleaseMemoryPagesToOS(shadow_first, shadow_last);
}

// Map callback: accounts the new mapping in the current thread's statistics.
void MemprofMapUnmapCallback::OnMap(uptr p, uptr size) const {
  MemprofStats &stats = GetCurrentThreadStats();
  ++stats.mmaps;
  stats.mmaped += size;
}
// Unmap callback: releases the matching shadow and updates the statistics.
void MemprofMapUnmapCallback::OnUnmap(uptr p, uptr size) const {
  // The user memory is about to go away, so its shadow is no longer needed.
  FlushUnneededMemProfShadowMemory(p, size);

  MemprofStats &stats = GetCurrentThreadStats();
  ++stats.munmaps;
  stats.munmaped += size;
}

// Returns the allocator cache embedded in the given per-thread malloc
// storage; ms must not be null.
AllocatorCache *GetAllocatorCache(MemprofThreadLocalMallocStorage *ms) {
  CHECK(ms);
  AllocatorCache &cache = ms->allocator_cache;
  return &cache;
}

// Aggregated profile data for the allocations that share one allocation
// context (stack id). The 6-argument constructor describes a single
// allocation; Merge() folds a later block into this one.
struct MemInfoBlock {
  // Every field has a default initializer so that no constructor leaves a
  // member indeterminate: blocks are copied wholesale into and between cache
  // entries, which would otherwise read uninitialized storage.
  u32 alloc_count = 0;
  u64 total_access_count = 0, min_access_count = 0, max_access_count = 0;
  u64 total_size = 0;
  u32 min_size = 0, max_size = 0;
  // Timestamps of the most recently merged allocation.
  u32 alloc_timestamp = 0, dealloc_timestamp = 0;
  u64 total_lifetime = 0;
  u32 min_lifetime = 0, max_lifetime = 0;
  // CPU ids of the most recently merged allocation.
  u32 alloc_cpu_id = 0, dealloc_cpu_id = 0;
  u32 num_migrated_cpu = 0;

  // Only compared to prior deallocated object currently.
  u32 num_lifetime_overlaps = 0;
  u32 num_same_alloc_cpu = 0;
  u32 num_same_dealloc_cpu = 0;

  u64 data_type_id = 0; // TODO: hash of type name

  // Creates an empty block (alloc_count == 0, everything else zero).
  MemInfoBlock() {}

  // Creates a block describing exactly one allocation of `size` bytes that
  // was accessed `access_count` times. The lifetime is the difference of the
  // two timestamps; num_migrated_cpu is 1 when the allocating and
  // deallocating CPU ids differ.
  MemInfoBlock(u32 size, u64 access_count, u32 alloc_timestamp,
               u32 dealloc_timestamp, u32 alloc_cpu, u32 dealloc_cpu)
      : alloc_count(1), total_access_count(access_count),
        min_access_count(access_count), max_access_count(access_count),
        total_size(size), min_size(size), max_size(size),
        alloc_timestamp(alloc_timestamp), dealloc_timestamp(dealloc_timestamp),
        total_lifetime(dealloc_timestamp - alloc_timestamp),
        min_lifetime(total_lifetime), max_lifetime(total_lifetime),
        alloc_cpu_id(alloc_cpu), dealloc_cpu_id(dealloc_cpu),
        num_lifetime_overlaps(0), num_same_alloc_cpu(0),
        num_same_dealloc_cpu(0) {
    num_migrated_cpu = alloc_cpu_id != dealloc_cpu_id;
  }

  // Prints this block, labelled with stack id `id`, either as one terse
  // slash-separated record or as a multi-line report, depending on the
  // print_terse flag. Averages are printed with two decimals by scaling the
  // totals by 100 before the integer division. Divides by alloc_count, so the
  // block must describe at least one allocation.
  void Print(u64 id) {
    u64 p;
    if (flags()->print_terse) {
      p = total_size * 100 / alloc_count;
      Printf("MIB:%llu/%u/%llu.%02llu/%u/%u/", id, alloc_count, p / 100, p % 100,
             min_size, max_size);
      p = total_access_count * 100 / alloc_count;
      Printf("%llu.%02llu/%llu/%llu/", p / 100, p % 100, min_access_count,
             max_access_count);
      p = total_lifetime * 100 / alloc_count;
      Printf("%llu.%02llu/%u/%u/", p / 100, p % 100, min_lifetime, max_lifetime);
      Printf("%u/%u/%u/%u\n", num_migrated_cpu, num_lifetime_overlaps,
             num_same_alloc_cpu, num_same_dealloc_cpu);
    } else {
      p = total_size * 100 / alloc_count;
      Printf("Memory allocation stack id = %llu\n", id);
      Printf("\talloc_count %u, size (ave/min/max) %llu.%02llu / %u / %u\n",
             alloc_count, p / 100, p % 100, min_size, max_size);
      p = total_access_count * 100 / alloc_count;
      Printf("\taccess_count (ave/min/max): %llu.%02llu / %llu / %llu\n", p / 100,
             p % 100, min_access_count, max_access_count);
      p = total_lifetime * 100 / alloc_count;
      Printf("\tlifetime (ave/min/max): %llu.%02llu / %u / %u\n", p / 100, p % 100,
             min_lifetime, max_lifetime);
      Printf("\tnum migrated: %u, num lifetime overlaps: %u, num same alloc "
             "cpu: %u, num same dealloc_cpu: %u\n",
             num_migrated_cpu, num_lifetime_overlaps, num_same_alloc_cpu,
             num_same_dealloc_cpu);
    }
  }

  // Prints the column legend for the terse record format. Only valid when the
  // print_terse flag is set.
  static void printHeader() {
    CHECK(flags()->print_terse);
    Printf("MIB:StackID/AllocCount/AveSize/MinSize/MaxSize/AveAccessCount/"
           "MinAccessCount/MaxAccessCount/AveLifetime/MinLifetime/MaxLifetime/"
           "NumMigratedCpu/NumLifetimeOverlaps/NumSameAllocCpu/"
           "NumSameDeallocCpu\n");
  }

  // Folds newMIB into this block: totals are summed, minima/maxima widened,
  // and the "last seen" timestamps and CPU ids are replaced by newMIB's after
  // they have been compared against it.
  void Merge(MemInfoBlock &newMIB) {
    alloc_count += newMIB.alloc_count;

    total_access_count += newMIB.total_access_count;
    min_access_count = Min(min_access_count, newMIB.min_access_count);
    max_access_count = Max(max_access_count, newMIB.max_access_count);

    total_size += newMIB.total_size;
    min_size = Min(min_size, newMIB.min_size);
    max_size = Max(max_size, newMIB.max_size);

    total_lifetime += newMIB.total_lifetime;
    min_lifetime = Min(min_lifetime, newMIB.min_lifetime);
    max_lifetime = Max(max_lifetime, newMIB.max_lifetime);

    // We know newMIB was deallocated later, so just need to check if it was
    // allocated before last one deallocated.
    num_lifetime_overlaps += newMIB.alloc_timestamp < dealloc_timestamp;
    alloc_timestamp = newMIB.alloc_timestamp;
    dealloc_timestamp = newMIB.dealloc_timestamp;

    num_same_alloc_cpu += alloc_cpu_id == newMIB.alloc_cpu_id;
    num_same_dealloc_cpu += dealloc_cpu_id == newMIB.dealloc_cpu_id;
    alloc_cpu_id = newMIB.alloc_cpu_id;
    dealloc_cpu_id = newMIB.dealloc_cpu_id;
  }
};

// One cache slot: a stack id paired with its profile data. An id of 0 marks
// the slot as unused.
struct SetEntry {
  // The stack id
  u64 id;
  MemInfoBlock MIB;

  SetEntry() : id(0), MIB() {}

  bool Empty() { return !id; }

  // Prints the stored block; must not be called on an unused slot.
  void Print() {
    CHECK(!Empty());
    MIB.Print(id);
  }
};

// One set of a small set-associative cache of MemInfoBlocks keyed by stack
// id. Occupied entries are kept at the front of the array, and the entry
// touched last always ends up in slot 0.
struct CacheSet {
  enum { kSetSize = 4 };

  // Prints every occupied entry of this set.
  void PrintAll() {
    for (int i = 0; i < kSetSize; i++) {
      if (Entries[i].Empty())
        continue;
      Entries[i].Print();
    }
  }

  // Records newMIB under new_id while holding SetMutex. A matching entry is
  // merged, an empty slot is filled, and otherwise the entry with the lowest
  // alloc_count is printed and evicted to make room.
  void insertOrMerge(u64 new_id, MemInfoBlock &newMIB) {
    SpinMutexLock l(&SetMutex);
    AccessCount++;

    for (int i = 0; i < kSetSize; i++) {
      auto id = Entries[i].id;
      // Check if this is a hit or an empty entry. Since we always move any
      // filled locations to the front of the array (see below), we don't need
      // to look after finding the first empty entry.
      if (id == new_id || !id) {
        if (id == 0) {
          Entries[i].id = new_id;
          Entries[i].MIB = newMIB;
        } else {
          Entries[i].MIB.Merge(newMIB);
        }
        // Assuming some id locality, we try to swap the matching entry
        // into the first set position.
        if (i != 0) {
          auto tmp = Entries[0];
          Entries[0] = Entries[i];
          Entries[i] = tmp;
        }
        return;
      }
    }

    // Miss
    MissCount++;

    // We try to find the entries with the lowest alloc count to be evicted:
    int min_idx = 0;
    u64 min_count = Entries[0].MIB.alloc_count;
    for (int i = 1; i < kSetSize; i++) {
      CHECK(!Entries[i].Empty());
      if (Entries[i].MIB.alloc_count < min_count) {
        min_idx = i;
        min_count = Entries[i].MIB.alloc_count;
      }
    }

    // Print the evicted entry profile information
    if (!flags()->print_terse)
      Printf("Evicted:\n");
    Entries[min_idx].Print();

    // Similar to the hit case, put new MIB in first set position.
    if (min_idx != 0)
      Entries[min_idx] = Entries[0];
    Entries[0].id = new_id;
    Entries[0].MIB = newMIB;
  }

  // Prints this set's miss rate as a percentage with two decimals. `i` is the
  // set's index and is used only as a label.
  void PrintMissRate(int i) {
    u64 p = AccessCount ? MissCount * 10000ULL / AccessCount : 0;
    // MissCount and AccessCount are unsigned 32-bit counters, hence %u.
    Printf("Set %d miss rate: %u / %u = %5llu.%02llu%%\n", i, MissCount,
           AccessCount, p / 100, p % 100);
  }

  SetEntry Entries[kSetSize];
  // Number of insertOrMerge calls, and how many of them required an eviction.
  u32 AccessCount = 0;
  u32 MissCount = 0;
  // Serializes insertOrMerge.
  SpinMutex SetMutex;
};

struct MemInfoBlockCache {
  MemInfoBlockCache() {
    if (common_flags()->print_module_map)
      DumpProcessMap();
    if (flags()->print_terse)
      MemInfoBlock::printHeader();
    Sets =
        (CacheSet *)malloc(sizeof(CacheSet) * flags()->mem_info_cache_entries);
    Constructed = true;
  }

  ~MemInfoBlockCache() { free(Sets); }

  void insertOrMerge(u64 new_id, MemInfoBlock &newMIB) {
    u64 hv = new_id;

    // Use mod method where number of entries should be a prime close to power
    // of 2.
    hv %= flags()->mem_info_cache_entries;

    return Sets[hv].insertOrMerge(new_id, newMIB);
  }

  void PrintAll() {
    for (int i = 0; i < flags()->mem_info_cache_entries; i++) {
      Sets[i].PrintAll();
    }
  }

  void PrintMissRate() {
    if (!flags()->print_mem_info_cache_miss_rate)
      return;
    u64 MissCountSum = 0;
    u64 AccessCountSum = 0;
    for (int i = 0; i < flags()->mem_info_cache_entries; i++) {
      MissCountSum += Sets[i].MissCount;
      AccessCountSum += Sets[i].AccessCount;
    }
    u64 p = AccessCountSum ? MissCountSum * 10000ULL / AccessCountSum : 0;
    Printf("Overall miss rate: %llu / %llu = %5llu.%02llu%%\n", MissCountSum,
           AccessCountSum, p / 100, p % 100);
    if (flags()->print_mem_info_cache_miss_rate_details)
      for (int i = 0; i < flags()->mem_info_cache_entries; i++)
        Sets[i].PrintMissRate(i);
  }

  CacheSet *Sets;
  // Flag when the Sets have been allocated, in case a deallocation is called
  // very early before the static init of the Allocator and therefore this table
  // have completed.
  bool Constructed = false;
};

// Sums the shadow access counters for the memory starting at p and spanning
// size bytes. The counter that p + size itself maps to is included.
// NOTE(review): size is u32 while callers pass a 64-bit requested size, and
// the end bound is inclusive — confirm both are intended.
u64 GetShadowCount(uptr p, u32 size) {
  u64 *cell = (u64 *)MEM_TO_SHADOW(p);
  u64 *const last_cell = (u64 *)MEM_TO_SHADOW(p + size);
  u64 total = 0;
  while (cell <= last_cell) {
    total += *cell;
    ++cell;
  }
  return total;
}

// Zeroes the shadow counters for [addr, addr + size) (done when memory is
// allocated). Both ends must be granularity-aligned and lie in app memory.
void ClearShadow(uptr addr, uptr size) {
  CHECK(AddrIsAlignedByGranularity(addr));
  CHECK(AddrIsInMem(addr));
  CHECK(AddrIsAlignedByGranularity(addr + size));
  CHECK(AddrIsInMem(addr + size - SHADOW_GRANULARITY));
  CHECK(REAL(memset));

  const uptr shadow_beg = MEM_TO_SHADOW(addr);
  const uptr shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
  const uptr shadow_len = shadow_end - shadow_beg;

  // Small ranges are simply overwritten with zeroes.
  if (shadow_len < common_flags()->clear_shadow_mmap_threshold) {
    REAL(memset)((void *)shadow_beg, 0, shadow_len);
    return;
  }

  // Large ranges: find the whole pages contained in the shadow range.
  const uptr page_size = GetPageSizeCached();
  const uptr first_page = RoundUpTo(shadow_beg, page_size);
  const uptr pages_end = RoundDownTo(shadow_end, page_size);

  // No whole page inside the range, so fall back to a plain memset.
  if (first_page >= pages_end) {
    REAL(memset)((void *)shadow_beg, 0, shadow_len);
    return;
  }

  // Clear the partial pages at either end by hand ...
  if (first_page != shadow_beg)
    REAL(memset)((void *)shadow_beg, 0, first_page - shadow_beg);
  if (pages_end != shadow_end)
    REAL(memset)((void *)pages_end, 0, shadow_end - pages_end);
  // ... and re-reserve the whole pages in between.
  ReserveShadowMemoryRange(first_page, pages_end - 1, nullptr);
}

// MemProf's allocator front end. It wraps the underlying MemprofAllocator,
// places a ChunkHeader in front of every user allocation, and records a
// MemInfoBlock per allocation in MemInfoBlockTable when memory is freed (or
// is still live when the profile is printed).
struct Allocator {
  // Hard upper bound on the size of a single allocation.
  static const uptr kMaxAllowedMallocSize = 1ULL << kMaxAllowedMallocBits;

  MemprofAllocator allocator;
  // Cache used when GetCurrentThread() returns null; every use is guarded by
  // fallback_mutex.
  StaticSpinMutex fallback_mutex;
  AllocatorCache fallback_allocator_cache;

  // Limit derived from the max_allocation_size_mb flag (see
  // InitLinkerInitialized); kMaxAllowedMallocSize when the flag is 0.
  uptr max_user_defined_malloc_size;
  // Set through SetRssLimitExceeded(); makes Allocate() fail while non-zero.
  atomic_uint8_t rss_limit_exceeded;

  MemInfoBlockCache MemInfoBlockTable;
  // Set by FinishAndPrint(); once true, Deallocate() stops recording blocks.
  bool destructing;

  // ------------------- Initialization ------------------------
  explicit Allocator(LinkerInitialized) : destructing(false) {}

  ~Allocator() { FinishAndPrint(); }

  // Inserts a MemInfoBlock for every chunk that is still live, then prints
  // the cache miss rate, all cached blocks and the stack depot.
  void FinishAndPrint() {
    if (!flags()->print_terse)
      Printf("Live on exit:\n");
    allocator.ForceLock();
    allocator.ForEachChunk(
        [](uptr chunk, void *alloc) {
          u64 user_requested_size;
          MemprofChunk *m =
              ((Allocator *)alloc)
                  ->GetMemprofChunk((void *)chunk, user_requested_size);
          // Null means the chunk is not a live memprof allocation.
          if (!m)
            return;
          uptr user_beg = ((uptr)m) + kChunkHeaderSize;
          u64 c = GetShadowCount(user_beg, user_requested_size);
          // The current time and CPU stand in for the deallocation values.
          long curtime = GetTimestamp();
          MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime,
                              m->cpu_id, GetCpuId());
          ((Allocator *)alloc)
              ->MemInfoBlockTable.insertOrMerge(m->alloc_context_id, newMIB);
        },
        this);
    allocator.ForceUnlock();

    destructing = true;
    MemInfoBlockTable.PrintMissRate();
    MemInfoBlockTable.PrintAll();
    StackDepotPrintAll();
  }

  // Applies the common flags to the allocator: may-return-null policy,
  // release-to-OS interval and the user-defined maximum allocation size.
  void InitLinkerInitialized() {
    SetAllocatorMayReturnNull(common_flags()->allocator_may_return_null);
    allocator.InitLinkerInitialized(
        common_flags()->allocator_release_to_os_interval_ms);
    // The flag is expressed in megabytes, hence the shift by 20.
    max_user_defined_malloc_size = common_flags()->max_allocation_size_mb
                                       ? common_flags()->max_allocation_size_mb
                                             << 20
                                       : kMaxAllowedMallocSize;
  }

  bool RssLimitExceeded() {
    return atomic_load(&rss_limit_exceeded, memory_order_relaxed);
  }

  void SetRssLimitExceeded(bool limit_exceeded) {
    atomic_store(&rss_limit_exceeded, limit_exceeded, memory_order_relaxed);
  }

  // -------------------- Allocation/Deallocation routines ---------------
  // Allocates `size` user bytes aligned to `alignment` (raised to
  // MEMPROF_ALIGNMENT if smaller; must be a power of two), preceded by a
  // ChunkHeader filled with the CPU id, timestamp and stack id of this call.
  // Returns nullptr on failure when AllocatorMayReturnNull(); otherwise the
  // failure is reported through the corresponding Report* function.
  // alloc_type is not used by this body.
  void *Allocate(uptr size, uptr alignment, BufferedStackTrace *stack,
                 AllocType alloc_type) {
    if (UNLIKELY(!memprof_inited))
      MemprofInitFromRtl();
    if (RssLimitExceeded()) {
      if (AllocatorMayReturnNull())
        return nullptr;
      ReportRssLimitExceeded(stack);
    }
    CHECK(stack);
    const uptr min_alignment = MEMPROF_ALIGNMENT;
    if (alignment < min_alignment)
      alignment = min_alignment;
    if (size == 0) {
      // We'd be happy to avoid allocating memory for zero-size requests, but
      // some programs/tests depend on this behavior and assume that malloc
      // would not return NULL even for zero-size allocations. Moreover, it
      // looks like operator new should never return NULL, and results of
      // consecutive "new" calls must be different even if the allocated size
      // is zero.
      size = 1;
    }
    CHECK(IsPowerOfTwo(alignment));
    uptr rounded_size = RoundUpTo(size, alignment);
    uptr needed_size = rounded_size + kChunkHeaderSize;
    // Over-aligned requests get extra room so the user pointer can be moved
    // up to an aligned address inside the block.
    if (alignment > min_alignment)
      needed_size += alignment;
    CHECK(IsAligned(needed_size, min_alignment));
    if (size > kMaxAllowedMallocSize || needed_size > kMaxAllowedMallocSize ||
        size > max_user_defined_malloc_size) {
      if (AllocatorMayReturnNull()) {
        Report("WARNING: MemProfiler failed to allocate 0x%zx bytes\n", size);
        return nullptr;
      }
      uptr malloc_limit =
          Min(kMaxAllowedMallocSize, max_user_defined_malloc_size);
      ReportAllocationSizeTooBig(size, malloc_limit, stack);
    }

    // Use the calling thread's cache when there is a current thread, and the
    // mutex-protected fallback cache otherwise.
    MemprofThread *t = GetCurrentThread();
    void *allocated;
    if (t) {
      AllocatorCache *cache = GetAllocatorCache(&t->malloc_storage());
      allocated = allocator.Allocate(cache, needed_size, 8);
    } else {
      SpinMutexLock l(&fallback_mutex);
      AllocatorCache *cache = &fallback_allocator_cache;
      allocated = allocator.Allocate(cache, needed_size, 8);
    }
    if (UNLIKELY(!allocated)) {
      SetAllocatorOutOfMemory();
      if (AllocatorMayReturnNull())
        return nullptr;
      ReportOutOfMemory(size, stack);
    }

    // Lay out the block: the user memory starts at the first suitably aligned
    // address at or after alloc_beg + header, and the header sits directly in
    // front of it.
    uptr alloc_beg = reinterpret_cast<uptr>(allocated);
    uptr alloc_end = alloc_beg + needed_size;
    uptr beg_plus_header = alloc_beg + kChunkHeaderSize;
    uptr user_beg = beg_plus_header;
    if (!IsAligned(user_beg, alignment))
      user_beg = RoundUpTo(user_beg, alignment);
    uptr user_end = user_beg + size;
    CHECK_LE(user_end, alloc_end);
    uptr chunk_beg = user_beg - kChunkHeaderSize;
    MemprofChunk *m = reinterpret_cast<MemprofChunk *>(chunk_beg);
    m->from_memalign = alloc_beg != chunk_beg;
    CHECK(size);

    m->cpu_id = GetCpuId();
    m->timestamp_ms = GetTimestamp();
    m->alloc_context_id = StackDepotPut(*stack);

    // Reset the access counters for the granularity-aligned part of the user
    // memory.
    uptr size_rounded_down_to_granularity =
        RoundDownTo(size, SHADOW_GRANULARITY);
    if (size_rounded_down_to_granularity)
      ClearShadow(user_beg, size_rounded_down_to_granularity);

    MemprofStats &thread_stats = GetCurrentThreadStats();
    thread_stats.mallocs++;
    thread_stats.malloced += size;
    thread_stats.malloced_overhead += needed_size - size;
    if (needed_size > SizeClassMap::kMaxSize)
      thread_stats.malloc_large++;
    else
      thread_stats.malloced_by_size[SizeClassMap::ClassID(needed_size)]++;

    void *res = reinterpret_cast<void *>(user_beg);
    // A non-zero size marks the chunk as live (see GetMemprofChunk).
    atomic_store(&m->user_requested_size, size, memory_order_release);
    if (alloc_beg != chunk_beg) {
      // There is left padding: record where the header is at the start of the
      // block so it can be found from the block begin.
      CHECK_LE(alloc_beg + sizeof(LargeChunkHeader), chunk_beg);
      reinterpret_cast<LargeChunkHeader *>(alloc_beg)->Set(m);
    }
    MEMPROF_MALLOC_HOOK(res, size);
    return res;
  }

  // Frees the allocation whose user memory starts at ptr (no-op for null).
  // If profiling is fully initialized and the profile has not been printed
  // yet, a MemInfoBlock for the allocation is recorded first.
  // delete_size, delete_alignment, stack and alloc_type are not used by this
  // body.
  void Deallocate(void *ptr, uptr delete_size, uptr delete_alignment,
                  BufferedStackTrace *stack, AllocType alloc_type) {
    uptr p = reinterpret_cast<uptr>(ptr);
    if (p == 0)
      return;

    MEMPROF_FREE_HOOK(ptr);

    uptr chunk_beg = p - kChunkHeaderSize;
    MemprofChunk *m = reinterpret_cast<MemprofChunk *>(chunk_beg);

    // Fetch the size and mark the chunk as no longer live in one step.
    u64 user_requested_size =
        atomic_exchange(&m->user_requested_size, 0, memory_order_acquire);
    if (memprof_inited && memprof_init_done && !destructing &&
        MemInfoBlockTable.Constructed) {
      u64 c = GetShadowCount(p, user_requested_size);
      long curtime = GetTimestamp();

      MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime,
                          m->cpu_id, GetCpuId());
        MemInfoBlockTable.insertOrMerge(m->alloc_context_id, newMIB);
    }

    MemprofStats &thread_stats = GetCurrentThreadStats();
    thread_stats.frees++;
    thread_stats.freed += user_requested_size;

    void *alloc_beg = m->AllocBeg();
    if (alloc_beg != m) {
      // Clear the magic value, as allocator internals may overwrite the
      // contents of deallocated chunk, confusing GetMemprofChunk lookup.
      reinterpret_cast<LargeChunkHeader *>(alloc_beg)->Set(nullptr);
    }

    // Return the block through the thread's cache, or the fallback cache when
    // there is no current thread.
    MemprofThread *t = GetCurrentThread();
    if (t) {
      AllocatorCache *cache = GetAllocatorCache(&t->malloc_storage());
      allocator.Deallocate(cache, alloc_beg);
    } else {
      SpinMutexLock l(&fallback_mutex);
      AllocatorCache *cache = &fallback_allocator_cache;
      allocator.Deallocate(cache, alloc_beg);
    }
  }

  // Allocates a new block of new_size bytes, copies over
  // min(new_size, old size) bytes and frees old_ptr. If the allocation fails,
  // old_ptr is left untouched and nullptr is returned. Both arguments must be
  // non-zero.
  void *Reallocate(void *old_ptr, uptr new_size, BufferedStackTrace *stack) {
    CHECK(old_ptr && new_size);
    uptr p = reinterpret_cast<uptr>(old_ptr);
    uptr chunk_beg = p - kChunkHeaderSize;
    MemprofChunk *m = reinterpret_cast<MemprofChunk *>(chunk_beg);

    MemprofStats &thread_stats = GetCurrentThreadStats();
    thread_stats.reallocs++;
    thread_stats.realloced += new_size;

    void *new_ptr = Allocate(new_size, 8, stack, FROM_MALLOC);
    if (new_ptr) {
      CHECK_NE(REAL(memcpy), nullptr);
      uptr memcpy_size = Min(new_size, m->UsedSize());
      REAL(memcpy)(new_ptr, old_ptr, memcpy_size);
      Deallocate(old_ptr, 0, 0, stack, FROM_MALLOC);
    }
    return new_ptr;
  }

  // Allocates zeroed memory for nmemb elements of `size` bytes each, after
  // checking that nmemb * size does not overflow.
  void *Calloc(uptr nmemb, uptr size, BufferedStackTrace *stack) {
    if (UNLIKELY(CheckForCallocOverflow(size, nmemb))) {
      if (AllocatorMayReturnNull())
        return nullptr;
      ReportCallocOverflow(nmemb, size, stack);
    }
    void *ptr = Allocate(nmemb * size, 8, stack, FROM_MALLOC);
    // If the memory comes from the secondary allocator no need to clear it
    // as it comes directly from mmap.
    if (ptr && allocator.FromPrimary(ptr))
      REAL(memset)(ptr, 0, nmemb * size);
    return ptr;
  }

  // Hands the contents of the given thread-local cache back to the
  // allocator. `stack` is not used by this body.
  void CommitBack(MemprofThreadLocalMallocStorage *ms,
                  BufferedStackTrace *stack) {
    AllocatorCache *ac = GetAllocatorCache(ms);
    allocator.SwallowCache(ac);
  }

  // -------------------------- Chunk lookup ----------------------

  // Returns the live chunk stored in the allocator block starting at
  // alloc_beg and writes its requested size to user_requested_size, or
  // returns nullptr when there is no live chunk.
  // Assumes alloc_beg == allocator.GetBlockBegin(alloc_beg).
  MemprofChunk *GetMemprofChunk(void *alloc_beg, u64 &user_requested_size) {
    if (!alloc_beg)
      return nullptr;
    // First try the redirect record written for blocks with left padding.
    MemprofChunk *p = reinterpret_cast<LargeChunkHeader *>(alloc_beg)->Get();
    if (!p) {
      // No redirect: only primary-allocator blocks are treated as starting
      // directly with the chunk header.
      if (!allocator.FromPrimary(alloc_beg))
        return nullptr;
      p = reinterpret_cast<MemprofChunk *>(alloc_beg);
    }
    // The size is reset to 0 on deallocation (and a min of 1 on
    // allocation).
    user_requested_size =
        atomic_load(&p->user_requested_size, memory_order_acquire);
    if (user_requested_size)
      return p;
    return nullptr;
  }

  // Same as GetMemprofChunk, starting from any address inside the block.
  MemprofChunk *GetMemprofChunkByAddr(uptr p, u64 &user_requested_size) {
    void *alloc_beg = allocator.GetBlockBegin(reinterpret_cast<void *>(p));
    return GetMemprofChunk(alloc_beg, user_requested_size);
  }

  // Returns the requested size of the live allocation whose user memory
  // starts exactly at p, or 0 if there is none.
  uptr AllocationSize(uptr p) {
    u64 user_requested_size;
    MemprofChunk *m = GetMemprofChunkByAddr(p, user_requested_size);
    if (!m)
      return 0;
    if (m->Beg() != p)
      return 0;
    return user_requested_size;
  }

  // `stack` is not used by this body.
  void Purge(BufferedStackTrace *stack) { allocator.ForceReleaseToOS(); }

  void PrintStats() { allocator.PrintStats(); }

  // Locks the underlying allocator first, then the fallback mutex.
  void ForceLock() NO_THREAD_SAFETY_ANALYSIS {
    allocator.ForceLock();
    fallback_mutex.Lock();
  }

  // Releases the locks in the reverse order of ForceLock().
  void ForceUnlock() NO_THREAD_SAFETY_ANALYSIS {
    fallback_mutex.Unlock();
    allocator.ForceUnlock();
  }
};

// The single allocator object that all entry points in this file go through.
static Allocator instance(LINKER_INITIALIZED);

// Gives access to the underlying allocator owned by the global instance.
static MemprofAllocator &get_allocator() {
  MemprofAllocator &underlying = instance.allocator;
  return underlying;
}

// Configures the global allocator instance from the runtime flags.
void InitializeAllocator() {
  instance.InitLinkerInitialized();
}

// Returns the memory held in this thread-local storage's allocator cache to
// the global allocator.
void MemprofThreadLocalMallocStorage::CommitBack() {
  // The macro provides the `stack` variable that is passed on below.
  GET_STACK_TRACE_MALLOC;
  instance.CommitBack(this, &stack);
}

// Prints the statistics of the global allocator instance.
void PrintInternalAllocatorStats() {
  instance.PrintStats();
}

void memprof_free(void *ptr, BufferedStackTrace *stack, AllocType alloc_type) {
  instance.Deallocate(ptr, 0, 0, stack, alloc_type);
}

void memprof_delete(void *ptr, uptr size, uptr alignment,
                    BufferedStackTrace *stack, AllocType alloc_type) {
  instance.Deallocate(ptr, size, alignment, stack, alloc_type);
}

// malloc() entry point: allocates size bytes with 8-byte requested alignment
// and passes the result through SetErrnoOnNull.
void *memprof_malloc(uptr size, BufferedStackTrace *stack) {
  void *result = instance.Allocate(size, 8, stack, FROM_MALLOC);
  return SetErrnoOnNull(result);
}

// calloc() entry point: allocates nmemb * size bytes via Allocator::Calloc
// and passes the result through SetErrnoOnNull.
void *memprof_calloc(uptr nmemb, uptr size, BufferedStackTrace *stack) {
  void *result = instance.Calloc(nmemb, size, stack);
  return SetErrnoOnNull(result);
}

void *memprof_reallocarray(void *p, uptr nmemb, uptr size,
                           BufferedStackTrace *stack) {
  if (UNLIKELY(CheckForCallocOverflow(size, nmemb))) {
    errno = errno_ENOMEM;
    if (AllocatorMayReturnNull())
      return nullptr;
    ReportReallocArrayOverflow(nmemb, size, stack);
  }
  return memprof_realloc(p, nmemb * size, stack);
}

// realloc() entry point.
//  - A null p makes this a plain allocation.
//  - A zero size either frees p and returns null, or is bumped to 1, depending
//    on the allocator_frees_and_returns_null_on_realloc_zero flag.
//  - Everything else is handled by Allocator::Reallocate.
void *memprof_realloc(void *p, uptr size, BufferedStackTrace *stack) {
  if (p == nullptr) {
    void *fresh = instance.Allocate(size, 8, stack, FROM_MALLOC);
    return SetErrnoOnNull(fresh);
  }
  if (!size) {
    if (!flags()->allocator_frees_and_returns_null_on_realloc_zero) {
      // Allocate a size of 1 if we shouldn't free() on Realloc to 0
      size = 1;
    } else {
      instance.Deallocate(p, 0, 0, stack, FROM_MALLOC);
      return nullptr;
    }
  }
  void *moved = instance.Reallocate(p, size, stack);
  return SetErrnoOnNull(moved);
}

// valloc() entry point: allocates size bytes aligned to the page size.
void *memprof_valloc(uptr size, BufferedStackTrace *stack) {
  const uptr page_size = GetPageSizeCached();
  void *result = instance.Allocate(size, page_size, stack, FROM_MALLOC);
  return SetErrnoOnNull(result);
}

// pvalloc() entry point: rounds size up to a whole number of pages and
// allocates it page-aligned. Requests that overflow while rounding set errno
// to ENOMEM.
void *memprof_pvalloc(uptr size, BufferedStackTrace *stack) {
  const uptr page = GetPageSizeCached();
  if (UNLIKELY(CheckForPvallocOverflow(size, page))) {
    errno = errno_ENOMEM;
    if (AllocatorMayReturnNull())
      return nullptr;
    ReportPvallocOverflow(size, stack);
  }
  // pvalloc(0) should allocate one page.
  if (size)
    size = RoundUpTo(size, page);
  else
    size = page;
  void *result = instance.Allocate(size, page, stack, FROM_MALLOC);
  return SetErrnoOnNull(result);
}

// memalign() entry point: allocates size bytes at the requested alignment.
// An alignment that is not a power of two sets errno to EINVAL.
void *memprof_memalign(uptr alignment, uptr size, BufferedStackTrace *stack,
                       AllocType alloc_type) {
  const bool valid_alignment = IsPowerOfTwo(alignment);
  if (UNLIKELY(!valid_alignment)) {
    errno = errno_EINVAL;
    if (AllocatorMayReturnNull())
      return nullptr;
    ReportInvalidAllocationAlignment(alignment, stack);
  }
  void *result = instance.Allocate(size, alignment, stack, alloc_type);
  return SetErrnoOnNull(result);
}

// aligned_alloc() entry point: allocates size bytes at the requested
// alignment once CheckAlignedAllocAlignmentAndSize accepts the pair; a
// rejected pair sets errno to EINVAL.
void *memprof_aligned_alloc(uptr alignment, uptr size,
                            BufferedStackTrace *stack) {
  const bool valid_request = CheckAlignedAllocAlignmentAndSize(alignment, size);
  if (UNLIKELY(!valid_request)) {
    errno = errno_EINVAL;
    if (AllocatorMayReturnNull())
      return nullptr;
    ReportInvalidAlignedAllocAlignment(size, alignment, stack);
  }
  void *result = instance.Allocate(size, alignment, stack, FROM_MALLOC);
  return SetErrnoOnNull(result);
}

int memprof_posix_memalign(void **memptr, uptr alignment, uptr size,
                           BufferedStackTrace *stack) {
  if (UNLIKELY(!CheckPosixMemalignAlignment(alignment))) {
    if (AllocatorMayReturnNull())
      return errno_EINVAL;
    ReportInvalidPosixMemalignAlignment(alignment, stack);
  }
  void *ptr = instance.Allocate(size, alignment, stack, FROM_MALLOC);
  if (UNLIKELY(!ptr))
    // OOM error is already taken care of by Allocate.
    return errno_ENOMEM;
  CHECK(IsAligned((uptr)ptr, alignment));
  *memptr = ptr;
  return 0;
}

// malloc_usable_size() entry point: returns the requested size of the live
// allocation starting at ptr, or 0 for null and unknown pointers. pc and bp
// are not used.
uptr memprof_malloc_usable_size(const void *ptr, uptr pc, uptr bp) {
  if (ptr == nullptr)
    return 0;
  return instance.AllocationSize(reinterpret_cast<uptr>(ptr));
}

void MemprofSoftRssLimitExceededCallback(bool limit_exceeded) {
  instance.SetRssLimitExceeded(limit_exceeded);
}

} // namespace __memprof

// ---------------------- Interface ---------------- {{{1
using namespace __memprof;

#if !SANITIZER_SUPPORTS_WEAK_HOOKS
// Provide default (no-op) implementation of malloc hooks.
// These definitions are only compiled when SANITIZER_SUPPORTS_WEAK_HOOKS is
// false.
SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_malloc_hook, void *ptr,
                             uptr size) {
  // Arguments are intentionally ignored.
  (void)ptr;
  (void)size;
}

SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_free_hook, void *ptr) {
  // Argument is intentionally ignored.
  (void)ptr;
}
#endif

// The estimate is simply the size that was asked for.
uptr __sanitizer_get_estimated_allocated_size(uptr size) {
  return size;
}

int __sanitizer_get_ownership(const void *p) {
  return memprof_malloc_usable_size(p, 0, 0) != 0;
}

// Returns the size of the live allocation starting at p, or 0 if there is
// none.
uptr __sanitizer_get_allocated_size(const void *p) {
  const uptr usable = memprof_malloc_usable_size(p, 0, 0);
  return usable;
}

// Prints the profile collected so far through the global allocator instance.
// Always returns 0.
int __memprof_profile_dump() {
  const int kSuccess = 0;
  instance.FinishAndPrint();
  // In the future we may want to return non-zero if there are any errors
  // detected during the dumping process.
  return kSuccess;
}