//===-- sanitizer_allocator_primary64.h -------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Part of the Sanitizer Allocator.
//
//===----------------------------------------------------------------------===//
#ifndef SANITIZER_ALLOCATOR_H
#error This file must be included inside sanitizer_allocator.h
#endif

template<class SizeClassAllocator> struct SizeClassAllocator64LocalCache;

// SizeClassAllocator64 -- allocator for 64-bit address space.
// The template parameter Params is a class containing the actual parameters.
//
// Space: a portion of address space of kSpaceSize bytes starting at SpaceBeg.
// If kSpaceBeg is ~0 then SpaceBeg is chosen dynamically by mmap.
// Otherwise SpaceBeg=kSpaceBeg (fixed address).
// kSpaceSize is a power of two.
// At the beginning the entire space is mprotect-ed, then small parts of it
// are mapped on demand.
//
// Region: a part of Space dedicated to a single size class.
// There are kNumClasses Regions of equal size.
//
// UserChunk: a piece of memory returned to user.
// MetaChunk: kMetadataSize bytes of metadata associated with a UserChunk.
//
// FreeArray is an array of free-d chunks (stored as 4-byte offsets).
//
// A Region looks like this:
// UserChunk1 ... UserChunkN <gap> MetaChunkN ... MetaChunk1 FreeArray
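//
// Illustrative arithmetic (hypothetical parameters, not taken from any
// particular tool's config): with kSpaceSize = 2^40 and
// kNumClassesRounded = 64, each Region is kRegionSize = 2^34 bytes and the
// Region for class_id starts at SpaceBeg() + class_id * kRegionSize, which
// satisfies the 2^32 <= kRegionSize <= 2^36 compile-time checks below.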

struct SizeClassAllocator64FlagMasks {  //  Bit masks.
  enum {
    kRandomShuffleChunks = 1,
  };
};

template <class Params>
class SizeClassAllocator64 {
 public:
  static const uptr kSpaceBeg = Params::kSpaceBeg;
  static const uptr kSpaceSize = Params::kSpaceSize;
  static const uptr kMetadataSize = Params::kMetadataSize;
  typedef typename Params::SizeClassMap SizeClassMap;
  typedef typename Params::MapUnmapCallback MapUnmapCallback;

  static const bool kRandomShuffleChunks =
      Params::kFlags & SizeClassAllocator64FlagMasks::kRandomShuffleChunks;

  typedef SizeClassAllocator64<Params> ThisT;
  typedef SizeClassAllocator64LocalCache<ThisT> AllocatorCache;

  // When we know the size class (the region base) we can represent a pointer
  // as a 4-byte integer (offset from the region start shifted right by 4).
  typedef u32 CompactPtrT;
  static const uptr kCompactPtrScale = 4;
  CompactPtrT PointerToCompactPtr(uptr base, uptr ptr) const {
    return static_cast<CompactPtrT>((ptr - base) >> kCompactPtrScale);
  }
  uptr CompactPtrToPointer(uptr base, CompactPtrT ptr32) const {
    return base + (static_cast<uptr>(ptr32) << kCompactPtrScale);
  }
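  // Worked example (illustrative values only): with kCompactPtrScale == 4, a
  // chunk at base + 0x12340 compacts to 0x1234, and
  // CompactPtrToPointer(base, 0x1234) == base + 0x12340. The round trip is
  // exact only because chunk offsets within a region are assumed to be
  // multiples of 16 and a region is at most 2^36 bytes.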

  void Init(s32 release_to_os_interval_ms) {
    uptr TotalSpaceSize = kSpaceSize + AdditionalSize();
    if (kUsingConstantSpaceBeg) {
      CHECK_EQ(kSpaceBeg, address_range.Init(TotalSpaceSize, AllocatorName(),
                                             kSpaceBeg));
    } else {
      NonConstSpaceBeg = address_range.Init(TotalSpaceSize, AllocatorName());
      CHECK_NE(NonConstSpaceBeg, ~(uptr)0);
    }
    SetReleaseToOSIntervalMs(release_to_os_interval_ms);
    MapWithCallbackOrDie(SpaceEnd(), AdditionalSize());
  }

  s32 ReleaseToOSIntervalMs() const {
    return atomic_load(&release_to_os_interval_ms_, memory_order_relaxed);
  }

  void SetReleaseToOSIntervalMs(s32 release_to_os_interval_ms) {
    atomic_store(&release_to_os_interval_ms_, release_to_os_interval_ms,
                 memory_order_relaxed);
  }

  void ForceReleaseToOS() {
    for (uptr class_id = 1; class_id < kNumClasses; class_id++) {
      BlockingMutexLock l(&GetRegionInfo(class_id)->mutex);
      MaybeReleaseToOS(class_id, true /*force*/);
    }
  }

  static bool CanAllocate(uptr size, uptr alignment) {
    return size <= SizeClassMap::kMaxSize &&
      alignment <= SizeClassMap::kMaxSize;
  }

  NOINLINE void ReturnToAllocator(AllocatorStats *stat, uptr class_id,
                                  const CompactPtrT *chunks, uptr n_chunks) {
    RegionInfo *region = GetRegionInfo(class_id);
    uptr region_beg = GetRegionBeginBySizeClass(class_id);
    CompactPtrT *free_array = GetFreeArray(region_beg);

    BlockingMutexLock l(&region->mutex);
    uptr old_num_chunks = region->num_freed_chunks;
    uptr new_num_freed_chunks = old_num_chunks + n_chunks;
    // Failure to allocate free array space while releasing memory is non
    // recoverable.
    if (UNLIKELY(!EnsureFreeArraySpace(region, region_beg,
                                       new_num_freed_chunks)))
      DieOnFailure::OnOOM();
    for (uptr i = 0; i < n_chunks; i++)
      free_array[old_num_chunks + i] = chunks[i];
    region->num_freed_chunks = new_num_freed_chunks;
    region->stats.n_freed += n_chunks;

    MaybeReleaseToOS(class_id, false /*force*/);
  }

  NOINLINE bool GetFromAllocator(AllocatorStats *stat, uptr class_id,
                                 CompactPtrT *chunks, uptr n_chunks) {
    RegionInfo *region = GetRegionInfo(class_id);
    uptr region_beg = GetRegionBeginBySizeClass(class_id);
    CompactPtrT *free_array = GetFreeArray(region_beg);

    BlockingMutexLock l(&region->mutex);
    if (UNLIKELY(region->num_freed_chunks < n_chunks)) {
      if (UNLIKELY(!PopulateFreeArray(stat, class_id, region,
                                      n_chunks - region->num_freed_chunks)))
        return false;
      CHECK_GE(region->num_freed_chunks, n_chunks);
    }
    region->num_freed_chunks -= n_chunks;
    uptr base_idx = region->num_freed_chunks;
    for (uptr i = 0; i < n_chunks; i++)
      chunks[i] = free_array[base_idx + i];
    region->stats.n_allocated += n_chunks;
    return true;
  }
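  // Sketch of the expected calling pattern (simplified; the real logic lives
  // in the per-thread cache, see SizeClassAllocator64LocalCache; kBatch is
  // hypothetical):
  //   CompactPtrT batch[kBatch];
  //   if (allocator->GetFromAllocator(stat, class_id, batch, kBatch)) {
  //     uptr region_beg = allocator->GetRegionBeginBySizeClass(class_id);
  //     void *p = reinterpret_cast<void *>(
  //         allocator->CompactPtrToPointer(region_beg, batch[0]));
  //     // ... later, hand the chunks back in compact form:
  //     allocator->ReturnToAllocator(stat, class_id, batch, kBatch);
  //   }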

  bool PointerIsMine(const void *p) {
    uptr P = reinterpret_cast<uptr>(p);
    if (kUsingConstantSpaceBeg && (kSpaceBeg % kSpaceSize) == 0)
      return P / kSpaceSize == kSpaceBeg / kSpaceSize;
    return P >= SpaceBeg() && P < SpaceEnd();
  }

  uptr GetRegionBegin(const void *p) {
    if (kUsingConstantSpaceBeg)
      return reinterpret_cast<uptr>(p) & ~(kRegionSize - 1);
    uptr space_beg = SpaceBeg();
    return ((reinterpret_cast<uptr>(p) - space_beg) & ~(kRegionSize - 1)) +
        space_beg;
  }

  uptr GetRegionBeginBySizeClass(uptr class_id) const {
    return SpaceBeg() + kRegionSize * class_id;
  }

  uptr GetSizeClass(const void *p) {
    if (kUsingConstantSpaceBeg && (kSpaceBeg % kSpaceSize) == 0)
      return ((reinterpret_cast<uptr>(p)) / kRegionSize) % kNumClassesRounded;
    return ((reinterpret_cast<uptr>(p) - SpaceBeg()) / kRegionSize) %
           kNumClassesRounded;
  }
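  // Worked example (hypothetical layout): with SpaceBeg() == 0x600000000000
  // and kRegionSize == 2^34, a pointer at SpaceBeg() + 5 * kRegionSize + 0x100
  // yields GetSizeClass() == 5 and GetRegionBegin() == SpaceBeg() +
  // 5 * kRegionSize, since the offset from SpaceBeg() is simply divided (or
  // masked) by the region size.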

  void *GetBlockBegin(const void *p) {
    uptr class_id = GetSizeClass(p);
    uptr size = ClassIdToSize(class_id);
    if (!size) return nullptr;
    uptr chunk_idx = GetChunkIdx((uptr)p, size);
    uptr reg_beg = GetRegionBegin(p);
    uptr beg = chunk_idx * size;
    uptr next_beg = beg + size;
    if (class_id >= kNumClasses) return nullptr;
    RegionInfo *region = GetRegionInfo(class_id);
    if (region->mapped_user >= next_beg)
      return reinterpret_cast<void*>(reg_beg + beg);
    return nullptr;
  }

  uptr GetActuallyAllocatedSize(void *p) {
    CHECK(PointerIsMine(p));
    return ClassIdToSize(GetSizeClass(p));
  }

  uptr ClassID(uptr size) { return SizeClassMap::ClassID(size); }

  void *GetMetaData(const void *p) {
    uptr class_id = GetSizeClass(p);
    uptr size = ClassIdToSize(class_id);
    uptr chunk_idx = GetChunkIdx(reinterpret_cast<uptr>(p), size);
    uptr region_beg = GetRegionBeginBySizeClass(class_id);
    return reinterpret_cast<void *>(GetMetadataEnd(region_beg) -
                                    (1 + chunk_idx) * kMetadataSize);
  }
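  // Metadata is laid out backwards from the end of the region, just below the
  // free array: chunk 0's MetaChunk occupies the kMetadataSize bytes ending at
  // GetMetadataEnd(region_beg), chunk 1's the kMetadataSize bytes below that,
  // and so on, matching the "MetaChunkN ... MetaChunk1" picture at the top of
  // this file.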

  uptr TotalMemoryUsed() {
    uptr res = 0;
    for (uptr i = 0; i < kNumClasses; i++)
      res += GetRegionInfo(i)->allocated_user;
    return res;
  }

  // Test-only.
  void TestOnlyUnmap() {
    UnmapWithCallbackOrDie(SpaceBeg(), kSpaceSize + AdditionalSize());
  }

  static void FillMemoryProfile(uptr start, uptr rss, bool file, uptr *stats,
                                uptr stats_size) {
    for (uptr class_id = 0; class_id < stats_size; class_id++)
      if (stats[class_id] == start)
        stats[class_id] = rss;
  }

  void PrintStats(uptr class_id, uptr rss) {
    RegionInfo *region = GetRegionInfo(class_id);
    if (region->mapped_user == 0) return;
    uptr in_use = region->stats.n_allocated - region->stats.n_freed;
    uptr avail_chunks = region->allocated_user / ClassIdToSize(class_id);
    Printf(
        "%s %02zd (%6zd): mapped: %6zdK allocs: %7zd frees: %7zd inuse: %6zd "
        "num_freed_chunks %7zd avail: %6zd rss: %6zdK releases: %6zd "
        "last released: %6zdK region: 0x%zx\n",
        region->exhausted ? "F" : " ", class_id, ClassIdToSize(class_id),
        region->mapped_user >> 10, region->stats.n_allocated,
        region->stats.n_freed, in_use, region->num_freed_chunks, avail_chunks,
        rss >> 10, region->rtoi.num_releases,
        region->rtoi.last_released_bytes >> 10,
        SpaceBeg() + kRegionSize * class_id);
  }

  void PrintStats() {
    uptr rss_stats[kNumClasses];
    for (uptr class_id = 0; class_id < kNumClasses; class_id++)
      rss_stats[class_id] = SpaceBeg() + kRegionSize * class_id;
    GetMemoryProfile(FillMemoryProfile, rss_stats, kNumClasses);

    uptr total_mapped = 0;
    uptr total_rss = 0;
    uptr n_allocated = 0;
    uptr n_freed = 0;
    for (uptr class_id = 1; class_id < kNumClasses; class_id++) {
      RegionInfo *region = GetRegionInfo(class_id);
      if (region->mapped_user != 0) {
        total_mapped += region->mapped_user;
        total_rss += rss_stats[class_id];
      }
      n_allocated += region->stats.n_allocated;
      n_freed += region->stats.n_freed;
    }

    Printf("Stats: SizeClassAllocator64: %zdM mapped (%zdM rss) in "
           "%zd allocations; remains %zd\n", total_mapped >> 20,
           total_rss >> 20, n_allocated, n_allocated - n_freed);
    for (uptr class_id = 1; class_id < kNumClasses; class_id++)
      PrintStats(class_id, rss_stats[class_id]);
  }

  // ForceLock() and ForceUnlock() are needed to implement Darwin malloc zone
  // introspection API.
  void ForceLock() {
    for (uptr i = 0; i < kNumClasses; i++) {
      GetRegionInfo(i)->mutex.Lock();
    }
  }

  void ForceUnlock() {
    for (int i = (int)kNumClasses - 1; i >= 0; i--) {
      GetRegionInfo(i)->mutex.Unlock();
    }
  }

  // Iterate over all existing chunks.
  // The allocator must be locked when calling this function.
  void ForEachChunk(ForEachChunkCallback callback, void *arg) {
    for (uptr class_id = 1; class_id < kNumClasses; class_id++) {
      RegionInfo *region = GetRegionInfo(class_id);
      uptr chunk_size = ClassIdToSize(class_id);
      uptr region_beg = SpaceBeg() + class_id * kRegionSize;
      for (uptr chunk = region_beg;
           chunk < region_beg + region->allocated_user;
           chunk += chunk_size) {
        // Too slow: CHECK_EQ((void *)chunk, GetBlockBegin((void *)chunk));
        callback(chunk, arg);
      }
    }
  }

  static uptr ClassIdToSize(uptr class_id) {
    return SizeClassMap::Size(class_id);
  }

  static uptr AdditionalSize() {
    return RoundUpTo(sizeof(RegionInfo) * kNumClassesRounded,
                     GetPageSizeCached());
  }

  typedef SizeClassMap SizeClassMapT;
  static const uptr kNumClasses = SizeClassMap::kNumClasses;
  static const uptr kNumClassesRounded = SizeClassMap::kNumClassesRounded;

  // A packed array of counters. Each counter occupies 2^n bits, enough to
  // store counter's max_value. Ctor will try to allocate the required buffer
  // via mapper->MapPackedCounterArrayBuffer and the caller is expected to
  // check whether the initialization was successful by checking IsAllocated()
  // result. For performance reasons, none of the accessors check the validity
  // of the arguments: it is assumed that the index is always in the [0, n)
  // range and the value is never incremented past max_value.
  template<class MemoryMapperT>
  class PackedCounterArray {
   public:
    PackedCounterArray(u64 num_counters, u64 max_value, MemoryMapperT *mapper)
        : n(num_counters), memory_mapper(mapper) {
      CHECK_GT(num_counters, 0);
      CHECK_GT(max_value, 0);
      constexpr u64 kMaxCounterBits = sizeof(*buffer) * 8ULL;
      // Rounding counter storage size up to the power of two allows for using
      // bit shifts calculating particular counter's index and offset.
      uptr counter_size_bits =
          RoundUpToPowerOfTwo(MostSignificantSetBitIndex(max_value) + 1);
      CHECK_LE(counter_size_bits, kMaxCounterBits);
      counter_size_bits_log = Log2(counter_size_bits);
      counter_mask = ~0ULL >> (kMaxCounterBits - counter_size_bits);

      uptr packing_ratio = kMaxCounterBits >> counter_size_bits_log;
      CHECK_GT(packing_ratio, 0);
      packing_ratio_log = Log2(packing_ratio);
      bit_offset_mask = packing_ratio - 1;

      buffer_size =
          (RoundUpTo(n, 1ULL << packing_ratio_log) >> packing_ratio_log) *
          sizeof(*buffer);
      buffer = reinterpret_cast<u64*>(
          memory_mapper->MapPackedCounterArrayBuffer(buffer_size));
    }
    ~PackedCounterArray() {
      if (buffer) {
        memory_mapper->UnmapPackedCounterArrayBuffer(
            reinterpret_cast<uptr>(buffer), buffer_size);
      }
    }

    bool IsAllocated() const {
      return !!buffer;
    }

    u64 GetCount() const {
      return n;
    }

    uptr Get(uptr i) const {
      DCHECK_LT(i, n);
      uptr index = i >> packing_ratio_log;
      uptr bit_offset = (i & bit_offset_mask) << counter_size_bits_log;
      return (buffer[index] >> bit_offset) & counter_mask;
    }

    void Inc(uptr i) const {
      DCHECK_LT(Get(i), counter_mask);
      uptr index = i >> packing_ratio_log;
      uptr bit_offset = (i & bit_offset_mask) << counter_size_bits_log;
      buffer[index] += 1ULL << bit_offset;
    }

    void IncRange(uptr from, uptr to) const {
      DCHECK_LE(from, to);
      for (uptr i = from; i <= to; i++)
        Inc(i);
    }

   private:
    const u64 n;
    u64 counter_size_bits_log;
    u64 counter_mask;
    u64 packing_ratio_log;
    u64 bit_offset_mask;

    MemoryMapperT* const memory_mapper;
    u64 buffer_size;
    u64* buffer;
  };
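
  // Illustrative sizing (hypothetical numbers): for 17 pages with
  // max_value == 86, a counter needs 7 bits, which is rounded up to 8, so 8
  // counters are packed per u64 and the buffer is RoundUpTo(17, 8) / 8 == 3
  // u64s, i.e. 24 bytes. Inc(i) and Get(i) then address counter i with shifts
  // and masks only.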

  template<class MemoryMapperT>
  class FreePagesRangeTracker {
   public:
    explicit FreePagesRangeTracker(MemoryMapperT* mapper)
        : memory_mapper(mapper),
          page_size_scaled_log(Log2(GetPageSizeCached() >> kCompactPtrScale)),
          in_the_range(false), current_page(0), current_range_start_page(0) {}

    void NextPage(bool freed) {
      if (freed) {
        if (!in_the_range) {
          current_range_start_page = current_page;
          in_the_range = true;
        }
      } else {
        CloseOpenedRange();
      }
      current_page++;
    }

    void Done() {
      CloseOpenedRange();
    }

   private:
    void CloseOpenedRange() {
      if (in_the_range) {
        memory_mapper->ReleasePageRangeToOS(
            current_range_start_page << page_size_scaled_log,
            current_page << page_size_scaled_log);
        in_the_range = false;
      }
    }

    MemoryMapperT* const memory_mapper;
    const uptr page_size_scaled_log;
    bool in_the_range;
    uptr current_page;
    uptr current_range_start_page;
  };
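
  // Usage sketch (illustrative): feeding NextPage() the per-page "fully free"
  // verdicts true, true, false, true and then calling Done() produces two
  // ReleasePageRangeToOS() calls, one for pages [0, 2) and one for page 3,
  // with both bounds expressed as compacted (>> kCompactPtrScale) offsets.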

  // Iterates over the free_array to identify memory pages containing freed
  // chunks only and returns these pages back to OS.
  // allocated_pages_count is the total number of pages allocated for the
  // current bucket.
  template<class MemoryMapperT>
  static void ReleaseFreeMemoryToOS(CompactPtrT *free_array,
                                    uptr free_array_count, uptr chunk_size,
                                    uptr allocated_pages_count,
                                    MemoryMapperT *memory_mapper) {
    const uptr page_size = GetPageSizeCached();

    // Figure out the number of chunks per page and whether we can take a fast
    // path (the number of chunks per page is the same for all pages).
    uptr full_pages_chunk_count_max;
    bool same_chunk_count_per_page;
    if (chunk_size <= page_size && page_size % chunk_size == 0) {
      // Same number of chunks per page, no cross overs.
      full_pages_chunk_count_max = page_size / chunk_size;
      same_chunk_count_per_page = true;
    } else if (chunk_size <= page_size && page_size % chunk_size != 0 &&
               chunk_size % (page_size % chunk_size) == 0) {
      // Some chunks are crossing page boundaries, which means that the page
      // contains one or two partial chunks, but all pages contain the same
      // number of chunks.
      full_pages_chunk_count_max = page_size / chunk_size + 1;
      same_chunk_count_per_page = true;
    } else if (chunk_size <= page_size) {
      // Some chunks are crossing page boundaries, which means that the page
      // contains one or two partial chunks.
      full_pages_chunk_count_max = page_size / chunk_size + 2;
      same_chunk_count_per_page = false;
    } else if (chunk_size > page_size && chunk_size % page_size == 0) {
      // One chunk covers multiple pages, no cross overs.
      full_pages_chunk_count_max = 1;
      same_chunk_count_per_page = true;
    } else if (chunk_size > page_size) {
      // One chunk covers multiple pages, some chunks are crossing page
      // boundaries. Some pages contain one chunk, some contain two.
      full_pages_chunk_count_max = 2;
      same_chunk_count_per_page = false;
    } else {
      UNREACHABLE("All chunk_size/page_size ratios must be handled.");
    }
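
    // Worked examples (hypothetical 4096-byte pages): with 64-byte chunks
    // every page holds exactly 64 chunks (first branch); with 48-byte chunks,
    // 4096 % 48 == 16 and 48 % 16 == 0, so every page intersects exactly
    // 4096 / 48 + 1 == 86 chunks (second branch); with 112-byte chunks the
    // per-page count varies between 37 and 38, so the slow path below is used.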

    PackedCounterArray<MemoryMapperT> counters(allocated_pages_count,
                                               full_pages_chunk_count_max,
                                               memory_mapper);
    if (!counters.IsAllocated())
      return;

    const uptr chunk_size_scaled = chunk_size >> kCompactPtrScale;
    const uptr page_size_scaled = page_size >> kCompactPtrScale;
    const uptr page_size_scaled_log = Log2(page_size_scaled);

    // Iterate over free chunks and count how many free chunks affect each
    // allocated page.
    if (chunk_size <= page_size && page_size % chunk_size == 0) {
      // Each chunk affects one page only.
      for (uptr i = 0; i < free_array_count; i++)
        counters.Inc(free_array[i] >> page_size_scaled_log);
    } else {
      // In all other cases chunks might affect more than one page.
      for (uptr i = 0; i < free_array_count; i++) {
        counters.IncRange(
            free_array[i] >> page_size_scaled_log,
            (free_array[i] + chunk_size_scaled - 1) >> page_size_scaled_log);
      }
    }

    // Iterate over pages detecting ranges of pages with chunk counters equal
    // to the expected number of chunks for the particular page.
    FreePagesRangeTracker<MemoryMapperT> range_tracker(memory_mapper);
    if (same_chunk_count_per_page) {
      // Fast path, every page has the same number of chunks affecting it.
      for (uptr i = 0; i < counters.GetCount(); i++)
        range_tracker.NextPage(counters.Get(i) == full_pages_chunk_count_max);
    } else {
      // Slow path, go through the pages keeping track of how many chunks
      // affect each page.
      const uptr pn =
          chunk_size < page_size ? page_size_scaled / chunk_size_scaled : 1;
      const uptr pnc = pn * chunk_size_scaled;
      // The idea is to increment the current page pointer by the first chunk
      // size, middle portion size (the portion of the page covered by chunks
      // except the first and the last one) and then the last chunk size,
      // adding up the number of chunks on the current page and checking on
      // every step whether the page boundary was crossed.
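      // Illustrative walk (hypothetical 4096-byte pages, 112-byte chunks,
      // values scaled by 16): pn == 256 / 7 == 36 and pnc == 252. For page 0
      // the expected count stays at pn + 1 == 37 (one extra partial chunk at
      // the end); for page 1 both the leading and trailing partial chunks are
      // added, giving 38. Only pages whose free-chunk counter reaches the
      // expected total are handed to the range tracker as releasable.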
      uptr prev_page_boundary = 0;
      uptr current_boundary = 0;
      for (uptr i = 0; i < counters.GetCount(); i++) {
        uptr page_boundary = prev_page_boundary + page_size_scaled;
        uptr chunks_per_page = pn;
        if (current_boundary < page_boundary) {
          if (current_boundary > prev_page_boundary)
            chunks_per_page++;
          current_boundary += pnc;
          if (current_boundary < page_boundary) {
            chunks_per_page++;
            current_boundary += chunk_size_scaled;
          }
        }
        prev_page_boundary = page_boundary;

        range_tracker.NextPage(counters.Get(i) == chunks_per_page);
      }
    }
    range_tracker.Done();
  }

 private:
  friend class MemoryMapper;

  ReservedAddressRange address_range;
  static const char *AllocatorName() { return "sanitizer_allocator"; }

  static const uptr kRegionSize = kSpaceSize / kNumClassesRounded;
  // FreeArray is the array of free-d chunks (stored as 4-byte offsets).
  // In the worst case it may require kRegionSize/SizeClassMap::kMinSize
  // elements, but in reality this will not happen. For simplicity we
  // dedicate 1/8 of the region's virtual space to FreeArray.
  static const uptr kFreeArraySize = kRegionSize / 8;
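  // Back-of-the-envelope check (assuming a hypothetical 16-byte minimum chunk
  // size): a fully freed region would need kRegionSize / 16 offsets, i.e.
  // kRegionSize / 4 bytes of CompactPtrT, which is twice the kRegionSize / 8
  // budget above; the 1/8 share therefore relies on free lists never coming
  // close to that worst case.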

  static const bool kUsingConstantSpaceBeg = kSpaceBeg != ~(uptr)0;
  uptr NonConstSpaceBeg;
  uptr SpaceBeg() const {
    return kUsingConstantSpaceBeg ? kSpaceBeg : NonConstSpaceBeg;
  }
  uptr SpaceEnd() const { return SpaceBeg() + kSpaceSize; }
  // kRegionSize must be >= 2^32.
  COMPILER_CHECK((kRegionSize) >= (1ULL << (SANITIZER_WORDSIZE / 2)));
  // kRegionSize must be <= 2^36, see CompactPtrT.
  COMPILER_CHECK((kRegionSize) <= (1ULL << (SANITIZER_WORDSIZE / 2 + 4)));
  // Call mmap for user memory with at least this size.
  static const uptr kUserMapSize = 1 << 16;
  // Call mmap for metadata memory with at least this size.
  static const uptr kMetaMapSize = 1 << 16;
  // Call mmap for free array memory with at least this size.
  static const uptr kFreeArrayMapSize = 1 << 16;

  atomic_sint32_t release_to_os_interval_ms_;

  struct Stats {
    uptr n_allocated;
    uptr n_freed;
  };

  struct ReleaseToOsInfo {
    uptr n_freed_at_last_release;
    uptr num_releases;
    u64 last_release_at_ns;
    u64 last_released_bytes;
  };

  struct RegionInfo {
    BlockingMutex mutex;
    uptr num_freed_chunks;  // Number of elements in the freearray.
    uptr mapped_free_array;  // Bytes mapped for freearray.
    uptr allocated_user;  // Bytes allocated for user memory.
    uptr allocated_meta;  // Bytes allocated for metadata.
    uptr mapped_user;  // Bytes mapped for user memory.
    uptr mapped_meta;  // Bytes mapped for metadata.
    u32 rand_state;  // Seed for random shuffle, used if kRandomShuffleChunks.
    bool exhausted;  // Whether region is out of space for new chunks.
    Stats stats;
    ReleaseToOsInfo rtoi;
  };
  COMPILER_CHECK(sizeof(RegionInfo) >= kCacheLineSize);

  RegionInfo *GetRegionInfo(uptr class_id) const {
    CHECK_LT(class_id, kNumClasses);
    RegionInfo *regions =
        reinterpret_cast<RegionInfo *>(SpaceBeg() + kSpaceSize);
    return &regions[class_id];
  }

  uptr GetMetadataEnd(uptr region_beg) const {
    return region_beg + kRegionSize - kFreeArraySize;
  }

  uptr GetChunkIdx(uptr chunk, uptr size) const {
    if (!kUsingConstantSpaceBeg)
      chunk -= SpaceBeg();

    uptr offset = chunk % kRegionSize;
    // Here we divide by a non-constant. This is costly.
    // size always fits into 32-bits. If the offset fits too, use 32-bit div.
    if (offset >> (SANITIZER_WORDSIZE / 2))
      return offset / size;
    return (u32)offset / (u32)size;
  }
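  // Example (illustrative values): for a 48-byte size class, an in-region
  // offset of 0x190 (400) maps to chunk index 400 / 48 == 8. The offset can
  // be as large as kRegionSize (up to 2^36), so the cheaper 32-bit division
  // is only taken when the offset actually fits in 32 bits.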

  CompactPtrT *GetFreeArray(uptr region_beg) const {
    return reinterpret_cast<CompactPtrT *>(GetMetadataEnd(region_beg));
  }

  bool MapWithCallback(uptr beg, uptr size) {
    uptr mapped = address_range.Map(beg, size);
    if (UNLIKELY(!mapped))
      return false;
    CHECK_EQ(beg, mapped);
    MapUnmapCallback().OnMap(beg, size);
    return true;
  }

  void MapWithCallbackOrDie(uptr beg, uptr size) {
    CHECK_EQ(beg, address_range.MapOrDie(beg, size));
    MapUnmapCallback().OnMap(beg, size);
  }

  void UnmapWithCallbackOrDie(uptr beg, uptr size) {
    MapUnmapCallback().OnUnmap(beg, size);
    address_range.Unmap(beg, size);
  }
bool EnsureFreeArraySpace(RegionInfo *region, uptr region_beg,
|
2016-08-25 05:20:10 +08:00
|
|
|
uptr num_freed_chunks) {
|
|
|
|
uptr needed_space = num_freed_chunks * sizeof(CompactPtrT);
|
|
|
|
if (region->mapped_free_array < needed_space) {
|
|
|
|
uptr new_mapped_free_array = RoundUpTo(needed_space, kFreeArrayMapSize);
|
[Sanitizers] Allocator: new "release memory to OS" implementation
Summary:
The current implementation of the allocator returning freed memory
back to OS (controlled by allocator_release_to_os_interval_ms flag)
requires sorting of the free chunks list, which has two major issues:
first, when the free list grows to millions of chunks, sorting, even the
fastest one, is just too slow, and second, sorting chunks in place
is unacceptable for the Scudo allocator as it makes allocations more
predictable and less secure.
The proposed approach is linear in complexity (although it requires quite
a bit more temporary memory). The idea is to count the number of free
chunks on each memory page and release only the pages that contain nothing
but free chunks. It requires one iteration over the free list of chunks
and one iteration over the array of page counters. The obvious disadvantage
is the allocation of the array of counters, but even in the worst
case we support (4T allocator space, 64 buckets, 16-byte bucket size,
full free list, which leads to 2 bytes per page counter and ~17M page
counters), it requires only about 34MB for the intermediate buffer (compared
to ~64GB of actually allocated chunks); usually it stays under 100K
and is released after each use. Releasing memory back to OS is expected
to be a relatively rare event, so keeping the buffer between those runs,
with the added bookkeeping complexity, seems unnecessary here (it can
always be improved later, though; never say never).
The most interesting problem here is how to calculate the number of chunks
falling into each memory page in the bucket. Skipping all the details,
there are three cases when the number of chunks per page is constant:
1) P >= C, P % C == 0 --> N = P / C
2) C > P,  C % P == 0 --> N = 1
3) C <= P, P % C != 0 && C % (P % C) == 0 --> N = P / C + 1
where P is the page size, C is the chunk size and N is the number of
chunks per page. In the remaining cases, the number of chunks per page
is calculated on the fly, during the iteration over the page counter array.
Among those remaining cases, there are still some where N can be deduced
from the page index, but they require not much less calculation per page
than the current "brute force" way, and 2/3 of the buckets fall into
the first three categories anyway, so, for the sake of simplicity,
it was decided to stick to those two variations. It can always be
refined and improved later, should we see that the brute force way slows
us down unacceptably.
Reviewers: eugenis, cryptoad, dvyukov
Subscribers: kubamracek, mehdi_amini, llvm-commits
Differential Revision: https://reviews.llvm.org/D38245
llvm-svn: 314311
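As a quick worked example of the three constant-N cases above (assuming a
4096-byte page, purely for illustration):

  P = 4096, C = 16:   P % C == 0                 --> N = 4096 / 16 = 256      (case 1)
  P = 4096, C = 8192: C % P == 0                 --> N = 1                    (case 2)
  P = 4096, C = 48:   P % C == 16, C % 16 == 0   --> N = 4096 / 48 + 1 = 86   (case 3)

All other chunk sizes fall back to computing the per-page count while walking
the page counter array.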
2017-09-27 23:38:05 +08:00
|
|
|
CHECK_LE(new_mapped_free_array, kFreeArraySize);
|
2016-08-25 05:20:10 +08:00
|
|
|
uptr current_map_end = reinterpret_cast<uptr>(GetFreeArray(region_beg)) +
|
|
|
|
region->mapped_free_array;
|
|
|
|
uptr new_map_size = new_mapped_free_array - region->mapped_free_array;
|
2017-06-27 06:54:10 +08:00
|
|
|
if (UNLIKELY(!MapWithCallback(current_map_end, new_map_size)))
|
|
|
|
return false;
|
2016-08-25 05:20:10 +08:00
|
|
|
region->mapped_free_array = new_mapped_free_array;
|
|
|
|
}
|
2017-06-27 06:54:10 +08:00
|
|
|
return true;
|
2016-08-25 05:20:10 +08:00
|
|
|
}
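  // Usage sketch (hypothetical caller, not one of the real call sites in
  // this file): grow the mapped part of the free array before appending n
  // freed chunks, then store their compact pointers at the tail.
  //   if (UNLIKELY(!EnsureFreeArraySpace(region, region_beg,
  //                                      region->num_freed_chunks + n)))
  //     return false;
  //   CompactPtrT *free_array = GetFreeArray(region_beg);
  //   for (uptr i = 0; i < n; i++)
  //     free_array[region->num_freed_chunks + i] = chunks[i];
  //   region->num_freed_chunks += n;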
|
|
|
|
|
2017-12-05 02:56:38 +08:00
|
|
|
// Check whether this size class is exhausted.
|
|
|
|
bool IsRegionExhausted(RegionInfo *region, uptr class_id,
|
|
|
|
uptr additional_map_size) {
|
|
|
|
if (LIKELY(region->mapped_user + region->mapped_meta +
|
|
|
|
additional_map_size <= kRegionSize - kFreeArraySize))
|
|
|
|
return false;
|
|
|
|
if (!region->exhausted) {
|
|
|
|
region->exhausted = true;
|
|
|
|
Printf("%s: Out of memory. ", SanitizerToolName);
|
|
|
|
Printf("The process has exhausted %zuMB for size class %zu.\n",
|
|
|
|
kRegionSize >> 20, ClassIdToSize(class_id));
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
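  // Illustration with hypothetical numbers (not the real constants): if
  // kRegionSize - kFreeArraySize left a budget of 1 GB and the region had
  // already mapped 768 MB of user memory plus 128 MB of metadata, any
  // additional_map_size larger than 128 MB would fail the check above and
  // the size class would be reported as exhausted (once).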
|
|
|
|
|
2017-06-27 06:54:10 +08:00
|
|
|
NOINLINE bool PopulateFreeArray(AllocatorStats *stat, uptr class_id,
|
2016-08-25 05:20:10 +08:00
|
|
|
RegionInfo *region, uptr requested_count) {
|
|
|
|
// region->mutex is held.
|
2017-06-27 06:54:10 +08:00
|
|
|
const uptr region_beg = GetRegionBeginBySizeClass(class_id);
|
2017-12-05 02:56:38 +08:00
|
|
|
const uptr size = ClassIdToSize(class_id);
|
2017-06-27 06:54:10 +08:00
|
|
|
|
2017-12-05 02:56:38 +08:00
|
|
|
const uptr total_user_bytes =
|
|
|
|
region->allocated_user + requested_count * size;
|
2017-06-27 06:54:10 +08:00
|
|
|
// Map more space for chunks, if necessary.
|
2017-12-05 02:56:38 +08:00
|
|
|
if (LIKELY(total_user_bytes > region->mapped_user)) {
|
2017-10-27 01:59:24 +08:00
|
|
|
if (UNLIKELY(region->mapped_user == 0)) {
|
|
|
|
if (!kUsingConstantSpaceBeg && kRandomShuffleChunks)
|
[sanitizer] Random shuffling of chunks for the 32-bit Primary Allocator
Summary:
The 64-bit primary has had random shuffling of chunks for a while, this
implements it for the 32-bit primary. Scudo is currently the only user of
`kRandomShuffleChunks`.
This change consists of a few modifications:
- move the random shuffling functions out of the 64-bit primary to
`sanitizer_common.h`. Alternatively I could move them to
`sanitizer_allocator.h` as they are only used in the allocator; I don't feel
strongly either way;
- small change in the 64-bit primary to make the `rand_state` initialization
`UNLIKELY`;
- addition of a `rand_state` in the 32-bit primary's `SizeClassInfo` and
shuffling of chunks when populating the free list.
- enabling the `random_shuffle.cpp` test on platforms using the 32-bit primary
for Scudo.
Some comments on why the shuffling is done that way. Initially I just
implemented a `Shuffle` function in the `TransferBatch`, which was simpler,
but I came to realize this wasn't good enough: for chunks of 10000 bytes, for
example, with a `CompactSizeClassMap`, a batch holds only 1 chunk, meaning
shuffling the batch has no effect, while a region is usually 1MB, e.g., 104
chunks of that size.
So I decided to "stage" the newly gathered chunks in a temporary array that
would be shuffled prior to placing the chunks in batches.
The result is looping twice through n_chunks even if shuffling is not enabled,
but I didn't notice any significant performance impact.
Reviewers: alekseyshl
Reviewed By: alekseyshl
Subscribers: srhines, llvm-commits, kubamracek
Differential Revision: https://reviews.llvm.org/D39244
llvm-svn: 316596
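As a side note on the staging-and-shuffling described above: the shuffle
itself follows the classic Fisher-Yates pattern. A minimal standalone sketch
(hypothetical `ShuffleSketch` helper with an inlined xorshift step; the real
code uses `RandomShuffle` from sanitizer_common with its own PRNG) could look
like:

template <typename T>
void ShuffleSketch(T *chunks, u32 n, u32 *rand_state) {
  if (n <= 1) return;
  u32 state = *rand_state;  // Assumed non-zero (e.g. seeded from ASLR bits).
  for (u32 i = n - 1; i > 0; i--) {
    // xorshift32 step standing in for the allocator's PRNG.
    state ^= state << 13;
    state ^= state >> 17;
    state ^= state << 5;
    u32 j = state % (i + 1);  // Pick a position in [0, i].
    T tmp = chunks[i];
    chunks[i] = chunks[j];
    chunks[j] = tmp;
  }
  *rand_state = state;
}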
2017-10-26 01:24:56 +08:00
|
|
|
// The random state is initialized from ASLR.
|
|
|
|
region->rand_state = static_cast<u32>(region_beg >> 12);
|
2017-10-27 01:59:24 +08:00
|
|
|
// Postpone the first release to OS attempt for ReleaseToOSIntervalMs,
|
|
|
|
// preventing just allocated memory from being released sooner than
|
|
|
|
// necessary and also preventing extraneous ReleaseMemoryPagesToOS calls
|
|
|
|
// for short lived processes.
|
2017-11-04 07:31:00 +08:00
|
|
|
// Do it only when the feature is turned on, to avoid a potentially
|
|
|
|
// extraneous syscall.
|
|
|
|
if (ReleaseToOSIntervalMs() >= 0)
|
2017-12-14 00:23:54 +08:00
|
|
|
region->rtoi.last_release_at_ns = MonotonicNanoTime();
|
2017-10-27 01:59:24 +08:00
|
|
|
}
|
2016-07-21 06:06:41 +08:00
|
|
|
// Do the mmap for the user memory.
|
2017-12-05 02:56:38 +08:00
|
|
|
const uptr user_map_size =
|
|
|
|
RoundUpTo(total_user_bytes - region->mapped_user, kUserMapSize);
|
|
|
|
if (UNLIKELY(IsRegionExhausted(region, class_id, user_map_size)))
|
|
|
|
return false;
|
2017-06-27 06:54:10 +08:00
|
|
|
if (UNLIKELY(!MapWithCallback(region_beg + region->mapped_user,
|
2017-12-05 02:56:38 +08:00
|
|
|
user_map_size)))
|
2017-06-27 06:54:10 +08:00
|
|
|
return false;
|
2017-12-05 02:56:38 +08:00
|
|
|
stat->Add(AllocatorStatMapped, user_map_size);
|
|
|
|
region->mapped_user += user_map_size;
|
2016-07-21 06:06:41 +08:00
|
|
|
}
|
2017-12-05 02:56:38 +08:00
|
|
|
const uptr new_chunks_count =
|
|
|
|
(region->mapped_user - region->allocated_user) / size;
|
|
|
|
|
|
|
|
if (kMetadataSize) {
|
|
|
|
// Calculate the required space for metadata.
|
|
|
|
const uptr total_meta_bytes =
|
|
|
|
region->allocated_meta + new_chunks_count * kMetadataSize;
|
|
|
|
const uptr meta_map_size = (total_meta_bytes > region->mapped_meta) ?
|
|
|
|
RoundUpTo(total_meta_bytes - region->mapped_meta, kMetaMapSize) : 0;
|
|
|
|
// Map more space for metadata, if necessary.
|
|
|
|
if (meta_map_size) {
|
|
|
|
if (UNLIKELY(IsRegionExhausted(region, class_id, meta_map_size)))
|
|
|
|
return false;
|
|
|
|
if (UNLIKELY(!MapWithCallback(
|
|
|
|
GetMetadataEnd(region_beg) - region->mapped_meta - meta_map_size,
|
|
|
|
meta_map_size)))
|
|
|
|
return false;
|
|
|
|
region->mapped_meta += meta_map_size;
|
2017-06-27 06:54:10 +08:00
|
|
|
}
|
2016-08-25 05:20:10 +08:00
|
|
|
}
|
2017-06-27 06:54:10 +08:00
|
|
|
|
|
|
|
// If necessary, allocate more space for the free array and populate it with
|
|
|
|
// newly allocated chunks.
|
|
|
|
const uptr total_freed_chunks = region->num_freed_chunks + new_chunks_count;
|
|
|
|
if (UNLIKELY(!EnsureFreeArraySpace(region, region_beg, total_freed_chunks)))
|
|
|
|
return false;
|
|
|
|
CompactPtrT *free_array = GetFreeArray(region_beg);
|
2017-12-05 02:56:38 +08:00
|
|
|
for (uptr i = 0, chunk = region->allocated_user; i < new_chunks_count;
|
2017-06-27 06:54:10 +08:00
|
|
|
i++, chunk += size)
|
|
|
|
free_array[total_freed_chunks - 1 - i] = PointerToCompactPtr(0, chunk);
|
2016-08-26 08:06:03 +08:00
|
|
|
if (kRandomShuffleChunks)
|
2017-06-27 06:54:10 +08:00
|
|
|
RandomShuffle(&free_array[region->num_freed_chunks], new_chunks_count,
|
2016-08-26 08:06:03 +08:00
|
|
|
®ion->rand_state);
|
2016-08-25 05:20:10 +08:00
|
|
|
|
2017-06-27 06:54:10 +08:00
|
|
|
// All necessary memory is mapped and now it is safe to advance all
|
|
|
|
// 'allocated_*' counters.
|
|
|
|
region->num_freed_chunks += new_chunks_count;
|
|
|
|
region->allocated_user += new_chunks_count * size;
|
|
|
|
CHECK_LE(region->allocated_user, region->mapped_user);
|
2017-12-05 02:56:38 +08:00
|
|
|
region->allocated_meta += new_chunks_count * kMetadataSize;
|
2016-07-21 06:06:41 +08:00
|
|
|
CHECK_LE(region->allocated_meta, region->mapped_meta);
|
2017-06-27 06:54:10 +08:00
|
|
|
region->exhausted = false;
|
|
|
|
|
2017-09-27 23:38:05 +08:00
|
|
|
// TODO(alekseyshl): Consider bumping last_release_at_ns here to prevent
|
|
|
|
// MaybeReleaseToOS from releasing just allocated pages or protect these
|
|
|
|
// not yet used chunks some other way.
|
|
|
|
|
2017-06-27 06:54:10 +08:00
|
|
|
return true;
|
2016-07-21 06:06:41 +08:00
|
|
|
}
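  // Refill sketch (hypothetical caller, for illustration only; the actual
  // fast path lives elsewhere in this class): under region->mutex, top up
  // the free array when a batch of n_chunks is requested and the array runs
  // dry.
  //   BlockingMutexLock l(&region->mutex);
  //   if (UNLIKELY(region->num_freed_chunks < n_chunks)) {
  //     if (UNLIKELY(!PopulateFreeArray(stat, class_id, region,
  //                                     n_chunks - region->num_freed_chunks)))
  //       return false;  // The region is exhausted or mmap failed.
  //   }
  //   region->num_freed_chunks -= n_chunks;
  //   // ... then copy the last n_chunks compact pointers out of the array.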
|
2016-08-27 07:58:42 +08:00
|
|
|
|
2017-09-27 23:38:05 +08:00
|
|
|
class MemoryMapper {
|
|
|
|
public:
|
|
|
|
MemoryMapper(const ThisT& base_allocator, uptr class_id)
|
|
|
|
: allocator(base_allocator),
|
|
|
|
region_base(base_allocator.GetRegionBeginBySizeClass(class_id)),
|
2017-10-14 02:38:10 +08:00
|
|
|
released_ranges_count(0),
|
|
|
|
released_bytes(0) {
|
2017-09-27 23:38:05 +08:00
|
|
|
}
|
2016-08-27 07:58:42 +08:00
|
|
|
|
2017-09-27 23:38:05 +08:00
|
|
|
uptr GetReleasedRangesCount() const {
|
|
|
|
return released_ranges_count;
|
|
|
|
}
|
|
|
|
|
2017-10-14 02:38:10 +08:00
|
|
|
uptr GetReleasedBytes() const {
|
|
|
|
return released_bytes;
|
|
|
|
}
|
|
|
|
|
2017-09-27 23:38:05 +08:00
|
|
|
uptr MapPackedCounterArrayBuffer(uptr buffer_size) {
|
|
|
|
// TODO(alekseyshl): The idea to explore is to check if we have enough
|
|
|
|
// space between num_freed_chunks*sizeof(CompactPtrT) and
|
|
|
|
// mapped_free_array to fit buffer_size bytes and use that space instead
|
|
|
|
// of mapping a temporary one.
|
|
|
|
return reinterpret_cast<uptr>(
|
|
|
|
MmapOrDieOnFatalError(buffer_size, "ReleaseToOSPageCounters"));
|
|
|
|
}
|
|
|
|
|
|
|
|
void UnmapPackedCounterArrayBuffer(uptr buffer, uptr buffer_size) {
|
|
|
|
UnmapOrDie(reinterpret_cast<void *>(buffer), buffer_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Releases [from, to) range of pages back to OS.
|
|
|
|
void ReleasePageRangeToOS(CompactPtrT from, CompactPtrT to) {
|
2017-10-14 02:38:10 +08:00
|
|
|
const uptr from_page = allocator.CompactPtrToPointer(region_base, from);
|
|
|
|
const uptr to_page = allocator.CompactPtrToPointer(region_base, to);
|
|
|
|
ReleaseMemoryPagesToOS(from_page, to_page);
|
2017-09-27 23:38:05 +08:00
|
|
|
released_ranges_count++;
|
2017-10-14 02:38:10 +08:00
|
|
|
released_bytes += to_page - from_page;
|
2017-09-27 23:38:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
const ThisT& allocator;
|
|
|
|
const uptr region_base;
|
|
|
|
uptr released_ranges_count;
|
2017-10-14 02:38:10 +08:00
|
|
|
uptr released_bytes;
|
2017-09-27 23:38:05 +08:00
|
|
|
};
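  // Any type providing the same hooks as above (buffer map/unmap for the
  // packed page counters and ReleasePageRangeToOS) could presumably be
  // passed to ReleaseFreeMemoryToOS instead; a hypothetical recording-only
  // mapper for tests might look like:
  //   class CountingMemoryMapper {
  //    public:
  //     uptr MapPackedCounterArrayBuffer(uptr size) {
  //       return reinterpret_cast<uptr>(MmapOrDie(size, "counters"));
  //     }
  //     void UnmapPackedCounterArrayBuffer(uptr buffer, uptr size) {
  //       UnmapOrDie(reinterpret_cast<void *>(buffer), size);
  //     }
  //     void ReleasePageRangeToOS(CompactPtrT from, CompactPtrT to) {
  //       ranges++;  // Only count; do not actually release anything.
  //     }
  //     uptr ranges = 0;
  //   };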
|
|
|
|
|
|
|
|
// Attempts to release RAM occupied by freed chunks back to OS. The region is
|
|
|
|
// expected to be locked.
|
2017-10-24 01:12:07 +08:00
|
|
|
void MaybeReleaseToOS(uptr class_id, bool force) {
|
2016-08-27 07:58:42 +08:00
|
|
|
RegionInfo *region = GetRegionInfo(class_id);
|
2016-11-29 08:22:50 +08:00
|
|
|
const uptr chunk_size = ClassIdToSize(class_id);
|
|
|
|
const uptr page_size = GetPageSizeCached();
|
|
|
|
|
2016-08-27 07:58:42 +08:00
|
|
|
uptr n = region->num_freed_chunks;
|
2016-11-29 08:22:50 +08:00
|
|
|
if (n * chunk_size < page_size)
|
|
|
|
return; // No chance to release anything.
|
2017-06-27 06:54:10 +08:00
|
|
|
if ((region->stats.n_freed -
|
|
|
|
region->rtoi.n_freed_at_last_release) * chunk_size < page_size) {
|
2016-08-27 07:58:42 +08:00
|
|
|
return; // Nothing new to release.
|
2016-11-29 08:22:50 +08:00
|
|
|
}
|
|
|
|
|
2017-10-24 01:12:07 +08:00
|
|
|
if (!force) {
|
|
|
|
s32 interval_ms = ReleaseToOSIntervalMs();
|
|
|
|
if (interval_ms < 0)
|
|
|
|
return;
|
2016-11-29 08:22:50 +08:00
|
|
|
|
2017-10-24 01:12:07 +08:00
|
|
|
if (region->rtoi.last_release_at_ns + interval_ms * 1000000ULL >
|
2017-12-14 00:23:54 +08:00
|
|
|
MonotonicNanoTime()) {
|
2017-10-24 01:12:07 +08:00
|
|
|
return; // Memory was returned recently.
|
|
|
|
}
|
|
|
|
}
|
2016-11-29 08:22:50 +08:00
|
|
|
|
2017-09-27 23:38:05 +08:00
|
|
|
MemoryMapper memory_mapper(*this, class_id);
|
|
|
|
|
|
|
|
ReleaseFreeMemoryToOS<MemoryMapper>(
|
|
|
|
GetFreeArray(GetRegionBeginBySizeClass(class_id)), n, chunk_size,
|
|
|
|
RoundUpTo(region->allocated_user, page_size) / page_size,
|
|
|
|
&memory_mapper);
|
|
|
|
|
|
|
|
if (memory_mapper.GetReleasedRangesCount() > 0) {
|
|
|
|
region->rtoi.n_freed_at_last_release = region->stats.n_freed;
|
|
|
|
region->rtoi.num_releases += memory_mapper.GetReleasedRangesCount();
|
2017-10-14 02:38:10 +08:00
|
|
|
region->rtoi.last_released_bytes = memory_mapper.GetReleasedBytes();
|
2016-08-27 07:58:42 +08:00
|
|
|
}
|
2017-12-14 00:23:54 +08:00
|
|
|
region->rtoi.last_release_at_ns = MonotonicNanoTime();
|
2016-08-27 07:58:42 +08:00
|
|
|
}
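  // Driver sketch (hypothetical, for illustration; the real public entry
  // points live elsewhere in this class): a forced release walks all size
  // classes and calls MaybeReleaseToOS with force == true under each
  // region's mutex, bypassing the ReleaseToOSIntervalMs throttling above.
  //   for (uptr class_id = 1; class_id < kNumClasses; class_id++) {
  //     BlockingMutexLock l(&GetRegionInfo(class_id)->mutex);
  //     MaybeReleaseToOS(class_id, true /* force */);
  //   }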
|
2016-07-21 06:06:41 +08:00
|
|
|
};
|