305 lines
8.7 KiB
C++
305 lines
8.7 KiB
C++
/*
|
|
* FastAlloc.h
|
|
*
|
|
* This source file is part of the FoundationDB open source project
|
|
*
|
|
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#ifndef FLOW_FASTALLOC_H
|
|
#define FLOW_FASTALLOC_H
|
|
#pragma once
|
|
|
|
#include "flow/Error.h"
|
|
#include "flow/Platform.h"
|
|
#include "flow/config.h"
|
|
|
|
// ALLOC_INSTRUMENTATION_STDOUT enables non-sampled logging of all allocations and deallocations to stdout to be
|
|
// processed by tools/alloc_instrumentation.py
|
|
//#define ALLOC_INSTRUMENTATION_STDOUT ENABLED(NOT_IN_CLEAN)
|
|
|
|
//#define ALLOC_INSTRUMENTATION ENABLED(NOT_IN_CLEAN)
|
|
// The form "(1==1)" in this context is used to satisfy both clang and vc++ with a single syntax. Clang rejects "1" and
|
|
// vc++ rejects "true".
|
|
#define FASTALLOC_THREAD_SAFE (FLOW_THREAD_SAFE || (1 == 1))
|
|
|
|
#if VALGRIND
|
|
#include <drd.h>
|
|
#include <memcheck.h>
|
|
bool valgrindPrecise();
|
|
#endif
|
|
|
|
#include "flow/Hash3.h"
|
|
|
|
#include <assert.h>
|
|
#include <atomic>
|
|
#include <vector>
|
|
#include <cstdlib>
|
|
#include <cstdio>
|
|
#include <unordered_map>
|
|
|
|
#if defined(ALLOC_INSTRUMENTATION) && defined(__linux__)
|
|
#include <execinfo.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
#ifdef ALLOC_INSTRUMENTATION
|
|
#include <map>
|
|
#include <algorithm>
|
|
#include "flow/ThreadPrimitives.h"
|
|
// Per-allocation-site counters kept in the global allocInstr map (keyed by type
// name). Members are intentionally left without initializers: entries are
// value-initialized (zeroed) when default-inserted via std::map::operator[].
struct AllocInstrInfo {
	int64_t allocCount; // total number of allocations recorded
	int64_t deallocCount; // total number of deallocations recorded
	int64_t maxAllocated; // high-water mark of live objects (allocCount - deallocCount)

	// Record `count` allocations and refresh the live-object high-water mark.
	inline void alloc(int64_t count = 1) {
		allocCount += count;
		const int64_t live = allocCount - deallocCount;
		if (live > maxAllocated)
			maxAllocated = live;
	}

	// Record `count` deallocations.
	inline void dealloc(int64_t count = 1) { deallocCount += count; }
};
|
|
extern std::map<const char*, AllocInstrInfo> allocInstr;
|
|
#define INSTRUMENT_ALLOCATE(name) (allocInstr[(name)].alloc())
|
|
#define INSTRUMENT_RELEASE(name) (allocInstr[(name)].dealloc())
|
|
|
|
// extern std::map<uint32_t, uint64_t> stackAllocations;
|
|
|
|
// maps from an address to the hash of the backtrace and the size of the allocation
|
|
extern std::unordered_map<int64_t, std::pair<uint32_t, size_t>> memSample;
|
|
|
|
// Aggregated allocation statistics for one unique backtrace (keyed by the
// backtrace hash in backTraceLookup below).
struct BackTraceAccount {
	// Estimated allocation count; double presumably because sampled counts are
	// extrapolated -- confirm in the .cpp.
	double count;
	// Number of allocations actually sampled for this backtrace.
	size_t sampleCount;
	// Total bytes currently allocated from this stack.
	size_t totalSize;
	// Raw return addresses of the backtrace. Raw pointer: ownership/lifetime is
	// managed elsewhere -- NOTE(review): confirm who frees this.
	std::vector<void*>* backTrace;
};
|
|
// maps from a hash of a backtrace to a backtrace and the total size of data currently allocated from this stack
|
|
extern std::unordered_map<uint32_t, BackTraceAccount> backTraceLookup;
|
|
|
|
extern ThreadSpinLock memLock;
|
|
extern thread_local bool memSample_entered;
|
|
extern const size_t SAMPLE_BYTES;
|
|
|
|
#else
|
|
#define INSTRUMENT_ALLOCATE(name)
|
|
#define INSTRUMENT_RELEASE(name)
|
|
#endif
|
|
|
|
#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
|
|
void recordAllocation(void* ptr, size_t size);
|
|
void recordDeallocation(void* ptr);
|
|
#endif
|
|
|
|
// Bytes per "magazine" (128 KiB) -- the unit in which blocks are exchanged
// between a thread's local freelist and the shared global pool (see
// FastAllocator::magazine_size / getMagazine / releaseMagazine).
inline constexpr auto kFastAllocMagazineBytes = 128 << 10;
|
|
|
|
// Pooled allocator for fixed-size blocks of exactly `Size` bytes. Each thread
// keeps a local freelist and trades whole magazines with a shared global pool;
// most methods are defined out of line in the .cpp.
template <int Size>
class FastAllocator {
public:
	// Returns a block of `Size` bytes from the pool.
	[[nodiscard]] static void* allocate();
	// Returns a block previously obtained from allocate() to the pool.
	static void release(void* ptr);
	// Debug/validation hook; semantics defined out of line -- see the .cpp.
	static void check(void* ptr, bool alloc);

	static long long getTotalMemory();
	static long long getApproximateMemoryUnused();
	static long long getActiveThreads();

#ifdef ALLOC_INSTRUMENTATION
	static volatile int32_t pageCount;
#endif

	// Static-only interface; never instantiated.
	FastAllocator() = delete;

private:
#ifdef VALGRIND
	// Token used for the ANNOTATE_RWLOCK_* race-detector annotations below.
	static unsigned long vLock;
#endif

	// Number of Size-byte blocks in one magazine.
	static const int magazine_size = kFastAllocMagazineBytes / Size;
	// Block size in pointer-sized words. NOTE(review): assumes Size is a
	// multiple of sizeof(void*).
	static const int PSize = Size / sizeof(void*);
	struct GlobalData;
	struct ThreadData {
		void* freelist;
		int count; // there are count items on freelist
		void* alternate; // alternate is either a full magazine, or an empty one
		ThreadData();
		~ThreadData();
	};
	struct ThreadDataInit {
		ThreadDataInit() { threadData(); }
	};
	// Used to try to initialize threadData as early as possible. It's still
	// possible that a static thread local variable (that owns fast-allocated
	// memory) could be constructed before threadData, in which case threadData
	// would be destroyed by the time that variable's destructor attempts to free.
	// This is undefined behavior if this happens, which is why we want to
	// initialize threadData as early as possible.
	static thread_local ThreadDataInit threadDataInit;
	// Used to access threadData. Returning a reference to a function-level
	// static guarantees that threadData will be constructed before it's
	// accessed here. Furthermore, if accessing threadData from a static thread
	// local variable's constructor, this guarantees that threadData will
	// outlive this object, since destruction order is the reverse of
	// construction order.
	static ThreadData& threadData() noexcept;
	// Lazily constructs the process-wide pool state shared by all threads.
	static GlobalData* globalData() noexcept {
#ifdef VALGRIND
		// Tell the race detector the magic-static init below is synchronized.
		ANNOTATE_RWLOCK_ACQUIRED(vLock, 1);
#endif
		static GlobalData* data = new GlobalData(); // This is thread-safe as of c++11 (VS 2015, gcc 4.8, clang 3.3)

#ifdef VALGRIND
		ANNOTATE_RWLOCK_RELEASED(vLock, 1);
#endif

		return data;
	}
	// NOTE(review): purpose of this static (vs. the per-thread freelist in
	// ThreadData) is defined in the .cpp -- confirm there.
	static void* freelist;

	// Fetch a fresh magazine from / return a magazine to the global pool.
	static void getMagazine();
	static void releaseMagazine(void*);
};
|
|
|
|
// Running byte count for "huge arena" allocations -- presumably maintained by
// the allocation paths in the .cpp; confirm exact accounting there.
extern std::atomic<int64_t> g_hugeArenaMemory;
// Records a sample for a huge-arena allocation of `size` bytes (see the .cpp).
void hugeArenaSample(int size);
// Returns all threads' cached magazines to the global pool.
void releaseAllThreadMagazines();
// Total bytes held in pools but not currently handed out to callers.
int64_t getTotalUnusedAllocatedMemory();
|
|
|
|
// Rounds a requested size up to the smallest allocation bucket that holds it.
// Valid for 0 < x <= 16384 (enforced by assert); buckets match the
// FastAllocator instantiations used throughout this header.
inline constexpr int nextFastAllocatedSize(int x) {
	assert(x > 0 && x <= 16384);
	constexpr int buckets[] = { 16, 32, 64, 96, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 };
	for (int b : buckets) {
		if (x <= b)
			return b;
	}
	return 16384; // unreachable while the assert above holds
}
|
|
|
|
template <class Object>
|
|
class FastAllocated {
|
|
public:
|
|
[[nodiscard]] static void* operator new(size_t s) {
|
|
if (s != sizeof(Object))
|
|
abort();
|
|
INSTRUMENT_ALLOCATE(typeid(Object).name());
|
|
|
|
if constexpr (sizeof(Object) <= 256) {
|
|
void* p = FastAllocator < sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object)) > ::allocate();
|
|
return p;
|
|
} else {
|
|
void* p = new uint8_t[nextFastAllocatedSize(sizeof(Object))];
|
|
return p;
|
|
}
|
|
}
|
|
|
|
static void operator delete(void* s) {
|
|
INSTRUMENT_RELEASE(typeid(Object).name());
|
|
|
|
if constexpr (sizeof(Object) <= 256) {
|
|
FastAllocator<sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object))>::release(s);
|
|
} else {
|
|
delete[] reinterpret_cast<uint8_t*>(s);
|
|
}
|
|
}
|
|
// Redefine placement new so you can still use it
|
|
static void* operator new(size_t, void* p) { return p; }
|
|
static void operator delete(void*, void*) {}
|
|
};
|
|
|
|
[[nodiscard]] inline void* allocateFast(int size) {
|
|
if (size <= 16)
|
|
return FastAllocator<16>::allocate();
|
|
if (size <= 32)
|
|
return FastAllocator<32>::allocate();
|
|
if (size <= 64)
|
|
return FastAllocator<64>::allocate();
|
|
if (size <= 96)
|
|
return FastAllocator<96>::allocate();
|
|
if (size <= 128)
|
|
return FastAllocator<128>::allocate();
|
|
if (size <= 256)
|
|
return FastAllocator<256>::allocate();
|
|
return new uint8_t[size];
|
|
}
|
|
|
|
inline void freeFast(int size, void* ptr) {
|
|
if (size <= 16)
|
|
return FastAllocator<16>::release(ptr);
|
|
if (size <= 32)
|
|
return FastAllocator<32>::release(ptr);
|
|
if (size <= 64)
|
|
return FastAllocator<64>::release(ptr);
|
|
if (size <= 96)
|
|
return FastAllocator<96>::release(ptr);
|
|
if (size <= 128)
|
|
return FastAllocator<128>::release(ptr);
|
|
if (size <= 256)
|
|
return FastAllocator<256>::release(ptr);
|
|
delete[](uint8_t*) ptr;
|
|
}
|
|
|
|
// Allocate a block of memory aligned to 4096 bytes. Size must be a multiple of
|
|
// 4096. Guaranteed not to return null. Use freeFast4kAligned to free.
|
|
[[nodiscard]] inline void* allocateFast4kAligned(int size) {
|
|
#if !defined(USE_JEMALLOC)
|
|
// Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc
|
|
if (size <= 4096)
|
|
return FastAllocator<4096>::allocate();
|
|
if (size <= 8192)
|
|
return FastAllocator<8192>::allocate();
|
|
if (size <= 16384)
|
|
return FastAllocator<16384>::allocate();
|
|
#endif
|
|
auto* result = aligned_alloc(4096, size);
|
|
if (result == nullptr) {
|
|
platform::outOfMemory();
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// Free a pointer returned from allocateFast4kAligned(size)
|
|
inline void freeFast4kAligned(int size, void* ptr) {
|
|
#if !defined(USE_JEMALLOC)
|
|
// Sizes supported by FastAllocator must be release via FastAllocator
|
|
if (size <= 4096)
|
|
return FastAllocator<4096>::release(ptr);
|
|
if (size <= 8192)
|
|
return FastAllocator<8192>::release(ptr);
|
|
if (size <= 16384)
|
|
return FastAllocator<16384>::release(ptr);
|
|
#endif
|
|
aligned_free(ptr);
|
|
}
|
|
|
|
#endif
|