foundationdb/flow/FastAlloc.h

313 lines
8.9 KiB
C++

/*
* FastAlloc.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FLOW_FASTALLOC_H
#define FLOW_FASTALLOC_H
#pragma once
#include "flow/Error.h"
#include "flow/Platform.h"
#include "flow/config.h"
// ALLOC_INSTRUMENTATION_STDOUT enables non-sampled logging of all allocations and deallocations to stdout to be
// processed by tools/alloc_instrumentation.py
//#define ALLOC_INSTRUMENTATION_STDOUT ENABLED(NOT_IN_CLEAN)
//#define ALLOC_INSTRUMENTATION ENABLED(NOT_IN_CLEAN)
// The form "(1==1)" in this context is used to satisfy both clang and vc++ with a single syntax. Clang rejects "1" and
// vc++ rejects "true".
#define FASTALLOC_THREAD_SAFE (FLOW_THREAD_SAFE || (1 == 1))
#if VALGRIND
#include <drd.h>
#include <memcheck.h>
bool valgrindPrecise();
#endif
#include "flow/Hash3.h"
#include <assert.h>
#include <atomic>
#include <vector>
#include <cstdlib>
#include <cstdio>
#include <unordered_map>
#if defined(ALLOC_INSTRUMENTATION) && defined(__linux__)
#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#endif
#ifdef ALLOC_INSTRUMENTATION
#include <map>
#include <algorithm>
#include "flow/ThreadPrimitives.h"
// Running allocation counters for one instrumented name (the mapped type of allocInstr).
// The struct has no constructor, so the counters start at whatever value the
// instance was initialized with (zero when value-initialized).
struct AllocInstrInfo {
    int64_t allocCount; // sum of every `count` passed to alloc()
    int64_t deallocCount; // sum of every `count` passed to dealloc()
    int64_t maxAllocated; // largest (allocCount - deallocCount) seen at the end of an alloc() call

    // Records `count` allocations and raises the high-water mark if the
    // number of outstanding allocations now exceeds it.
    inline void alloc(int64_t count = 1) {
        allocCount += count;
        const int64_t outstanding = allocCount - deallocCount;
        if (maxAllocated < outstanding) {
            maxAllocated = outstanding;
        }
    }

    // Records `count` deallocations. The high-water mark is never lowered.
    inline void dealloc(int64_t count = 1) {
        deallocCount += count;
    }
};
extern std::map<const char*, AllocInstrInfo> allocInstr;
#define INSTRUMENT_ALLOCATE(name) (allocInstr[(name)].alloc())
#define INSTRUMENT_RELEASE(name) (allocInstr[(name)].dealloc())
// extern std::map<uint32_t, uint64_t> stackAllocations;
// maps from an address to the hash of the backtrace and the size of the allocation
extern std::unordered_map<int64_t, std::pair<uint32_t, size_t>> memSample;
// Accounting record kept per distinct backtrace; this is the mapped type of
// backTraceLookup, which is keyed by a hash of the backtrace.
struct BackTraceAccount {
double count; // NOTE(review): meaning and units are not evident from this header — confirm against the code that updates it
size_t sampleCount; // presumably the number of samples attributed to this backtrace — TODO confirm
size_t totalSize; // total size of data currently allocated from this stack (per the backTraceLookup comment)
std::vector<void*>* backTrace; // the backtrace itself, held by raw pointer; ownership is not evident from this header
};
// maps from a hash of a backtrace to a backtrace and the total size of data currently allocated from this stack
extern std::unordered_map<uint32_t, BackTraceAccount> backTraceLookup;
extern ThreadSpinLock memLock;
extern thread_local bool memSample_entered;
extern const size_t SAMPLE_BYTES;
#else
#define INSTRUMENT_ALLOCATE(name)
#define INSTRUMENT_RELEASE(name)
#endif
#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
void recordAllocation(void* ptr, size_t size);
void recordDeallocation(void* ptr);
#endif
// Byte budget of a single magazine (128 KiB). FastAllocator<Size> divides this
// by Size to obtain its magazine_size.
inline constexpr auto kFastAllocMagazineBytes = 128 * 1024;
// Static-only allocator interface parameterized on a compile-time block size.
// The class cannot be instantiated (its constructor is deleted); everything is
// reached through static members. Only globalData() is defined in this header;
// the remaining members are declared here and defined elsewhere.
//
// NOTE(review): the VALGRIND sections in this class are guarded with
// `#ifdef VALGRIND`, while the valgrind headers near the top of this file are
// included under `#if VALGRIND`. If VALGRIND were ever defined as 0 the two
// guards would disagree — confirm how the build defines it.
template <int Size>
class FastAllocator {
public:
// Returns a pointer to a block; the result must not be discarded.
// Presumably the block is Size bytes — the definition is not in this header.
[[nodiscard]] static void* allocate();
// Hands a block back to the allocator. Presumably ptr must have come from
// allocate() of the same Size — confirm against the definition.
static void release(void* ptr);
// NOTE(review): purpose of check() is not evident from this header.
static void check(void* ptr, bool alloc);
// Statistics accessors; exact semantics are defined with their implementations.
static long long getTotalMemory();
static long long getApproximateMemoryUnused();
static long long getActiveThreads();
#ifdef ALLOC_INSTRUMENTATION
// Only present in instrumented builds.
static volatile int32_t pageCount;
#endif
FastAllocator() = delete;
private:
#ifdef VALGRIND
// Passed to the ANNOTATE_RWLOCK_* macros in globalData().
static unsigned long vLock;
#endif
// Number of Size-byte blocks that fit in kFastAllocMagazineBytes (integer division).
static const int magazine_size = kFastAllocMagazineBytes / Size;
// Size expressed in units of sizeof(void*) (integer division).
static const int PSize = Size / sizeof(void*);
struct GlobalData;
// Per-thread allocator state (accessed via threadData()).
struct ThreadData {
void* freelist;
int count; // there are count items on freelist
void* alternate; // alternate is either a full magazine, or an empty one
ThreadData();
~ThreadData();
};
// Helper whose constructor does nothing but call threadData(), forcing the
// thread's ThreadData to be constructed when this object is constructed.
struct ThreadDataInit {
ThreadDataInit() { threadData(); }
};
// Used to try to initialize threadData as early as possible. It's still
// possible that a static thread local variable (that owns fast-allocated
// memory) could be constructed before threadData, in which case threadData
// would be destroyed by the time that variable's destructor attempts to free.
// This is undefined behavior if this happens, which is why we want to
// initialize threadData as early as possible.
static thread_local ThreadDataInit threadDataInit;
// Used to access threadData. Returning a reference to a function-level
// static guarantees that threadData will be constructed before it's
// accessed here. Furthermore, if accessing threadData from a static thread
// local variable's constructor, this guarantees that threadData will
// outlive this object, since destruction order is the reverse of
// construction order.
static ThreadData& threadData() noexcept;
// Returns the GlobalData instance for this Size. It is created with `new` on
// the first call and the same pointer is returned on every later call;
// nothing in this function deletes it. Under VALGRIND the initialization is
// bracketed by RWLOCK acquire/release annotations on vLock.
static GlobalData* globalData() noexcept {
#ifdef VALGRIND
ANNOTATE_RWLOCK_ACQUIRED(vLock, 1);
#endif
static GlobalData* data = new GlobalData(); // This is thread-safe as of c++11 (VS 2015, gcc 4.8, clang 3.3)
#ifdef VALGRIND
ANNOTATE_RWLOCK_RELEASED(vLock, 1);
#endif
return data;
}
// NOTE(review): a class-level static, distinct from ThreadData::freelist;
// its role is not evident from this header.
static void* freelist;
// Magazine helpers; behavior is defined with their implementations.
static void getMagazine();
static void releaseMagazine(void*);
};
extern std::atomic<int64_t> g_hugeArenaMemory;
void hugeArenaSample(int size);
void releaseAllThreadMagazines();
int64_t getTotalUnusedAllocatedMemory();
// Rounds a requested byte count up to the block size of the size class that
// serves it.
//
// The size classes are 16, 32, 64, 96, 128, 256, 512, 1024, 2048, 4096, 8192
// and 16384 bytes, i.e. the same set that allocateFast() and freeFast()
// dispatch on. Previously this function stopped at 8192, so with asserts
// compiled out a request in (8192, 16384] was silently mapped to the smaller
// 8192-byte class.
//
// x: requested size in bytes; must satisfy 0 < x <= 16384. The precondition is
//    enforced with assert(), so it is only checked when NDEBUG is not defined
//    (and makes the call a non-constant expression when violated in a
//    constant-evaluated context).
// Returns: the smallest size class that is >= x.
inline constexpr int nextFastAllocatedSize(int x) {
    assert(x > 0 && x <= 16384);
    if (x <= 16)
        return 16;
    else if (x <= 32)
        return 32;
    else if (x <= 64)
        return 64;
    else if (x <= 96)
        return 96;
    else if (x <= 128)
        return 128;
    else if (x <= 256)
        return 256;
    else if (x <= 512)
        return 512;
    else if (x <= 1024)
        return 1024;
    else if (x <= 2048)
        return 2048;
    else if (x <= 4096)
        return 4096;
    else if (x <= 8192)
        return 8192;
    else
        return 16384;
}
// Supplies class-level operator new/delete that route storage for Object
// through FastAllocator. Presumably intended to be used as a base class of
// Object itself (operator new aborts unless the requested size equals
// sizeof(Object)) — confirm against callers.
//
// The size class is 64 bytes for any Object of 64 bytes or less, otherwise
// nextFastAllocatedSize(sizeof(Object)).
template <class Object>
class FastAllocated {
public:
    // Allocates storage for exactly one Object; aborts the process if asked
    // for any other size (e.g. for a larger derived type).
    [[nodiscard]] static void* operator new(size_t s) {
        if (s != sizeof(Object)) {
            abort();
        }
        INSTRUMENT_ALLOCATE(typeid(Object).name());
        constexpr int blockSize = sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object));
        return FastAllocator<blockSize>::allocate();
    }

    // Returns storage to the same size class that operator new drew it from.
    static void operator delete(void* s) {
        INSTRUMENT_RELEASE(typeid(Object).name());
        constexpr int blockSize = sizeof(Object) <= 64 ? 64 : nextFastAllocatedSize(sizeof(Object));
        FastAllocator<blockSize>::release(s);
    }

    // Redefine placement new so you can still use it: declaring the
    // operators above hides the global placement forms.
    static void* operator new(size_t, void* p) {
        return p;
    }
    static void operator delete(void*, void*) {}
};
// Allocates a block for a run-time `size` by dispatching to the smallest
// FastAllocator size class that is >= size (16 ... 16384 bytes). Requests
// larger than 16384 bytes fall back to `new uint8_t[size]`.
//
// The matching deallocation is freeFast(), which selects the size class from
// the same thresholds, so it must be given the same `size`.
[[nodiscard]] inline void* allocateFast(int size) {
    if (size <= 16) {
        return FastAllocator<16>::allocate();
    } else if (size <= 32) {
        return FastAllocator<32>::allocate();
    } else if (size <= 64) {
        return FastAllocator<64>::allocate();
    } else if (size <= 96) {
        return FastAllocator<96>::allocate();
    } else if (size <= 128) {
        return FastAllocator<128>::allocate();
    } else if (size <= 256) {
        return FastAllocator<256>::allocate();
    } else if (size <= 512) {
        return FastAllocator<512>::allocate();
    } else if (size <= 1024) {
        return FastAllocator<1024>::allocate();
    } else if (size <= 2048) {
        return FastAllocator<2048>::allocate();
    } else if (size <= 4096) {
        return FastAllocator<4096>::allocate();
    } else if (size <= 8192) {
        return FastAllocator<8192>::allocate();
    } else if (size <= 16384) {
        return FastAllocator<16384>::allocate();
    }
    // Too large for any size class.
    return new uint8_t[size];
}
// Releases a block obtained from allocateFast(). `size` selects the size class
// using the same thresholds as allocateFast(), so it must be the size that was
// passed when the block was allocated. Blocks larger than 16384 bytes are
// released with `delete[]`, mirroring the `new uint8_t[size]` fallback.
inline void freeFast(int size, void* ptr) {
    if (size <= 16) {
        FastAllocator<16>::release(ptr);
    } else if (size <= 32) {
        FastAllocator<32>::release(ptr);
    } else if (size <= 64) {
        FastAllocator<64>::release(ptr);
    } else if (size <= 96) {
        FastAllocator<96>::release(ptr);
    } else if (size <= 128) {
        FastAllocator<128>::release(ptr);
    } else if (size <= 256) {
        FastAllocator<256>::release(ptr);
    } else if (size <= 512) {
        FastAllocator<512>::release(ptr);
    } else if (size <= 1024) {
        FastAllocator<1024>::release(ptr);
    } else if (size <= 2048) {
        FastAllocator<2048>::release(ptr);
    } else if (size <= 4096) {
        FastAllocator<4096>::release(ptr);
    } else if (size <= 8192) {
        FastAllocator<8192>::release(ptr);
    } else if (size <= 16384) {
        FastAllocator<16384>::release(ptr);
    } else {
        delete[] static_cast<uint8_t*>(ptr);
    }
}
// Allocates a block for `size` bytes intended for 4096-byte-aligned use.
//
// Unless USE_JEMALLOC is defined, requests of up to 16384 bytes are served by
// the 4096/8192/16384 FastAllocator size classes (presumably those blocks are
// 4096-aligned — not verifiable from this header). Everything else goes to
// aligned_alloc(4096, size); a null result is reported through
// platform::outOfMemory().
//
// Release the block with freeFast4kAligned() using the same `size`.
[[nodiscard]] inline void* allocateFast4kAligned(int size) {
#if !defined(USE_JEMALLOC)
    // Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc
    if (size <= 4096) {
        return FastAllocator<4096>::allocate();
    } else if (size <= 8192) {
        return FastAllocator<8192>::allocate();
    } else if (size <= 16384) {
        return FastAllocator<16384>::allocate();
    }
#endif
    void* const block = aligned_alloc(4096, size);
    if (!block) {
        platform::outOfMemory();
    }
    return block;
}
// Releases a block obtained from allocateFast4kAligned(). `size` must be the
// size passed at allocation time, because it decides whether the block goes
// back to a FastAllocator size class or to aligned_free().
inline void freeFast4kAligned(int size, void* ptr) {
#if !defined(USE_JEMALLOC)
    // Sizes supported by FastAllocator must be released via FastAllocator
    if (size <= 4096) {
        FastAllocator<4096>::release(ptr);
        return;
    }
    if (size <= 8192) {
        FastAllocator<8192>::release(ptr);
        return;
    }
    if (size <= 16384) {
        FastAllocator<16384>::release(ptr);
        return;
    }
#endif
    aligned_free(ptr);
}
#endif