foundationdb/fdbserver/VersionedBTree.actor.cpp

11464 lines
425 KiB
C++

/*
* VersionedBTree.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Tuple.h"
#include "fdbrpc/ContinuousSample.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/DeltaTree.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/IPager.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
#include "flow/Error.h"
#include "flow/FastRef.h"
#include "flow/flow.h"
#include "flow/genericactors.actor.h"
#include "flow/Histogram.h"
#include "flow/IAsyncFile.h"
#include "flow/IRandom.h"
#include "flow/Knobs.h"
#include "flow/ObjectSerializer.h"
#include "flow/PriorityMultiLock.actor.h"
#include "flow/serialize.h"
#include "flow/Trace.h"
#include "flow/UnitTest.h"
#include "fmt/format.h"
#include <boost/intrusive/list.hpp>
#include <cinttypes>
#include <limits>
#include <map>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#include "flow/actorcompiler.h" // must be last include
#define REDWOOD_DEBUG 0
// Only print debug info for a specific address
static NetworkAddress g_debugAddress = NetworkAddress::parse("0.0.0.0:0");
// Only print debug info after a specific time
static double g_debugStart = 0;
// Debug output stream
static FILE* g_debugStream = stdout;
#define debug_printf_always(...) \
if (now() >= g_debugStart && (!g_network->getLocalAddress().isValid() || !g_debugAddress.isValid() || \
g_network->getLocalAddress() == g_debugAddress)) { \
std::string prefix = format("%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \
std::string msg = format(__VA_ARGS__); \
fputs(addPrefix(prefix, msg).c_str(), g_debugStream); \
fflush(g_debugStream); \
}
#define debug_print(str) debug_printf("%s\n", str.c_str())
#define debug_print_always(str) debug_printf_always("%s\n", str.c_str())
#define debug_printf_noop(...)
#if defined(NO_INTELLISENSE)
#if REDWOOD_DEBUG
#define debug_printf debug_printf_always
#else
#define debug_printf debug_printf_noop
#endif
#else
// To get error-checking on debug_printf statements in IDE
#define debug_printf printf
#endif
#define BEACON debug_printf_always("HERE\n")
#define TRACE \
debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str());
// Returns a string where every line in lines is prefixed with prefix
std::string addPrefix(std::string prefix, std::string lines) {
StringRef m = lines;
std::string s;
while (m.size() != 0) {
StringRef line = m.eat("\n");
s += prefix;
s += ' ';
s += line.toString();
s += '\n';
}
return s;
}
// Some convenience functions for debugging to stringify various structures
// Classes can add compatibility by either specializing toString<T> or implementing
// std::string toString() const;
template <typename T>
std::string toString(const T& o) {
return o.toString();
}
std::string toString(const std::string& s) {
return s;
}
std::string toString(StringRef s) {
return s.printable();
}
std::string toString(LogicalPageID id) {
if (id == invalidLogicalPageID) {
return "LogicalPageID{invalid}";
}
return format("LogicalPageID{%u}", id);
}
std::string toString(Version v) {
if (v == invalidVersion) {
return "invalidVersion";
}
return format("@%" PRId64, v);
}
std::string toString(bool b) {
return b ? "true" : "false";
}
template <typename T>
std::string toString(const Standalone<T>& s) {
return toString((T)s);
}
template <typename T>
std::string toString(const T* begin, const T* end) {
std::string r = "{";
bool comma = false;
while (begin != end) {
if (comma) {
r += ", ";
} else {
comma = true;
}
r += toString(*begin++);
}
r += "}";
return r;
}
template <typename T>
std::string toString(const std::vector<T>& v) {
return toString(&v.front(), &v.back() + 1);
}
template <typename T>
std::string toString(const VectorRef<T>& v) {
return toString(v.begin(), v.end());
}
template <typename K, typename V>
std::string toString(const std::map<K, V>& m) {
std::string r = "{";
bool comma = false;
for (const auto& [key, value] : m) {
if (comma) {
r += ", ";
} else {
comma = true;
}
r += toString(value);
r += " ";
r += toString(key);
}
r += "}\n";
return r;
}
template <typename K, typename V>
std::string toString(const std::unordered_map<K, V>& u) {
std::string r = "{";
bool comma = false;
for (const auto& n : u) {
if (comma) {
r += ", ";
} else {
comma = true;
}
r += toString(n.first);
r += " => ";
r += toString(n.second);
}
r += "}";
return r;
}
template <typename T>
std::string toString(const Optional<T>& o) {
if (o.present()) {
return toString(o.get());
}
return "<not present>";
}
template <typename F, typename S>
std::string toString(const std::pair<F, S>& o) {
return format("{%s, %s}", toString(o.first).c_str(), toString(o.second).c_str());
}
constexpr static int ioMinPriority = 0;
constexpr static int ioLeafPriority = 1;
constexpr static int ioMaxPriority = 3;
// A FIFO queue of T stored as a linked list of pages.
// Main operations are pop(), pushBack(), pushFront(), and flush().
//
// flush() will ensure all queue pages are written to the pager and move the unflushed
// pushFront()'d records onto the front of the queue, in FIFO order.
//
// pop() will only return records that have been flushed, and pops
// from the front of the queue.
//
// Each page contains some number of T items and a link to the next page and starting position on that page.
// When the queue is flushed, the last page in the chain is ended and linked to a newly allocated
// but not-yet-written-to pageID, which future writes after the flush will write to.
// Items pushed onto the front of the queue are written to a separate linked list until flushed,
// at which point that list becomes the new front of the queue.
//
// The write pattern is designed such that no page is ever expected to be valid after
// being written to or updated but not fsync'd. This is why a new unused page is added
// to the queue, linked to by the last data page, before commit. The new page can't be
// added and filled with data as part of the next commit because that would mean modifying
// the previous tail page to update its next link, which risks corrupting it and losing
// data that was not yet popped if that write is never fsync'd.
//
// While FIFOQueue stores data inside a pager, it also is used _by_ the pager to hold internal metadata
// such as free lists and logical page remapping activity.
// Therefore, FIFOQueue
// - does not use versioned page reads
// - does not use versioned page updates via atomicUpdatePage()
// - does not update pages in-place
// - writes to a Page ID only once before freeing it
//
// For clarity, FIFOQueue Page IDs are always typed as PhysicalPageID, as they are always
// physical page IDs since atomic page updates are not used.
//
// Requirements on T
// - must be trivially copyable
// OR have a specialization for FIFOQueueCodec<T>
// OR have the following methods
// // Deserialize from src into *this, return number of bytes from src consumed
// int readFromBytes(const uint8_t *src);
// // Return the size of *this serialized
// int bytesNeeded() const;
// // Serialize *this to dst, return number of bytes written to dst
// int writeToBytes(uint8_t *dst) const;
// - must be supported by toString(object) (see above)
template <typename T, typename Enable = void>
struct FIFOQueueCodec {
static T readFromBytes(const uint8_t* src, int& bytesRead) {
T x;
bytesRead = x.readFromBytes(src);
return x;
}
static int bytesNeeded(const T& x) { return x.bytesNeeded(); }
static int writeToBytes(uint8_t* dst, const T& x) { return x.writeToBytes(dst); }
};
template <typename T>
struct FIFOQueueCodec<T, typename std::enable_if<std::is_trivially_copyable<T>::value>::type> {
static_assert(std::is_trivially_copyable<T>::value);
static T readFromBytes(const uint8_t* src, int& bytesRead) {
bytesRead = sizeof(T);
return *(T*)src;
}
static int bytesNeeded(const T& x) { return sizeof(T); }
static int writeToBytes(uint8_t* dst, const T& x) {
*(T*)dst = x;
return sizeof(T);
}
};
template <typename T, typename Codec = FIFOQueueCodec<T>>
class FIFOQueue {
public:
#pragma pack(push, 1)
struct QueueState {
constexpr static FileIdentifier file_identifier = 16482812;
QueueID queueID;
// First page in the linked-list queue structure
PhysicalPageID headPageID;
// Item offset representing the next item in the queue in the first page
uint16_t headOffset;
// Last page in the linked-list queue structure, where new entries will be written
// Note there is no tailPageOffset, it is always 0 because the tail page has never been written
PhysicalPageID tailPageID;
uint32_t numPages;
// numEntries could technically exceed the max page ID so it is 64 bits
uint64_t numEntries;
// State for queues that use pages in contiguous blocks of disk space called extents
bool usesExtents;
PhysicalPageID prevExtentEndPageID;
bool tailPageNewExtent; // Tail page points to the start of a new extent
std::string toString() const {
return format("{queueID: %u head: %s:%d tail: %s numPages: %" PRId64 " numEntries: %" PRId64
" usesExtents:%d}",
queueID,
::toString(headPageID).c_str(),
(int)headOffset,
::toString(tailPageID).c_str(),
numPages,
numEntries,
usesExtents);
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar,
queueID,
headPageID,
headOffset,
tailPageID,
numPages,
numEntries,
usesExtents,
prevExtentEndPageID,
tailPageNewExtent);
}
};
struct QueuePage {
// The next page of the queue after this one
PhysicalPageID nextPageID;
// The start offset of the next page
uint16_t nextOffset;
// The end offset of the current page's data entries
uint16_t endOffset;
// Current page within the extent
PhysicalPageID extentCurPageID;
// The end page within the extent
PhysicalPageID extentEndPageID;
uint16_t itemSpace;
// Get pointer to data after page header
const uint8_t* begin() const { return (const uint8_t*)(this + 1); }
uint8_t* begin() { return (uint8_t*)(this + 1); }
};
#pragma pack(pop)
struct Cursor {
// Queue mode
enum Mode { NONE, POP, READONLY, WRITE };
Mode mode;
// Queue this cursor is accessing
FIFOQueue* queue;
// The current page and pageID being read or written to
PhysicalPageID pageID;
Reference<ArenaPage> page;
// The first page ID to be written to the pager, if this cursor has written anything
PhysicalPageID firstPageIDWritten;
// Offset after QueuePage::begin() to next read from or write to
int offset;
// A read cursor will not read this page (or beyond)
PhysicalPageID endPageID;
// Page future and corresponding page ID for the expected next page to be used. It may not
// match the current page's next page link because queues can prepended with new front pages.
Future<Reference<ArenaPage>> nextPageReader;
PhysicalPageID nextPageID;
// Future that represents all outstanding write operations previously issued
// This exists because writing the queue returns void, not a future
Future<Void> writeOperations;
FlowMutex mutex;
Cursor() : mode(NONE) {}
// Initialize a cursor.
void init(FIFOQueue* q = nullptr,
Mode m = NONE,
PhysicalPageID initialPageID = invalidPhysicalPageID,
bool initExtentInfo = true,
bool tailPageNewExtent = false,
PhysicalPageID endPage = invalidPhysicalPageID,
int readOffset = 0,
PhysicalPageID prevExtentEndPageID = invalidPhysicalPageID) {
queue = q;
mode = m;
firstPageIDWritten = invalidPhysicalPageID;
offset = readOffset;
endPageID = endPage;
page.clear();
writeOperations = Void();
if (mode == POP || mode == READONLY) {
// If cursor is not pointed at the end page then start loading it.
// The end page will not have been written to disk yet.
pageID = initialPageID;
if (pageID != endPageID) {
// For extent based queues, we loads extents at a time during recovery
if (queue->usesExtents && initExtentInfo)
loadExtent();
else
startNextPageLoad(pageID);
} else {
nextPageID = invalidPhysicalPageID;
}
} else {
pageID = invalidPhysicalPageID;
ASSERT(mode == WRITE ||
(initialPageID == invalidPhysicalPageID && readOffset == 0 && endPage == invalidPhysicalPageID));
}
debug_printf("FIFOQueue::Cursor(%s) initialized\n", toString().c_str());
if (mode == WRITE && initialPageID != invalidPhysicalPageID) {
debug_printf("FIFOQueue::Cursor(%s) init. Adding new page %u\n", toString().c_str(), initialPageID);
addNewPage(initialPageID, 0, true, initExtentInfo, tailPageNewExtent, prevExtentEndPageID);
}
}
// Reset the read cursor (this is used only for extent based remap queue after recovering the remap
// queue contents via fastpath extent reads)
void resetRead() {
ASSERT(mode == POP || mode == READONLY);
page.clear();
if (pageID != endPageID) {
startNextPageLoad(pageID);
}
}
// Since cursors can have async operations pending which modify their state they can't be copied cleanly
Cursor(const Cursor& other) = delete;
~Cursor() { writeOperations.cancel(); }
// A read cursor can be initialized from a pop cursor
void initReadOnly(const Cursor& c, bool readExtents = false) {
ASSERT(c.mode == READONLY || c.mode == POP);
init(c.queue, READONLY, c.pageID, readExtents, false, c.endPageID, c.offset);
}
std::string toString() const {
if (mode == WRITE) {
return format("{WriteCursor %s:%p pos=%s:%d rawEndOffset=%d}",
queue->name.c_str(),
this,
::toString(pageID).c_str(),
offset,
page ? header()->endOffset : -1);
}
if (mode == POP || mode == READONLY) {
return format("{ReadCursor %s:%p pos=%s:%d rawEndOffset=%d endPage=%s nextPage=%s}",
queue->name.c_str(),
this,
::toString(pageID).c_str(),
offset,
(page && header()) ? header()->endOffset : -1,
::toString(endPageID).c_str(),
::toString(nextPageID).c_str());
}
ASSERT(mode == NONE);
return format("{NullCursor=%p}", this);
}
// Returns true if the mutex cannot be immediately taken.
bool isBusy() { return !mutex.available(); }
// Wait for all operations started before now to be ready, which is done by
// obtaining and releasing the mutex.
Future<Void> notBusy() {
return isBusy() ? map(mutex.take(),
[&](FlowMutex::Lock lock) {
lock.release();
return Void();
})
: Void();
}
// Returns true if any items have been written to the last page
bool pendingTailWrites() const { return mode == WRITE && offset != 0; }
QueuePage* header() const { return ((QueuePage*)(page->mutateData())); }
void setNext(PhysicalPageID pageID, int offset) {
ASSERT(mode == WRITE);
QueuePage* p = header();
p->nextPageID = pageID;
p->nextOffset = offset;
}
void startNextPageLoad(PhysicalPageID id) {
nextPageID = id;
debug_printf(
"FIFOQueue::Cursor(%s) loadPage start id=%s\n", toString().c_str(), ::toString(nextPageID).c_str());
nextPageReader = queue->pager->readPage(
PagerEventReasons::MetaData, nonBtreeLevel, nextPageID, ioMaxPriority, true, false);
if (!nextPageReader.isReady()) {
nextPageReader = waitOrError(nextPageReader, queue->pagerError);
}
}
Future<Void> loadExtent() {
ASSERT(mode == POP | mode == READONLY);
debug_printf("FIFOQueue::Cursor(%s) loadExtent\n", toString().c_str());
return map(queue->pager->readExtent(pageID), [=](Reference<ArenaPage> p) {
page = p;
debug_printf("FIFOQueue::Cursor(%s) loadExtent done. Page: %p\n", toString().c_str(), page->rawData());
return Void();
});
}
void writePage() {
ASSERT(mode == WRITE);
debug_printf("FIFOQueue::Cursor(%s) writePage\n", toString().c_str());
// Zero unused space within itemspace
memset(header()->begin() + header()->endOffset, 0, header()->itemSpace - header()->endOffset);
queue->pager->updatePage(
PagerEventReasons::MetaData, nonBtreeLevel, VectorRef<PhysicalPageID>(&pageID, 1), page);
if (firstPageIDWritten == invalidPhysicalPageID) {
firstPageIDWritten = pageID;
}
}
// Link the current page to newPageID:newOffset and then write it to the pager.
// The link destination could be a new page at the end of the queue, or the beginning of
// an existing chain of queue pages.
// If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized
// as a new tail page.
// if initializeExtentInfo is true in addition to initializeNewPage, update the extentEndPageID info
// in the mew page being added using newExtentPage and prevExtentEndPageID parameters
void addNewPage(PhysicalPageID newPageID,
int newOffset,
bool initializeNewPage,
bool initializeExtentInfo = false,
bool newExtentPage = false,
PhysicalPageID prevExtentEndPageID = invalidPhysicalPageID) {
ASSERT(mode == WRITE);
ASSERT(newPageID != invalidPhysicalPageID);
debug_printf("FIFOQueue::Cursor(%s) Adding page %s initPage=%d initExtentInfo=%d newExtentPage=%d\n",
toString().c_str(),
::toString(newPageID).c_str(),
initializeNewPage,
initializeExtentInfo,
newExtentPage);
// Update existing page/newLastPageID and write, if it exists
if (page) {
setNext(newPageID, newOffset);
debug_printf("FIFOQueue::Cursor(%s) Linked new page %s:%d\n",
toString().c_str(),
::toString(newPageID).c_str(),
newOffset);
writePage();
prevExtentEndPageID = header()->extentEndPageID;
if (pageID == prevExtentEndPageID)
newExtentPage = true;
debug_printf(
"FIFOQueue::Cursor(%s) Linked new page. pageID %u, newPageID %u, prevExtentEndPageID %u\n",
toString().c_str(),
pageID,
newPageID,
prevExtentEndPageID);
}
pageID = newPageID;
offset = newOffset;
if (initializeNewPage) {
debug_printf("FIFOQueue::Cursor(%s) Initializing new page. usesExtents: %d, initializeExtentInfo: %d\n",
toString().c_str(),
queue->usesExtents,
initializeExtentInfo);
page = queue->pager->newPageBuffer();
page->init(EncodingType::XXHash64,
queue->usesExtents ? PageType::QueuePageInExtent : PageType::QueuePageStandalone,
(uint8_t)queue->queueID);
setNext(0, 0);
auto p = header();
ASSERT(newOffset == 0);
p->endOffset = 0;
p->itemSpace = page->dataSize() - sizeof(QueuePage);
if (g_network->isSimulated() && deterministicRandom()->coinflip()) {
// Randomly reduce available item space to cause more queue pages to be needed
int reducedSpace = deterministicRandom()->randomInt(50, p->itemSpace);
// Zero the eliminated space
memset(header()->begin() + reducedSpace, 0, p->itemSpace - reducedSpace);
p->itemSpace = reducedSpace;
}
// For extent based queue, update the index of current page within the extent
if (queue->usesExtents) {
debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d pageCount % " PRId64 "\n",
toString().c_str(),
::toString(newPageID).c_str(),
initializeNewPage,
queue->pager->getPageCount());
p->extentCurPageID = newPageID;
if (initializeExtentInfo) {
int pagesPerExtent = queue->pagesPerExtent;
if (newExtentPage) {
p->extentEndPageID = newPageID + pagesPerExtent - 1;
debug_printf("FIFOQueue::Cursor(%s) newExtentPage. newPageID %u, pagesPerExtent %d, "
"ExtentEndPageID: %s\n",
toString().c_str(),
newPageID,
pagesPerExtent,
::toString(p->extentEndPageID).c_str());
} else {
p->extentEndPageID = prevExtentEndPageID;
debug_printf("FIFOQueue::Cursor(%s) Copied ExtentEndPageID: %s\n",
toString().c_str(),
::toString(p->extentEndPageID).c_str());
}
}
} else {
p->extentCurPageID = invalidPhysicalPageID;
p->extentEndPageID = invalidPhysicalPageID;
}
} else {
debug_printf("FIFOQueue::Cursor(%s) Clearing new page\n", toString().c_str());
page.clear();
}
}
// Write item to the next position in the current page or, if it won't fit, add a new page and write it there.
ACTOR static Future<Void> write_impl(Cursor* self, T item) {
ASSERT(self->mode == WRITE);
state FlowMutex::Lock lock;
state bool mustWait = self->isBusy();
state int bytesNeeded = Codec::bytesNeeded(item);
state bool needNewPage =
self->pageID == invalidPhysicalPageID || self->offset + bytesNeeded > self->header()->itemSpace;
if (g_network->isSimulated()) {
// Sometimes (1% probability) decide a new page is needed as long as at least 1 item has been
// written (indicated by non-zero offset) to the current page.
if ((self->offset > 0) && deterministicRandom()->random01() < 0.01) {
needNewPage = true;
}
}
debug_printf("FIFOQueue::Cursor(%s) write(%s) mustWait=%d needNewPage=%d\n",
self->toString().c_str(),
::toString(item).c_str(),
mustWait,
needNewPage);
// If we have to wait for the mutex because it's busy, or we need a new page, then wait for the mutex.
if (mustWait || needNewPage) {
FlowMutex::Lock _lock = wait(self->mutex.take());
lock = _lock;
// If we had to wait because the mutex was busy, then update needNewPage as another writer
// would have changed the cursor state
// Otherwise, taking the mutex would be immediate so no other writer could have run
if (mustWait) {
needNewPage =
self->pageID == invalidPhysicalPageID || self->offset + bytesNeeded > self->header()->itemSpace;
if (g_network->isSimulated()) {
// Sometimes (1% probability) decide a new page is needed as long as at least 1 item has been
// written (indicated by non-zero offset) to the current page.
if ((self->offset > 0) && deterministicRandom()->random01() < 0.01) {
needNewPage = true;
}
}
}
}
// If we need a new page, add one.
if (needNewPage) {
debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n",
self->toString().c_str(),
::toString(item).c_str());
state PhysicalPageID newPageID;
// If this is an extent based queue, check if there is an available page in current extent
if (self->queue->usesExtents) {
bool allocateNewExtent = false;
if (self->pageID != invalidPhysicalPageID) {
auto praw = self->header();
if (praw->extentCurPageID < praw->extentEndPageID) {
newPageID = praw->extentCurPageID + 1;
} else {
allocateNewExtent = true;
}
} else
allocateNewExtent = true;
if (allocateNewExtent) {
PhysicalPageID newPID = wait(self->queue->pager->newExtentPageID(self->queue->queueID));
newPageID = newPID;
}
} else {
PhysicalPageID newPID = wait(self->queue->pager->newPageID());
newPageID = newPID;
}
self->addNewPage(newPageID, 0, true, true);
++self->queue->numPages;
}
debug_printf(
"FIFOQueue::Cursor(%s) write(%s) writing\n", self->toString().c_str(), ::toString(item).c_str());
auto p = self->header();
Codec::writeToBytes(p->begin() + self->offset, item);
self->offset += bytesNeeded;
p->endOffset = self->offset;
++self->queue->numEntries;
if (mustWait || needNewPage) {
// Prevent possible stack overflow if too many waiters which require no IO are queued up
// Using static because multiple Cursors can be involved
static int sinceYield = 0;
if (++sinceYield == 1000) {
sinceYield = 0;
wait(delay(0));
}
lock.release();
}
return Void();
}
void write(const T& item) {
// Start the write. It may complete immediately if no IO was being waited on
Future<Void> w = write_impl(this, item);
// If it didn't complete immediately, then store the future in operation
if (!w.isReady()) {
writeOperations = writeOperations && w;
}
}
// If readNext() cannot complete immediately because it must wait for IO, it will route to here.
// The purpose of this function is to serialize simultaneous readers on self while letting the
// common case (>99.8% of the time) be handled with low overhead by the non-actor readNext() function.
//
// The mutex will be taken if locked is false.
// The next page will be waited for if load is true.
// Only mutex holders will wait on the page read.
ACTOR static Future<Optional<T>> waitThenReadNext(Cursor* self,
Optional<T> inclusiveMaximum,
FlowMutex::Lock* lock,
bool load) {
state FlowMutex::Lock localLock;
// Lock the mutex if it wasn't already locked, so we didn't get a lock pointer
if (lock == nullptr) {
debug_printf("FIFOQueue::Cursor(%s) waitThenReadNext locking mutex\n", self->toString().c_str());
FlowMutex::Lock newLock = wait(self->mutex.take());
localLock = newLock;
}
if (load) {
debug_printf("FIFOQueue::Cursor(%s) waitThenReadNext waiting for page load\n",
self->toString().c_str());
wait(success(self->nextPageReader));
}
state Optional<T> result = wait(self->readNext(inclusiveMaximum, &localLock));
// If a lock was not passed in, so this actor locked the mutex above, then unlock it
if (lock == nullptr) {
// Prevent possible stack overflow if too many waiters which require no IO are queued up
// Using static because multiple Cursors can be involved
static int sinceYield = 0;
if (++sinceYield == 1000) {
sinceYield = 0;
wait(delay(0));
}
debug_printf("FIFOQueue::Cursor(%s) waitThenReadNext unlocking mutex\n", self->toString().c_str());
localLock.release();
}
return result;
}
// Read the next item from the cursor, possibly moving to and waiting for a new page if the prior page was
// exhausted. If the item is <= inclusiveMaximum, then return it after advancing the cursor to the next item.
// Otherwise, return nothing and do not advance the cursor.
// If locked is true, this call owns the mutex, which would have been locked by readNext() before a recursive
// call. See waitThenReadNext() for more detail.
Future<Optional<T>> readNext(const Optional<T>& inclusiveMaximum = {}, FlowMutex::Lock* lock = nullptr) {
if ((mode != POP && mode != READONLY) || pageID == invalidPhysicalPageID || pageID == endPageID) {
debug_printf("FIFOQueue::Cursor(%s) readNext returning nothing\n", toString().c_str());
return Optional<T>();
}
// If we don't have a lock and the mutex isn't available then acquire it
if (lock == nullptr && isBusy()) {
return waitThenReadNext(this, inclusiveMaximum, lock, false);
}
// We now know pageID is valid and should be used, but page might not point to it yet
if (!page) {
debug_printf("FIFOQueue::Cursor(%s) loading\n", toString().c_str());
// If the next pageID loading or loaded is not the page we should be reading then restart the load
// nextPageID coud be different because it could be invalid or it could be no longer relevant
// if the previous commit added new pages to the front of the queue.
if (pageID != nextPageID) {
debug_printf("FIFOQueue::Cursor(%s) reloading\n", toString().c_str());
startNextPageLoad(pageID);
}
if (!nextPageReader.isReady()) {
return waitThenReadNext(this, inclusiveMaximum, lock, true);
}
page = nextPageReader.get();
// Start loading the next page if it's not the end page
auto p = header();
if (p->nextPageID != endPageID) {
startNextPageLoad(p->nextPageID);
} else {
// Prevent a future next page read from reusing the same result as page would have to be updated
// before the queue would read it again
nextPageID = invalidPhysicalPageID;
}
}
auto p = header();
debug_printf("FIFOQueue::Cursor(%s) readNext reading at current position\n", toString().c_str());
ASSERT(offset < p->endOffset);
int bytesRead;
const T result = Codec::readFromBytes(p->begin() + offset, bytesRead);
if (inclusiveMaximum.present() && inclusiveMaximum.get() < result) {
debug_printf("FIFOQueue::Cursor(%s) not popping %s, exceeds upper bound %s\n",
toString().c_str(),
::toString(result).c_str(),
::toString(inclusiveMaximum.get()).c_str());
return Optional<T>();
}
offset += bytesRead;
if (mode == POP) {
--queue->numEntries;
}
debug_printf("FIFOQueue::Cursor(%s) after read of %s\n", toString().c_str(), ::toString(result).c_str());
ASSERT(offset <= p->endOffset);
// If this page is exhausted, start reading the next page for the next readNext() to use, unless it's the
// tail page
if (offset == p->endOffset) {
debug_printf("FIFOQueue::Cursor(%s) Page exhausted\n", toString().c_str());
PhysicalPageID oldPageID = pageID;
PhysicalPageID extentCurPageID = p->extentCurPageID;
PhysicalPageID extentEndPageID = p->extentEndPageID;
pageID = p->nextPageID;
offset = p->nextOffset;
// If pageID isn't the tail page and nextPageID isn't pageID then start loading the next page
if (pageID != endPageID && nextPageID != pageID) {
startNextPageLoad(pageID);
}
if (mode == POP) {
--queue->numPages;
}
page.clear();
debug_printf("FIFOQueue::Cursor(%s) readNext page exhausted, moved to new page\n", toString().c_str());
if (mode == POP) {
if (!queue->usesExtents) {
// Freeing the old page must happen after advancing the cursor and clearing the page reference
// because freePage() could cause a push onto a queue that causes a newPageID() call which could
// pop() from this very same queue. Queue pages are freed at version 0 because they can be
// reused after the next commit.
queue->pager->freePage(oldPageID, 0);
} else if (extentCurPageID == extentEndPageID) {
// Figure out the beginning of the extent
int pagesPerExtent = queue->pagesPerExtent;
queue->pager->freeExtent(oldPageID - pagesPerExtent + 1);
}
}
}
debug_printf("FIFOQueue(%s) %s(inclusiveMaximum=%s) -> %s\n",
queue->name.c_str(),
(mode == POP ? "pop" : "peek"),
::toString(inclusiveMaximum).c_str(),
::toString(result).c_str());
return Optional<T>(result);
}
};
public:
FIFOQueue() : pager(nullptr) {}
~FIFOQueue() { newTailPage.cancel(); }
FIFOQueue(const FIFOQueue& other) = delete;
void operator=(const FIFOQueue& rhs) = delete;
// Create a new queue at newPageID
void create(IPager2* p, PhysicalPageID newPageID, std::string queueName, QueueID id, bool extent) {
debug_printf("FIFOQueue(%s) create from page %s. usesExtents %d\n",
queueName.c_str(),
toString(newPageID).c_str(),
extent);
pager = p;
pagerError = pager->getError();
name = queueName;
queueID = id;
numPages = 1;
numEntries = 0;
usesExtents = extent;
tailPageNewExtent = false;
prevExtentEndPageID = invalidPhysicalPageID;
pagesPerExtent = pager->getPagesPerExtent();
headReader.init(this, Cursor::POP, newPageID, false, false, newPageID, 0);
tailWriter.init(this, Cursor::WRITE, newPageID, true, true);
headWriter.init(this, Cursor::WRITE);
newTailPage = invalidPhysicalPageID;
debug_printf("FIFOQueue(%s) created\n", queueName.c_str());
}
// Load an existing queue from its queue state
void recover(IPager2* p, const QueueState& qs, std::string queueName, bool loadExtents = true) {
debug_printf("FIFOQueue(%s) recover from queue state %s\n", queueName.c_str(), qs.toString().c_str());
pager = p;
pagerError = pager->getError();
name = queueName;
queueID = qs.queueID;
numPages = qs.numPages;
numEntries = qs.numEntries;
usesExtents = qs.usesExtents;
tailPageNewExtent = qs.tailPageNewExtent;
prevExtentEndPageID = qs.prevExtentEndPageID;
pagesPerExtent = pager->getPagesPerExtent();
headReader.init(this, Cursor::POP, qs.headPageID, loadExtents, false, qs.tailPageID, qs.headOffset);
tailWriter.init(this,
Cursor::WRITE,
qs.tailPageID,
true,
qs.tailPageNewExtent,
invalidPhysicalPageID,
0,
qs.prevExtentEndPageID);
headWriter.init(this, Cursor::WRITE);
newTailPage = invalidPhysicalPageID;
debug_printf("FIFOQueue(%s) recovered\n", queueName.c_str());
}
// Reset the head reader (this is used only for extent based remap queue after recovering the remap
// queue contents via fastpath extent reads)
void resetHeadReader() {
headReader.resetRead();
debug_printf("FIFOQueue(%s) read cursor reset\n", name.c_str());
}
// Fast path extent peekAll (this zooms through the queue reading extents at a time)
// Output interface is a promise stream and one vector of results per extent found is sent to the promise stream
// Once we are finished reading all the extents of the queue, end_of_stream() is sent to mark completion
ACTOR static Future<Void> peekAll_ext(FIFOQueue* self, PromiseStream<Standalone<VectorRef<T>>> res) {
state Cursor c;
c.initReadOnly(self->headReader, true);
debug_printf("FIFOQueue::Cursor(%s) peekAllExt begin\n", c.toString().c_str());
if (c.pageID == invalidPhysicalPageID || c.pageID == c.endPageID) {
debug_printf("FIFOQueue::Cursor(%s) peekAllExt returning nothing\n", c.toString().c_str());
res.sendError(end_of_stream());
return Void();
}
state int entriesRead = 0;
// Loop over all the extents in this queue
loop {
// We now know we are pointing to PageID and it should be read and used, but it may not be loaded yet.
if (!c.page) {
debug_printf("FIFOQueue::Cursor(%s) peekAllExt going to Load Extent %s.\n",
c.toString().c_str(),
::toString(c.pageID).c_str());
wait(c.loadExtent());
wait(yield());
}
state Standalone<VectorRef<T>> results;
results.reserve(results.arena(), self->pagesPerExtent * self->pager->getPhysicalPageSize() / sizeof(T));
// Loop over all the pages in this extent
state int pageIdx = 0;
loop {
// Position the page pointer to current page in the extent
state Reference<ArenaPage> page = c.page->getSubPage(pageIdx++ * self->pager->getPhysicalPageSize(),
self->pager->getLogicalPageSize());
debug_printf("FIFOQueue::Cursor(%s) peekALLExt %s. Offset %d\n",
c.toString().c_str(),
toString(c.pageID).c_str(),
c.pageID * self->pager->getPhysicalPageSize());
try {
page->postReadHeader(c.pageID);
// These pages are not encrypted
page->postReadPayload(c.pageID);
} catch (Error& e) {
TraceEvent(SevError, "RedwoodChecksumFailed")
.error(e)
.detail("PageID", c.pageID)
.detail("PageSize", self->pager->getPhysicalPageSize())
.detail("Offset", c.pageID * self->pager->getPhysicalPageSize());
debug_printf("FIFOQueue::Cursor(%s) peekALLExt getSubPage error=%s for %s. Offset %d ",
c.toString().c_str(),
e.what(),
toString(c.pageID).c_str(),
c.pageID * self->pager->getPhysicalPageSize());
throw;
}
const QueuePage* p = (const QueuePage*)(page->data());
int bytesRead;
// Now loop over all entries inside the current page
loop {
ASSERT(c.offset < p->endOffset);
T result = Codec::readFromBytes(p->begin() + c.offset, bytesRead);
debug_printf(
"FIFOQueue::Cursor(%s) after read of %s\n", c.toString().c_str(), ::toString(result).c_str());
results.push_back(results.arena(), result);
entriesRead++;
c.offset += bytesRead;
ASSERT(c.offset <= p->endOffset);
if (c.offset == p->endOffset) {
c.pageID = p->nextPageID;
c.offset = p->nextOffset;
debug_printf("FIFOQueue::Cursor(%s) peekAllExt page exhausted, moved to new page\n",
c.toString().c_str());
debug_printf("FIFOQueue:: nextPageID=%s, extentCurPageID=%s, extentEndPageID=%s\n",
::toString(p->nextPageID).c_str(),
::toString(p->extentCurPageID).c_str(),
::toString(p->extentEndPageID).c_str());
break;
}
} // End of Page
// Check if we have reached the end of the queue
if (c.pageID == invalidPhysicalPageID || c.pageID == c.endPageID) {
debug_printf("FIFOQueue::Cursor(%s) peekAllExt Queue exhausted\n", c.toString().c_str());
res.send(results);
// Since we have reached the end of the queue, verify that the number of entries read matches
// the queue metadata. If it does, send end_of_stream() to mark completion, else throw an error
if (entriesRead != self->numEntries) {
Error e = internal_error(); // TODO: Something better?
TraceEvent(SevError, "RedwoodQueueNumEntriesMisMatch")
.error(e)
.detail("EntriesRead", entriesRead)
.detail("ExpectedEntries", self->numEntries);
throw e;
}
res.sendError(end_of_stream());
return Void();
}
// Check if we have reached the end of current extent
if (p->extentCurPageID == p->extentEndPageID) {
c.page.clear();
debug_printf("FIFOQueue::Cursor(%s) peekAllExt extent exhausted, moved to new extent\n",
c.toString().c_str());
// send an extent worth of entries to the promise stream
res.send(results);
self->pager->releaseExtentReadLock();
break;
}
} // End of Extent
} // End of Queue
}
Future<Void> peekAllExt(PromiseStream<Standalone<VectorRef<T>>> resStream) { return peekAll_ext(this, resStream); }
ACTOR static Future<Standalone<VectorRef<T>>> peekAll_impl(FIFOQueue* self) {
state Standalone<VectorRef<T>> results;
state Cursor c;
c.initReadOnly(self->headReader);
results.reserve(results.arena(), self->numEntries);
state int sinceYield = 0;
loop {
Optional<T> x = wait(c.readNext());
if (!x.present()) {
break;
}
results.push_back(results.arena(), x.get());
// yield periodically to avoid overflowing the stack
if (++sinceYield >= 100) {
sinceYield = 0;
wait(yield());
}
}
return results;
}
Future<Standalone<VectorRef<T>>> peekAll() { return peekAll_impl(this); }
ACTOR static Future<Optional<T>> peek_impl(FIFOQueue* self) {
state Cursor c;
c.initReadOnly(self->headReader);
Optional<T> x = wait(c.readNext());
return x;
}
Future<Optional<T>> peek() { return peek_impl(this); }
// Pop the next item on front of queue if it is <= inclusiveMaximum or if inclusiveMaximum is not present
Future<Optional<T>> pop(Optional<T> inclusiveMaximum = {}) { return headReader.readNext(inclusiveMaximum); }
QueueState getState() const {
QueueState s;
s.queueID = queueID;
s.headOffset = headReader.offset;
s.headPageID = headReader.pageID;
s.tailPageID = tailWriter.pageID;
s.numEntries = numEntries;
s.numPages = numPages;
s.usesExtents = usesExtents;
s.tailPageNewExtent = tailPageNewExtent;
s.prevExtentEndPageID = prevExtentEndPageID;
debug_printf("FIFOQueue(%s) getState(): %s\n", name.c_str(), s.toString().c_str());
return s;
}
void pushBack(const T& item) {
debug_printf("FIFOQueue(%s) pushBack(%s)\n", name.c_str(), toString(item).c_str());
tailWriter.write(item);
}
void pushFront(const T& item) {
debug_printf("FIFOQueue(%s) pushFront(%s)\n", name.c_str(), toString(item).c_str());
headWriter.write(item);
}
bool isBusy() {
return headWriter.isBusy() || headReader.isBusy() || tailWriter.isBusy() || !newTailPage.isReady();
}
// Wait until all previously started operations on each cursor are done and the new tail page is ready
Future<Void> notBusy() {
auto f = headWriter.notBusy() && headReader.notBusy() && tailWriter.notBusy() && ready(newTailPage);
debug_printf("FIFOQueue(%s) notBusy future ready=%d\n", name.c_str(), f.isReady());
return f;
}
// preFlush() prepares this queue to be flushed to disk, but doesn't actually do it so the queue can still
// be pushed and popped after this operation. It returns whether or not any operations were pending or
// started during execution.
//
// If one or more queues are used by their pager in newPageID() or freePage() operations, then preFlush()
// must be called on each of them inside a loop that runs until each of the preFlush() calls have returned
// false twice in a row.
//
// The reason for all this is that:
// - queue pop() can call pager->freePage() which can call push() on the same or another queue
// - queue push() can call pager->newPageID() which can call pop() on the same or another queue
// This creates a circular dependency with 1 or more queues when those queues are used by the pager
// to manage free page IDs.
ACTOR static Future<bool> preFlush_impl(FIFOQueue* self) {
debug_printf("FIFOQueue(%s) preFlush begin\n", self->name.c_str());
wait(self->notBusy());
// Completion of the pending operations as of the start of notBusy() could have began new operations,
// so see if any work is pending now.
bool workPending = self->isBusy();
if (!workPending) {
// A newly created or flushed queue starts out in a state where its tail page to be written to is empty.
// After pushBack() is called, this is no longer the case and never will be again until the queue is
// flushed. Before the non-empty tail page is written it must be linked to a new empty page for use after
// the next flush. (This is explained more at the top of FIFOQueue but it is because queue pages can only
// be written once because once they contain durable data a second write to link to a new page could corrupt
// the existing data if the subsequent commit never succeeds.)
//
// If the newTailPage future is ready but it's an invalid page and the tail page we are currently pointed to
// has had items added to it, then get a new tail page ID.
if (self->newTailPage.isReady() && self->newTailPage.get() == invalidPhysicalPageID) {
if (self->tailWriter.pendingTailWrites()) {
debug_printf("FIFOQueue(%s) preFlush starting to get new page ID\n", self->name.c_str());
if (self->usesExtents) {
if (self->tailWriter.pageID == invalidPhysicalPageID) {
self->newTailPage = self->pager->newExtentPageID(self->queueID);
self->tailPageNewExtent = true;
self->prevExtentEndPageID = invalidPhysicalPageID;
} else {
auto p = self->tailWriter.header();
debug_printf(
"FIFOQueue(%s) newTailPage tailWriterPage %u extentCurPageID %u, extentEndPageID %u\n",
self->name.c_str(),
self->tailWriter.pageID,
p->extentCurPageID,
p->extentEndPageID);
if (p->extentCurPageID < p->extentEndPageID) {
self->newTailPage = p->extentCurPageID + 1;
self->tailPageNewExtent = false;
self->prevExtentEndPageID = p->extentEndPageID;
} else {
self->newTailPage = self->pager->newExtentPageID(self->queueID);
self->tailPageNewExtent = true;
self->prevExtentEndPageID = invalidPhysicalPageID;
}
}
debug_printf("FIFOQueue(%s) newTailPage tailPageNewExtent:%d prevExtentEndPageID: %u "
"tailWriterPage %u\n",
self->name.c_str(),
self->tailPageNewExtent,
self->prevExtentEndPageID,
self->tailWriter.pageID);
} else
self->newTailPage = self->pager->newPageID();
workPending = true;
} else {
if (self->usesExtents) {
auto p = self->tailWriter.header();
self->prevExtentEndPageID = p->extentEndPageID;
self->tailPageNewExtent = false;
debug_printf("FIFOQueue(%s) newTailPage tailPageNewExtent: %d prevExtentEndPageID: %u "
"tailWriterPage %u\n",
self->name.c_str(),
self->tailPageNewExtent,
self->prevExtentEndPageID,
self->tailWriter.pageID);
}
}
}
}
debug_printf("FIFOQueue(%s) preFlush returning %d\n", self->name.c_str(), workPending);
return workPending;
}
Future<bool> preFlush() { return preFlush_impl(this); }
void finishFlush() {
debug_printf("FIFOQueue(%s) finishFlush start\n", name.c_str());
ASSERT(!isBusy());
bool initTailWriter = true;
// If a new tail page was allocated, link the last page of the tail writer to it.
if (newTailPage.get() != invalidPhysicalPageID) {
tailWriter.addNewPage(newTailPage.get(), 0, false, false);
// The flush sequence allocated a page and added it to the queue so increment numPages
++numPages;
// newPage() should be ready immediately since a pageID is being explicitly passed.
ASSERT(!tailWriter.isBusy());
newTailPage = invalidPhysicalPageID;
initTailWriter = true;
}
// If the headWriter wrote anything, link its tail page to the headReader position and point the headReader
// to the start of the headWriter
if (headWriter.pendingTailWrites()) {
headWriter.addNewPage(headReader.pageID, headReader.offset, false);
headReader.pageID = headWriter.firstPageIDWritten;
headReader.offset = 0;
headReader.page.clear();
}
// Update headReader's end page to the new tail page
headReader.endPageID = tailWriter.pageID;
// Reset the write cursors
debug_printf("FIFOQueue(%s) Reset tailWriter cursor. tailPageNewExtent: %d\n", name.c_str(), tailPageNewExtent);
tailWriter.init(this,
Cursor::WRITE,
tailWriter.pageID,
initTailWriter /*false*/,
tailPageNewExtent,
invalidPhysicalPageID,
0,
prevExtentEndPageID);
headWriter.init(this, Cursor::WRITE);
debug_printf("FIFOQueue(%s) finishFlush end\n", name.c_str());
}
ACTOR static Future<Void> flush_impl(FIFOQueue* self) {
loop {
bool notDone = wait(self->preFlush());
if (!notDone) {
break;
}
}
self->finishFlush();
return Void();
}
Future<Void> flush() { return flush_impl(this); }
IPager2* pager;
QueueID queueID;
Future<Void> pagerError;
int64_t numPages;
int64_t numEntries;
int pagesPerExtent;
bool usesExtents;
bool tailPageNewExtent;
PhysicalPageID prevExtentEndPageID;
Cursor headReader;
Cursor tailWriter;
Cursor headWriter;
Future<PhysicalPageID> newTailPage;
// For debugging
std::string name;
void toTraceEvent(TraceEvent& e, const char* prefix) const {
e.detail(format("%sRecords", prefix), numEntries);
e.detail(format("%sPages", prefix), numPages);
e.detail(format("%sRecordsPerPage", prefix), numPages > 0 ? (double)numEntries / numPages : 0);
}
};
int nextPowerOf2(uint32_t x) {
return 1 << (32 - clz(x - 1));
}
struct RedwoodMetrics {
constexpr static unsigned int btreeLevels = 5;
static int maxRecordCount;
struct EventReasonsArray {
unsigned int eventReasons[(size_t)PagerEvents::MAXEVENTS][(size_t)PagerEventReasons::MAXEVENTREASONS];
EventReasonsArray() { clear(); }
void clear() { memset(eventReasons, 0, sizeof(eventReasons)); }
void addEventReason(PagerEvents event, PagerEventReasons reason) {
eventReasons[(size_t)event][(size_t)reason] += 1;
}
unsigned int getEventReason(PagerEvents event, PagerEventReasons reason) const {
return eventReasons[(size_t)event][(size_t)reason];
}
std::string toString(unsigned int level, double elapsed) const {
std::string result;
const auto& pairs = (level == 0 ? L0PossibleEventReasonPairs : possibleEventReasonPairs);
PagerEvents prevEvent = pairs.front().first;
std::string lineStart = (level == 0) ? "" : "\t";
for (const auto& p : pairs) {
if (p.first != prevEvent) {
result += "\n";
result += lineStart;
}
std::string name =
format("%s%s", PagerEventsStrings[(int)p.first], PagerEventReasonsStrings[(int)p.second]);
int count = getEventReason(p.first, p.second);
result += format("%-15s %8u %8u/s ", name.c_str(), count, int(count / elapsed));
prevEvent = p.first;
}
return result;
}
void toTraceEvent(TraceEvent* t, unsigned int level) const {
const auto& pairs = (level == 0 ? L0PossibleEventReasonPairs : possibleEventReasonPairs);
for (const auto& p : pairs) {
std::string name =
format(level == 0 ? "" : "L%d", level) +
format("%s%s", PagerEventsStrings[(int)p.first], PagerEventReasonsStrings[(int)p.second]);
int count = getEventReason(p.first, p.second);
t->detail(std::move(name), count);
}
}
};
// Metrics by level
struct Level {
struct Counters {
unsigned int pageRead;
unsigned int pageReadExt;
unsigned int pageBuild;
unsigned int pageBuildExt;
unsigned int pageCommitStart;
unsigned int pageModify;
unsigned int pageModifyExt;
unsigned int lazyClearRequeue;
unsigned int lazyClearRequeueExt;
unsigned int lazyClearFree;
unsigned int lazyClearFreeExt;
unsigned int forceUpdate;
unsigned int detachChild;
EventReasonsArray events;
};
Counters metrics;
Reference<Histogram> buildFillPctSketch;
Reference<Histogram> modifyFillPctSketch;
Reference<Histogram> buildStoredPctSketch;
Reference<Histogram> modifyStoredPctSketch;
Reference<Histogram> buildItemCountSketch;
Reference<Histogram> modifyItemCountSketch;
Level() { metrics = {}; }
void clear() { metrics = {}; }
};
struct metrics {
unsigned int opSet;
unsigned int opSetKeyBytes;
unsigned int opSetValueBytes;
unsigned int opClear;
unsigned int opClearKey;
unsigned int opCommit;
unsigned int opGet;
unsigned int opGetRange;
unsigned int pagerDiskWrite;
unsigned int pagerDiskRead;
unsigned int pagerRemapFree;
unsigned int pagerRemapCopy;
unsigned int pagerRemapSkip;
unsigned int pagerCacheHit;
unsigned int pagerCacheMiss;
unsigned int pagerProbeHit;
unsigned int pagerProbeMiss;
unsigned int pagerEvictUnhit;
unsigned int pagerEvictFail;
unsigned int btreeLeafPreload;
unsigned int btreeLeafPreloadExt;
};
RedwoodMetrics() {
// All histograms have reset their buckets to 0 in the constructor.
kvSizeWritten = Reference<Histogram>(
new Histogram(Reference<HistogramRegistry>(), "kvSize", "Written", Histogram::Unit::bytes));
kvSizeReadByGet = Reference<Histogram>(
new Histogram(Reference<HistogramRegistry>(), "kvSize", "ReadByGet", Histogram::Unit::bytes));
kvSizeReadByGetRange = Reference<Histogram>(
new Histogram(Reference<HistogramRegistry>(), "kvSize", "ReadByGetRange", Histogram::Unit::bytes));
ioLock = nullptr;
// These histograms are used for Btree events, hence level > 0
unsigned int levelCounter = 0;
for (RedwoodMetrics::Level& level : levels) {
if (levelCounter > 0) {
std::string levelString = "L" + std::to_string(levelCounter);
level.buildFillPctSketch = Reference<Histogram>(new Histogram(
Reference<HistogramRegistry>(), "buildFillPct", levelString, Histogram::Unit::percentageLinear));
level.modifyFillPctSketch = Reference<Histogram>(new Histogram(
Reference<HistogramRegistry>(), "modifyFillPct", levelString, Histogram::Unit::percentageLinear));
level.buildStoredPctSketch = Reference<Histogram>(new Histogram(
Reference<HistogramRegistry>(), "buildStoredPct", levelString, Histogram::Unit::percentageLinear));
level.modifyStoredPctSketch = Reference<Histogram>(new Histogram(
Reference<HistogramRegistry>(), "modifyStoredPct", levelString, Histogram::Unit::percentageLinear));
level.buildItemCountSketch = Reference<Histogram>(new Histogram(Reference<HistogramRegistry>(),
"buildItemCount",
levelString,
Histogram::Unit::countLinear,
0,
maxRecordCount));
level.modifyItemCountSketch = Reference<Histogram>(new Histogram(Reference<HistogramRegistry>(),
"modifyItemCount",
levelString,
Histogram::Unit::countLinear,
0,
maxRecordCount));
}
++levelCounter;
}
clear();
}
void clear() {
for (RedwoodMetrics::Level& level : levels) {
level.clear();
}
metric = {};
startTime = g_network ? now() : 0;
}
// btree levels and one extra level for non btree level.
Level levels[btreeLevels + 1];
metrics metric;
// pointer to the priority multi lock used in pager
PriorityMultiLock* ioLock;
Reference<Histogram> kvSizeWritten;
Reference<Histogram> kvSizeReadByGet;
Reference<Histogram> kvSizeReadByGetRange;
double startTime;
// Return number of pages read or written, from cache or disk
unsigned int pageOps() const {
// All page reads are either a cache hit, probe hit, or a disk read
return metric.pagerDiskWrite + metric.pagerDiskRead + metric.pagerCacheHit + metric.pagerProbeHit;
}
Level& level(unsigned int level) {
// Valid levels are from 0 - btreeLevels
// Level 0 is for operations that are not BTree level specific, as many of the metrics are the same
// Level 0 - btreeLevels correspond to BTree node height, however heights above btreeLevels are combined
// into the level at btreeLevels
return levels[std::min(level, btreeLevels)];
}
void updateMaxRecordCount(int maxRecords) {
if (maxRecordCount != maxRecords) {
maxRecordCount = maxRecords;
for (int i = 1; i <= btreeLevels; ++i) {
auto& level = levels[i];
level.buildItemCountSketch->updateUpperBound(maxRecordCount);
level.modifyItemCountSketch->updateUpperBound(maxRecordCount);
}
}
}
void logHistograms(double elapsed) {
// All histograms have reset their buckets to 0 after writeToLog.
kvSizeWritten->writeToLog(elapsed);
kvSizeReadByGet->writeToLog(elapsed);
kvSizeReadByGetRange->writeToLog(elapsed);
unsigned int levelCounter = 0;
for (RedwoodMetrics::Level& level : levels) {
if (levelCounter > 0) {
level.buildFillPctSketch->writeToLog(elapsed);
level.modifyFillPctSketch->writeToLog(elapsed);
level.buildStoredPctSketch->writeToLog(elapsed);
level.modifyStoredPctSketch->writeToLog(elapsed);
level.buildItemCountSketch->writeToLog(elapsed);
level.modifyItemCountSketch->writeToLog(elapsed);
}
++levelCounter;
}
}
// This will populate a trace event and/or a string with Redwood metrics.
// The string is a reasonably well formatted page of information
void getFields(TraceEvent* e, std::string* s = nullptr, bool skipZeroes = false);
void getIOLockFields(TraceEvent* e, std::string* s = nullptr);
std::string toString(bool clearAfter) {
std::string s;
getFields(nullptr, &s);
getIOLockFields(nullptr, &s);
if (clearAfter) {
clear();
}
return s;
}
};
// Using a global for Redwood metrics because a single process shouldn't normally have multiple storage engines
int RedwoodMetrics::maxRecordCount = 315;
RedwoodMetrics g_redwoodMetrics = {};
Future<Void> g_redwoodMetricsActor;
ACTOR Future<Void> redwoodHistogramsLogger(double interval) {
state double currTime;
loop {
currTime = now();
wait(delay(interval));
double elapsed = now() - currTime;
g_redwoodMetrics.logHistograms(elapsed);
}
}
ACTOR Future<Void> redwoodMetricsLogger() {
g_redwoodMetrics.clear();
state Future<Void> loggingFuture = redwoodHistogramsLogger(SERVER_KNOBS->REDWOOD_HISTOGRAM_INTERVAL);
loop {
wait(delay(SERVER_KNOBS->REDWOOD_METRICS_INTERVAL));
TraceEvent e("RedwoodMetrics");
double elapsed = now() - g_redwoodMetrics.startTime;
e.detail("Elapsed", elapsed);
g_redwoodMetrics.getFields(&e);
g_redwoodMetrics.getIOLockFields(&e);
g_redwoodMetrics.clear();
}
}
// Holds an index of recently used objects.
// ObjectType must have these methods
//
// // Returns true iff the entry can be evicted
// bool evictable() const;
//
// // Ready when object is safe to evict from cache
// Future<Void> onEvictable() const;
//
// // Ready when object destruction is safe
// // Should cancel pending async operations that are safe to cancel when cache is being destroyed
// Future<Void> cancel() const;
template <class IndexType, class ObjectType>
class ObjectCache : NonCopyable {
struct Entry;
typedef std::unordered_map<IndexType, Entry> CacheT;
struct Entry : public boost::intrusive::list_base_hook<> {
Entry() : hits(0), size(0) {}
IndexType index;
ObjectType item;
int hits;
int size;
bool ownedByEvictor;
CacheT* pCache;
};
typedef boost::intrusive::list<Entry> EvictionOrderT;
public:
// Object evictor, manages the eviction order for one or more ObjectCaches
// Not all objects tracked by the Evictor are in its evictionOrder, as ObjectCaches
// using this Evictor can temporarily remove entries to an external order but they
// must eventually give them back with moveIn() or remove them with reclaim().
class Evictor : NonCopyable {
public:
Evictor(int64_t sizeLimit = 0) : sizeLimit(sizeLimit) {}
// Evictors are normally singletons, either one per real process or one per virtual process in simulation
static Evictor* getEvictor() {
static Evictor nonSimEvictor;
static std::map<NetworkAddress, Evictor> simEvictors;
if (g_network->isSimulated()) {
return &simEvictors[g_network->getLocalAddress()];
} else {
return &nonSimEvictor;
}
}
// Move an entry to a different eviction order, stored outside of the Evictor,
// but the entry size is still counted against the evictor
void moveOut(Entry& e, EvictionOrderT& dest) {
ASSERT(e.ownedByEvictor);
dest.splice(dest.end(), evictionOrder, EvictionOrderT::s_iterator_to(e));
e.ownedByEvictor = false;
++movedOutCount;
}
// Move an entry to the back of the eviction order if it is in the eviction order
void moveToBack(Entry& e) {
ASSERT(e.ownedByEvictor);
evictionOrder.splice(evictionOrder.end(), evictionOrder, EvictionOrderT::s_iterator_to(e));
}
// Move entire contents of an external eviction order containing entries whose size is part of
// this Evictor to the front of its eviction order.
void moveIn(EvictionOrderT& otherOrder) {
for (auto& e : otherOrder) {
ASSERT(!e.ownedByEvictor);
e.ownedByEvictor = true;
--movedOutCount;
}
evictionOrder.splice(evictionOrder.begin(), otherOrder);
}
// Add a new item to the back of the eviction order
void addNew(Entry& e) {
sizeUsed += e.size;
evictionOrder.push_back(e);
e.ownedByEvictor = true;
}
// Claim ownership of an entry, removing its size from the current size and removing it
// from the eviction order if it exists there
void reclaim(Entry& e) {
sizeUsed -= e.size;
// If e is in evictionOrder then remove it
if (e.ownedByEvictor) {
evictionOrder.erase(EvictionOrderT::s_iterator_to(e));
e.ownedByEvictor = false;
} else {
// Otherwise, it wasn't so it had to be a movedOut item so decrement the count
--movedOutCount;
}
}
void trim(int additionalSpaceNeeded = 0) {
int attemptsLeft = FLOW_KNOBS->MAX_EVICT_ATTEMPTS;
// While the cache is too big, evict the oldest entry until the oldest entry can't be evicted.
while (attemptsLeft-- > 0 && sizeUsed > (sizeLimit - reservedSize - additionalSpaceNeeded) &&
!evictionOrder.empty()) {
Entry& toEvict = evictionOrder.front();
debug_printf("Evictor count=%d sizeUsed=%" PRId64 " sizeLimit=%" PRId64 " sizePenalty=%" PRId64
" needed=%d Trying to evict %s evictable %d\n",
(int)evictionOrder.size(),
sizeUsed,
sizeLimit,
reservedSize,
additionalSpaceNeeded,
::toString(toEvict.index).c_str(),
toEvict.item.evictable());
if (!toEvict.item.evictable()) {
// shift the front to the back
evictionOrder.shift_forward(1);
++g_redwoodMetrics.metric.pagerEvictFail;
break;
} else {
if (toEvict.hits == 0) {
++g_redwoodMetrics.metric.pagerEvictUnhit;
}
sizeUsed -= toEvict.size;
debug_printf("Evicting %s\n", ::toString(toEvict.index).c_str());
evictionOrder.pop_front();
toEvict.pCache->erase(toEvict.index);
}
}
}
int64_t getCountUsed() const { return evictionOrder.size() + movedOutCount; }
int64_t getCountMoved() const { return movedOutCount; }
int64_t getSizeUsed() const { return sizeUsed + reservedSize; }
// Only to be used in tests at a point where all ObjectCache instances should be destroyed.
bool empty() const { return reservedSize == 0 && sizeUsed == 0 && getCountUsed() == 0; }
std::string toString() const {
std::string s = format("Evictor {sizeLimit=%" PRId64 " sizeUsed=%" PRId64 " countUsed=%" PRId64
" sizePenalty=%" PRId64 " movedOutCount=%" PRId64,
sizeLimit,
sizeUsed,
getCountUsed(),
reservedSize,
movedOutCount);
for (auto& entry : evictionOrder) {
s += format("\n\tindex %s size %d evictable %d\n",
::toString(entry.index).c_str(),
entry.size,
entry.item.evictable());
}
s += "}\n";
return s;
}
// Any external data strutures whose memory usage should be counted as part of the object cache
// budget should add their usage to this total and keep it updated.
int64_t reservedSize = 0;
int64_t sizeLimit;
private:
EvictionOrderT evictionOrder;
// Size of all entries in the eviction order or held in external eviction orders
int64_t sizeUsed = 0;
// Number of items that have been moveOut()'d to other evictionOrders and aren't back yet
int64_t movedOutCount = 0;
};
ObjectCache(Evictor* evictor = nullptr) : pEvictor(evictor) {
if (pEvictor == nullptr) {
pEvictor = Evictor::getEvictor();
}
}
Evictor& evictor() const { return *pEvictor; }
int64_t getCount() const { return cache.size(); }
void reserveCount(int count) { cache.reserve(count); }
// Get the object for i if it exists, else return nullptr.
// If the object exists, its eviction order will NOT change as this is not a cache hit.
ObjectType* getIfExists(const IndexType& index) {
auto i = cache.find(index);
if (i != cache.end()) {
++i->second.hits;
return &i->second.item;
}
return nullptr;
}
// If index is in cache and not on the prioritized eviction order list, move it there.
void prioritizeEviction(const IndexType& index) {
auto i = cache.find(index);
if (i != cache.end() && i->second.ownedByEvictor) {
pEvictor->moveOut(i->second, prioritizedEvictions);
}
}
// Get the object for i or create a new one.
// After a get(), the object for i is the last in evictionOrder.
// If noHit is set, do not consider this access to be cache hit if the object is present
// If noMiss is set, do not consider this access to be a cache miss if the object is not present
ObjectType& get(const IndexType& index, int size, bool noHit = false) {
Entry& entry = cache[index];
// If entry is linked into an evictionOrder
if (entry.is_linked()) {
// If this access is meant to be a hit
if (!noHit) {
++entry.hits;
// If item eviction is not prioritized, move to end of eviction order
if (entry.ownedByEvictor) {
pEvictor->moveToBack(entry);
}
}
} else {
// Otherwise it was a cache miss
// Finish initializing entry
entry.index = index;
entry.pCache = &cache;
entry.hits = 0;
entry.size = size;
pEvictor->trim(entry.size);
pEvictor->addNew(entry);
}
return entry.item;
}
// Clears the cache, saving the entries to second cache, then waits for each item to be evictable and evicts it.
ACTOR static Future<Void> clear_impl(ObjectCache* self, bool waitForSafeEviction) {
// Claim ownership of all of our cached items, removing them from the evictor's control and quota.
for (auto& ie : self->cache) {
self->pEvictor->reclaim(ie.second);
}
// All items are in the cache so we don't need the prioritized eviction order anymore, and the cache is about
// to be destroyed so the prioritizedEvictions head/tail will become invalid.
self->prioritizedEvictions.clear();
state typename CacheT::iterator i = self->cache.begin();
while (i != self->cache.end()) {
wait(waitForSafeEviction ? i->second.item.onEvictable() : i->second.item.cancel());
++i;
}
self->cache.clear();
return Void();
}
Future<Void> clear(bool waitForSafeEviction = false) { return clear_impl(this, waitForSafeEviction); }
// Move the prioritized evictions queued to the front of the eviction order
void flushPrioritizedEvictions() { pEvictor->moveIn(prioritizedEvictions); }
private:
Evictor* pEvictor;
CacheT cache;
EvictionOrderT prioritizedEvictions;
};
ACTOR template <class T>
Future<T> forwardError(Future<T> f, Promise<Void> target) {
try {
T x = wait(f);
return x;
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled && target.canBeSet()) {
target.sendError(e);
}
throw;
}
}
constexpr int initialVersion = invalidVersion;
class DWALPagerSnapshot;
// An implementation of IPager2 that supports atomicUpdate() of a page without forcing a change to new page ID.
// It does this internally mapping the original page ID to alternate page IDs by write version.
// The page id remaps are kept in memory and also logged to a "remap queue" which must be reloaded on cold start.
// To prevent the set of remaps from growing unboundedly, once a remap is old enough to be at or before the
// oldest pager version being maintained the remap can be "undone" by popping it from the remap queue,
// copying the alternate page ID's data over top of the original page ID's data, and deleting the remap from memory.
// This process basically describes a "Delayed" Write-Ahead-Log (DWAL) because the remap queue and the newly allocated
// alternate pages it references basically serve as a write ahead log for pages that will eventually be copied
// back to their original location once the original version is no longer needed.
class DWALPager final : public IPager2 {
public:
typedef FIFOQueue<LogicalPageID> LogicalPageQueueT;
typedef std::map<Version, LogicalPageID> VersionToPageMapT;
typedef std::unordered_map<LogicalPageID, VersionToPageMapT> PageToVersionedMapT;
struct PageCacheEntry {
Future<Reference<ArenaPage>> readFuture;
Future<Void> writeFuture;
bool initialized() const { return readFuture.isValid(); }
bool reading() const { return !readFuture.isReady(); }
bool writing() const { return !writeFuture.isReady(); }
bool evictable() const {
// Don't evict if a page is still being read or written
return !reading() && !writing();
}
// Entry is evictable when its write and read futures are ready, even if they are
// errors, so any buffers they hold are no longer needed by the underlying file actors
Future<Void> onEvictable() const { return ready(readFuture) && ready(writeFuture); }
// Read and write futures are safe to cancel so just cancel them and return
Future<Void> cancel() {
writeFuture.cancel();
readFuture.cancel();
return Void();
}
};
typedef ObjectCache<LogicalPageID, PageCacheEntry> PageCacheT;
int64_t* getPageCachePenaltySource() override { return &pageCache.evictor().reservedSize; }
constexpr static PhysicalPageID primaryHeaderPageID = 0;
constexpr static PhysicalPageID backupHeaderPageID = 1;
#pragma pack(push, 1)
struct DelayedFreePage {
Version version;
LogicalPageID pageID;
bool operator<(const DelayedFreePage& rhs) const { return version < rhs.version; }
std::string toString() const {
return format("DelayedFreePage{%s @%" PRId64 "}", ::toString(pageID).c_str(), version);
}
};
struct RemappedPage {
enum Type { NONE = 'N', REMAP = 'R', FREE = 'F', DETACH = 'D' };
RemappedPage(Version v = invalidVersion,
LogicalPageID o = invalidLogicalPageID,
LogicalPageID n = invalidLogicalPageID)
: version(v), originalPageID(o), newPageID(n) {}
Version version;
LogicalPageID originalPageID;
LogicalPageID newPageID;
static Type getTypeOf(LogicalPageID newPageID) {
if (newPageID == invalidLogicalPageID) {
return FREE;
}
if (newPageID == 0) {
return DETACH;
}
return REMAP;
}
Type getType() const { return getTypeOf(newPageID); }
bool operator<(const RemappedPage& rhs) const { return version < rhs.version; }
std::string toString() const {
return format("RemappedPage(%c: %s -> %s %s}",
getType(),
::toString(originalPageID).c_str(),
::toString(newPageID).c_str(),
::toString(version).c_str());
}
};
struct ExtentUsedListEntry {
QueueID queueID;
LogicalPageID extentID;
bool operator<(const ExtentUsedListEntry& rhs) const { return queueID < rhs.queueID; }
std::string toString() const {
return format("ExtentUsedListEntry{%s @%s}", ::toString(extentID).c_str(), ::toString(queueID).c_str());
}
};
#pragma pack(pop)
typedef FIFOQueue<DelayedFreePage> DelayedFreePageQueueT;
typedef FIFOQueue<RemappedPage> RemapQueueT;
typedef FIFOQueue<ExtentUsedListEntry> ExtentUsedListQueueT;
// If the file already exists, pageSize might be different than desiredPageSize
// Use pageCacheSizeBytes == 0 to use default from flow knobs
// If memoryOnly is true, the pager will exist only in memory and once the cache is full writes will fail.
// Note that ownership is not taken for keyProvider and it must outlive the pager
DWALPager(int desiredPageSize,
int desiredExtentSize,
std::string filename,
int64_t pageCacheSizeBytes,
int64_t remapCleanupWindowBytes,
int concurrentExtentReads,
bool memoryOnly,
Reference<IPageEncryptionKeyProvider> keyProvider,
Promise<Void> errorPromise = {})
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS),
pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize),
filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise),
remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) {
if (!keyProvider) {
keyProvider = makeReference<NullKeyProvider>();
}
// This sets the page cache size for all PageCacheT instances using the same evictor
pageCache.evictor().sizeLimit = pageCacheBytes;
g_redwoodMetrics.ioLock = &ioLock;
if (!g_redwoodMetricsActor.isValid()) {
g_redwoodMetricsActor = redwoodMetricsLogger();
}
commitFuture = Void();
recoverFuture = forwardError(recover(this), errorPromise);
}
std::string getName() const override { return filename; }
void setPageSize(int size) {
// Conservative maximum for number of records that can fit in this page size
g_redwoodMetrics.updateMaxRecordCount(315.0 * size / 4096);
logicalPageSize = size;
// Physical page size is the total size of the smallest number of physical blocks needed to store
// logicalPageSize bytes
int blocks = 1 + ((logicalPageSize - 1) / smallestPhysicalBlock);
physicalPageSize = blocks * smallestPhysicalBlock;
header.pageSize = logicalPageSize;
}
void setExtentSize(int size) {
// if the specified extent size is smaller than the physical page size, round it off to one physical page size
// physical extent size has to be a multiple of physical page size
if (size <= physicalPageSize) {
pagesPerExtent = 1;
} else {
pagesPerExtent = 1 + ((size - 1) / physicalPageSize);
}
physicalExtentSize = pagesPerExtent * physicalPageSize;
header.extentSize = size;
}
void updateHeaderPage() {
Value serializedHeader = ObjectWriter::toValue(header, Unversioned());
ASSERT(serializedHeader.size() <= headerPage->dataSize());
serializedHeader.copyTo(headerPage->mutateData());
// Set remaining header bytes to \xff
memset(
headerPage->mutateData() + serializedHeader.size(), 0xff, headerPage->dataSize() - serializedHeader.size());
}
void updateLastCommittedHeader() {
lastCommittedHeaderPage = headerPage->clone();
lastCommittedHeader = header;
}
ACTOR static Future<Void> recover(DWALPager* self) {
ASSERT(!self->recoverFuture.isValid());
state bool exists = false;
if (!self->memoryOnly) {
int64_t flags = IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_UNBUFFERED | IAsyncFile::OPEN_READWRITE |
IAsyncFile::OPEN_LOCK;
exists = fileExists(self->filename);
if (!exists) {
flags |= IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE;
}
wait(store(self->pageFile, IAsyncFileSystem::filesystem()->open(self->filename, flags, 0644)));
}
// Header page is always treated as having a page size of smallestPhysicalBlock
self->setPageSize(smallestPhysicalBlock);
state int64_t fileSize = 0;
if (exists) {
wait(store(fileSize, self->pageFile->size()));
}
self->fileExtension = Void();
debug_printf(
"DWALPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, fileSize);
// TODO: If the file exists but appears to never have been successfully committed is this an error or
// should recovery proceed with a new pager instance?
// If there are at least 2 pages then try to recover the existing file
if (exists && fileSize >= (self->smallestPhysicalBlock * 2)) {
debug_printf("DWALPager(%s) recovering using existing file\n", self->filename.c_str());
state bool recoveredBackupHeader = false;
// Try to read primary header
try {
wait(store(self->headerPage, self->readHeaderPage(primaryHeaderPageID)));
} catch (Error& e) {
debug_printf("DWALPager(%s) Primary header read failed with %s\n", self->filename.c_str(), e.what());
// Errors that can be caused by a corrupted and unsync'd write to the header page can be ignored and the
// committed, sync'd backup header will be tried. Notably, page_header_wrong_page_id is not included in
// this list because it means the header checksum passed but this page data is not meant to be at this
// location, which is a very serious error so recovery should not continue.
bool tryBackupHeader =
(e.code() == error_code_page_header_version_not_supported ||
e.code() == error_code_page_encoding_not_supported ||
e.code() == error_code_page_decoding_failed || e.code() == error_code_page_header_checksum_failed);
TraceEvent(SevWarn, "RedwoodRecoveryErrorPrimaryHeaderFailed")
.errorUnsuppressed(e)
.detail("Filename", self->filename)
.detail("PageID", primaryHeaderPageID)
.detail("TryingBackupHeader", tryBackupHeader);
// Don't throw if trying backup header below
if (!tryBackupHeader) {
throw;
}
}
// If primary header wasn't valid, try backup header
if (!self->headerPage.isValid()) {
// Try to read backup header
try {
wait(store(self->headerPage, self->readHeaderPage(backupHeaderPageID)));
recoveredBackupHeader = true;
} catch (Error& e) {
debug_printf("DWALPager(%s) Backup header read failed with %s\n", self->filename.c_str(), e.what());
TraceEvent(SevWarn, "RedwoodRecoveryErrorBackupHeaderFailed")
.error(e)
.detail("Filename", self->filename)
.detail("PageID", backupHeaderPageID);
throw;
}
}
// Get header from the header page data
self->header =
ObjectReader::fromStringRef<PagerCommitHeader>(self->headerPage->dataAsStringRef(), Unversioned());
if (self->header.formatVersion != PagerCommitHeader::FORMAT_VERSION) {
Error e = unsupported_format_version();
TraceEvent(SevWarnAlways, "RedwoodRecoveryFailedWrongVersion")
.error(e)
.detail("Filename", self->filename)
.detail("Version", self->header.formatVersion)
.detail("ExpectedVersion", PagerCommitHeader::FORMAT_VERSION);
throw e;
}
self->setPageSize(self->header.pageSize);
self->filePageCount = fileSize / self->physicalPageSize;
self->filePageCountPending = self->filePageCount;
if (self->logicalPageSize != self->desiredPageSize) {
TraceEvent(SevWarnAlways, "RedwoodPageSizeMismatch")
.detail("InstanceName", self->getName())
.detail("ExistingPageSize", self->logicalPageSize)
.detail("DesiredPageSize", self->desiredPageSize);
}
self->setExtentSize(self->header.extentSize);
self->freeList.recover(self, self->header.freeList, "FreeListRecovered");
self->extentFreeList.recover(self, self->header.extentFreeList, "ExtentFreeListRecovered");
self->delayedFreeList.recover(self, self->header.delayedFreeList, "DelayedFreeListRecovered");
self->extentUsedList.recover(self, self->header.extentUsedList, "ExtentUsedListRecovered");
self->remapQueue.recover(self, self->header.remapQueue, "RemapQueueRecovered");
debug_printf("DWALPager(%s) Queue recovery complete.\n", self->filename.c_str());
// remapQueue entries are recovered using a fast path reading extents at a time
// we first issue disk reads for remapQueue extents obtained from extentUsedList
Standalone<VectorRef<ExtentUsedListEntry>> extents = wait(self->extentUsedList.peekAll());
debug_printf("DWALPager(%s) ExtentUsedList size: %d.\n", self->filename.c_str(), extents.size());
if (extents.size() > 1) {
QueueID remapQueueID = self->remapQueue.queueID;
for (int i = 1; i < extents.size() - 1; i++) {
if (extents[i].queueID == remapQueueID) {
LogicalPageID extID = extents[i].extentID;
debug_printf("DWALPager Extents: ID: %s ", toString(extID).c_str());
self->readExtent(extID);
}
}
}
// And here we consume results of the disk reads and populate the remappedPages map
// Using a promiseStream for the peeked results ensures that we use the CPU to populate the map
// and the disk concurrently
state PromiseStream<Standalone<VectorRef<RemappedPage>>> remapStream;
state Future<Void> remapRecoverActor;
remapRecoverActor = self->remapQueue.peekAllExt(remapStream);
try {
loop choose {
when(Standalone<VectorRef<RemappedPage>> remaps = waitNext(remapStream.getFuture())) {
debug_printf("DWALPager(%s) recovery. remaps size: %d, queueEntries: %" PRId64 "\n",
self->filename.c_str(),
remaps.size(),
self->remapQueue.numEntries);
for (auto& r : remaps) {
self->remappedPages[r.originalPageID][r.version] = r.newPageID;
}
}
when(wait(remapRecoverActor)) { remapRecoverActor = Never(); }
}
} catch (Error& e) {
if (e.code() != error_code_end_of_stream) {
throw;
}
}
debug_printf("DWALPager(%s) recovery complete. RemappedPagesMap: %s\n",
self->filename.c_str(),
toString(self->remappedPages).c_str());
debug_printf("DWALPager(%s) recovery complete. destroy extent cache\n", self->filename.c_str());
wait(self->extentCache.clear());
// If the header was recovered from the backup at Page 1 then write and sync it to Page 0 before continuing.
// If this fails, the backup header is still in tact for the next recovery attempt.
if (recoveredBackupHeader) {
// Write the header to page 0
wait(self->writeHeaderPage(primaryHeaderPageID, self->headerPage));
// Wait for all outstanding writes to complete
wait(waitForAll(self->operations));
self->operations.clear();
// Sync header
wait(self->pageFile->sync());
debug_printf("DWALPager(%s) Header recovery complete.\n", self->filename.c_str());
}
// Update the last committed header with the one that was recovered (which is the last known committed
// header)
self->updateLastCommittedHeader();
self->addSnapshot(self->header.committedVersion, self->header.userCommitRecord);
// Reset the remapQueue head reader for normal reads
self->remapQueue.resetHeadReader();
self->remapCleanupFuture = remapCleanup(self);
} else {
// Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully
// committed. A new pager will be created in its place.
// TODO: Is the right behavior?
exists = false;
debug_printf("DWALPager(%s) creating new pager\n", self->filename.c_str());
self->headerPage = self->newPageBuffer();
self->headerPage->init(EncodingType::XXHash64, PageType::HeaderPage, 0);
// Now that the header page has been allocated, set page size to desired
self->setPageSize(self->desiredPageSize);
self->filePageCount = 0;
self->filePageCountPending = 0;
// Now set the extent size, do this always after setting the page size as
// extent size is a multiple of page size
self->setExtentSize(self->desiredExtentSize);
// Write new header using desiredPageSize
self->header.formatVersion = PagerCommitHeader::FORMAT_VERSION;
self->header.committedVersion = initialVersion;
self->header.oldestVersion = initialVersion;
// There are 2 reserved pages:
// Page 0 - header
// Page 1 - header backup
self->header.pageCount = 2;
// Create queues
self->header.queueCount = 0;
self->freeList.create(self, self->newLastPageID(), "FreeList", self->newLastQueueID(), false);
self->delayedFreeList.create(self, self->newLastPageID(), "DelayedFreeList", self->newLastQueueID(), false);
self->extentFreeList.create(self, self->newLastPageID(), "ExtentFreeList", self->newLastQueueID(), false);
self->extentUsedList.create(self, self->newLastPageID(), "ExtentUsedList", self->newLastQueueID(), false);
LogicalPageID extID = self->newLastExtentID();
self->remapQueue.create(self, extID, "RemapQueue", self->newLastQueueID(), true);
self->extentUsedList.pushBack({ self->remapQueue.queueID, extID });
// The first commit() below will flush the queues and update the queue states in the header,
// but since the queues will not be used between now and then their states will not change.
// In order to populate lastCommittedHeader, update the header now with the queue states.
self->header.freeList = self->freeList.getState();
self->header.delayedFreeList = self->delayedFreeList.getState();
self->header.extentFreeList = self->extentFreeList.getState();
self->header.extentUsedList = self->extentUsedList.getState();
self->header.remapQueue = self->remapQueue.getState();
// Serialize header to header page
self->updateHeaderPage();
// There is no previously committed header, but the current header state is sufficient to use as the backup
// header for the next commit, which if recovered would result in a valid empty pager at version 0.
self->updateLastCommittedHeader();
self->addSnapshot(initialVersion, KeyRef());
self->remapCleanupFuture = Void();
}
if (!self->memoryOnly) {
wait(store(fileSize, self->pageFile->size()));
}
TraceEvent e(SevInfo, "RedwoodRecoveredPager");
e.detail("OpenedExisting", exists);
self->toTraceEvent(e);
e.log();
self->recoveryVersion = self->header.committedVersion;
debug_printf("DWALPager(%s) recovered. recoveryVersion=%" PRId64 " oldestVersion=%" PRId64
" logicalPageSize=%d physicalPageSize=%d headerPageCount=%" PRId64 " filePageCount=%" PRId64 "\n",
self->filename.c_str(),
self->recoveryVersion,
self->header.oldestVersion,
self->logicalPageSize,
self->physicalPageSize,
self->header.pageCount,
self->filePageCount);
return Void();
}
void toTraceEvent(TraceEvent& e) const override {
e.detail("FileName", filename.c_str());
e.detail("LogicalFileSize", header.pageCount * physicalPageSize);
e.detail("PhysicalFileSize", filePageCountPending * physicalPageSize);
e.detail("CommittedVersion", header.committedVersion);
e.detail("LogicalPageSize", logicalPageSize);
e.detail("PhysicalPageSize", physicalPageSize);
remapQueue.toTraceEvent(e, "RemapQueue");
delayedFreeList.toTraceEvent(e, "FreeQueue");
freeList.toTraceEvent(e, "DelayedFreeQueue");
extentUsedList.toTraceEvent(e, "UsedExtentQueue");
extentFreeList.toTraceEvent(e, "FreeExtentQueue");
getStorageBytes().toTraceEvent(e);
}
ACTOR static void extentCacheClear_impl(DWALPager* self) { wait(self->extentCache.clear()); }
void extentCacheClear() override { extentCacheClear_impl(this); }
// get a list of used extents for a given extent based queue (for testing purpose)
ACTOR static Future<Standalone<VectorRef<LogicalPageID>>> getUsedExtents_impl(DWALPager* self, QueueID queueID) {
state Standalone<VectorRef<LogicalPageID>> extentIDs;
extentIDs.reserve(extentIDs.arena(),
self->extentUsedList.numEntries); // TODO this is overreserving. is that a problem?
Standalone<VectorRef<ExtentUsedListEntry>> extents = wait(self->extentUsedList.peekAll());
debug_printf("DWALPager(%s) ExtentUsedList size: %d.\n", self->filename.c_str(), extents.size());
if (extents.size() > 1) {
for (int i = 1; i < extents.size() - 1; i++) {
if (extents[i].queueID == queueID) {
LogicalPageID extID = extents[i].extentID;
debug_printf("DWALPager Extents: ID: %s ", toString(extID).c_str());
extentIDs.push_back(extentIDs.arena(), extID);
}
}
}
return extentIDs;
}
Future<Standalone<VectorRef<LogicalPageID>>> getUsedExtents(QueueID queueID) override {
return getUsedExtents_impl(this, queueID);
}
void pushExtentUsedList(QueueID queueID, LogicalPageID extID) override {
extentUsedList.pushBack({ queueID, extID });
}
// Allocate a new queueID
QueueID newLastQueueID() override { return header.queueCount++; }
Reference<ArenaPage> newPageBuffer(size_t blocks = 1) override {
return makeReference<ArenaPage>(logicalPageSize * blocks, physicalPageSize * blocks);
}
int getPhysicalPageSize() const override { return physicalPageSize; }
int getLogicalPageSize() const override { return logicalPageSize; }
int getPagesPerExtent() const override { return pagesPerExtent; }
// Get a new, previously available page ID. The page will be considered in-use after the next commit
// regardless of whether or not it was written to, until it is returned to the pager via freePage()
ACTOR static Future<LogicalPageID> newPageID_impl(DWALPager* self) {
// First try the free list
Optional<LogicalPageID> freePageID = wait(self->freeList.pop());
if (freePageID.present()) {
debug_printf("DWALPager(%s) newPageID() returning %s from free list\n",
self->filename.c_str(),
toString(freePageID.get()).c_str());
return freePageID.get();
}
// Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in
// the snapshots list
Optional<DelayedFreePage> delayedFreePageID =
wait(self->delayedFreeList.pop(DelayedFreePage{ self->effectiveOldestVersion(), 0 }));
if (delayedFreePageID.present()) {
debug_printf("DWALPager(%s) newPageID() returning %s from delayed free list\n",
self->filename.c_str(),
toString(delayedFreePageID.get()).c_str());
return delayedFreePageID.get().pageID;
}
// Lastly, add a new page to the pager
LogicalPageID id = self->newLastPageID();
debug_printf(
"DWALPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), toString(id).c_str());
return id;
};
// Grow the pager file by one page and return it
LogicalPageID newLastPageID() {
LogicalPageID id = header.pageCount;
growPager(1);
return id;
}
Future<LogicalPageID> newPageID() override { return newPageID_impl(this); }
void growPager(int64_t pages) { header.pageCount += pages; }
// Get a new, previously available extent and it's first page ID. The page will be considered in-use after the next
// commit regardless of whether or not it was written to, until it is returned to the pager via freePage()
ACTOR static Future<LogicalPageID> newExtentPageID_impl(DWALPager* self, QueueID queueID) {
// First try the free list
Optional<LogicalPageID> freeExtentID = wait(self->extentFreeList.pop());
if (freeExtentID.present()) {
debug_printf("DWALPager(%s) remapQueue newExtentPageID() returning %s from free list\n",
self->filename.c_str(),
toString(freeExtentID.get()).c_str());
self->extentUsedList.pushBack({ queueID, freeExtentID.get() });
self->extentUsedList.getState();
return freeExtentID.get();
}
// Lastly, add a new extent to the pager
LogicalPageID id = self->newLastExtentID();
debug_printf("DWALPager(%s) remapQueue newExtentPageID() returning %s at end of file\n",
self->filename.c_str(),
toString(id).c_str());
self->extentUsedList.pushBack({ queueID, id });
self->extentUsedList.getState();
return id;
}
// Grow the pager file by one extent and return it
// We reserve all the pageIDs within the extent during this step
// That translates to extentID being same as the return first pageID
LogicalPageID newLastExtentID() {
LogicalPageID id = header.pageCount;
growPager(pagesPerExtent);
return id;
}
Future<LogicalPageID> newExtentPageID(QueueID queueID) override { return newExtentPageID_impl(this, queueID); }
// Write one block of a page of a physical page in the page file. Futures returned must be allowed to complete.
ACTOR static UNCANCELLABLE Future<Void> writePhysicalBlock(DWALPager* self,
Reference<ArenaPage> page,
int blockNum,
int blockSize,
PhysicalPageID pageID,
PagerEventReasons reason,
unsigned int level,
bool header) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ? ioMaxPriority : ioMinPriority));
++g_redwoodMetrics.metric.pagerDiskWrite;
g_redwoodMetrics.level(level).metrics.events.addEventReason(PagerEvents::PageWrite, reason);
if (self->memoryOnly) {
return Void();
}
// If a truncation up to include pageID has not yet been completed
if (pageID >= self->filePageCount) {
// And no extension pending will include pageID
if (pageID >= self->filePageCountPending) {
// Update extension to a new one that waits on the old one and extends further
self->fileExtension = extendToCover(self, pageID, self->fileExtension);
}
// Wait for extension that covers pageID to complete;
wait(self->fileExtension);
}
// Note: Not using forwardError here so a write error won't be discovered until commit time.
debug_printf("DWALPager(%s) op=writeBlock %s\n", self->filename.c_str(), toString(pageID).c_str());
wait(self->pageFile->write(page->rawData() + (blockNum * blockSize), blockSize, (int64_t)pageID * blockSize));
// This next line could crash on shutdown because this actor can't be cancelled so self could be destroyed after
// write, so enable this line with caution when debugging.
// debug_printf("DWALPager(%s) op=writeBlockDone %s\n", self->filename.c_str(), toString(pageID).c_str());
return Void();
}
ACTOR static Future<Void> extendToCover(DWALPager* self, uint64_t pageID, Future<Void> previousExtension) {
// Calculate new page count, round up to nearest multiple of growth size > pageID
state int64_t newPageCount = pageID + SERVER_KNOBS->REDWOOD_PAGEFILE_GROWTH_SIZE_PAGES -
(pageID % SERVER_KNOBS->REDWOOD_PAGEFILE_GROWTH_SIZE_PAGES);
// Indicate that extension to this new count has been started
self->filePageCountPending = newPageCount;
// Wait for any previous extensions to complete
wait(previousExtension);
// Grow the file
wait(self->pageFile->truncate(newPageCount * self->physicalPageSize));
// Indicate that extension to the new count has been completed
self->filePageCount = newPageCount;
return Void();
}
// All returned futures are added to the operations vector
Future<Void> writePhysicalPage(PagerEventReasons reason,
unsigned int level,
Standalone<VectorRef<PhysicalPageID>> pageIDs,
Reference<ArenaPage> page,
bool header = false) {
debug_printf("DWALPager(%s) op=%s %s ptr=%p\n",
filename.c_str(),
(header ? "writePhysicalHeader" : "writePhysicalPage"),
toString(pageIDs).c_str(),
page->rawData());
// Set metadata before prewrite so it's in the pre-encrypted page in cache if the page is encrypted
// The actual next commit version is unknown, so the write version of a page is always the
// last committed version + 1
page->setWriteInfo(pageIDs.front(), this->getLastCommittedVersion() + 1);
// Copy the page if preWrite will encrypt/modify the payload
bool copy = page->isEncrypted();
if (copy) {
page = page->clone();
}
page->preWrite(pageIDs.front());
int blockSize = header ? smallestPhysicalBlock : physicalPageSize;
Future<Void> f;
if (pageIDs.size() == 1) {
f = writePhysicalBlock(this, page, 0, blockSize, pageIDs.front(), reason, level, header);
} else {
std::vector<Future<Void>> writers;
for (int i = 0; i < pageIDs.size(); ++i) {
Future<Void> p = writePhysicalBlock(this, page, i, blockSize, pageIDs[i], reason, level, header);
writers.push_back(p);
}
f = waitForAll(writers);
}
operations.push_back(f);
return f;
}
Future<Void> writeHeaderPage(PhysicalPageID pageID, Reference<ArenaPage> page) {
return writePhysicalPage(
PagerEventReasons::MetaData, nonBtreeLevel, VectorRef<PhysicalPageID>(&pageID, 1), page, true);
}
void updatePage(PagerEventReasons reason,
unsigned int level,
Standalone<VectorRef<LogicalPageID>> pageIDs,
Reference<ArenaPage> data) override {
// Get the cache entry for this page, without counting it as a cache hit as we're replacing its contents now
// or as a cache miss because there is no benefit to the page already being in cache
// Similarly, this does not count as a point lookup for reason.
ASSERT(pageIDs.front() != invalidLogicalPageID);
PageCacheEntry& cacheEntry = pageCache.get(pageIDs.front(), pageIDs.size() * physicalPageSize, true);
debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n",
filename.c_str(),
toString(pageIDs).c_str(),
cacheEntry.initialized(),
cacheEntry.initialized() && cacheEntry.reading(),
cacheEntry.initialized() && cacheEntry.writing());
// If the page is still being read then it's not also being written because a write places
// the new content into readFuture when the write is launched, not when it is completed.
// Read/write ordering is being enforced so waiting readers will not see the new write. This
// is necessary for remap erasure to work correctly since the oldest version of a page, located
// at the original page ID, could have a pending read when that version is expired (after which
// future reads of the version are not allowed) and the write of the next newest version over top
// of the original page begins.
if (!cacheEntry.initialized()) {
cacheEntry.writeFuture = detach(writePhysicalPage(reason, level, pageIDs, data));
} else if (cacheEntry.reading()) {
// This is very unlikely, maybe impossible in the current pager use cases
// Wait for the outstanding read to finish, then start the write
cacheEntry.writeFuture = mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
success(cacheEntry.readFuture), [=](Void) { return writePhysicalPage(reason, level, pageIDs, data); });
}
// If the page is being written, wait for this write before issuing the new write to ensure the
// writes happen in the correct order
else if (cacheEntry.writing()) {
// This is very unlikely, maybe impossible in the current pager use cases
// Wait for the previous write to finish, then start new write
cacheEntry.writeFuture = mapAsync<Void, std::function<Future<Void>(Void)>, Void>(
cacheEntry.writeFuture, [=](Void) { return writePhysicalPage(reason, level, pageIDs, data); });
} else {
cacheEntry.writeFuture = detach(writePhysicalPage(reason, level, pageIDs, data));
}
// Always update the page contents immediately regardless of what happened above.
cacheEntry.readFuture = data;
}
Future<LogicalPageID> atomicUpdatePage(PagerEventReasons reason,
unsigned int level,
LogicalPageID pageID,
Reference<ArenaPage> data,
Version v) override {
debug_printf("DWALPager(%s) op=writeAtomic %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v);
Future<LogicalPageID> f = map(newPageID(), [=](LogicalPageID newPageID) {
updatePage(reason, level, VectorRef<LogicalPageID>(&newPageID, 1), data);
// TODO: Possibly limit size of remap queue since it must be recovered on cold start
RemappedPage r{ v, pageID, newPageID };
remapQueue.pushBack(r);
auto& versionedMap = remappedPages[pageID];
if (SERVER_KNOBS->REDWOOD_EVICT_UPDATED_PAGES) {
// An update page is unlikely to have its old version read again soon, so prioritize its cache eviction
// If the versioned map is empty for this page then the prior version of the page is at stored at the
// PhysicalPageID pageID, otherwise it is the last mapped value in the version-ordered map.
pageCache.prioritizeEviction(versionedMap.empty() ? pageID : versionedMap.rbegin()->second);
}
versionedMap[v] = newPageID;
debug_printf("DWALPager(%s) pushed %s\n", filename.c_str(), RemappedPage(r).toString().c_str());
return pageID;
});
// No need for forwardError here because newPageID() is already wrapped in forwardError
return f;
}
// Free pageID as of version v. This means that once the oldest readable pager snapshot is at version v, pageID is
// not longer in use by any structure so it can be used to write new data.
void freeUnmappedPage(PhysicalPageID pageID, Version v) {
// If v is older than the oldest version still readable then mark pageID as free as of the next commit
if (v < effectiveOldestVersion()) {
debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n",
filename.c_str(),
toString(pageID).c_str(),
v,
lastCommittedHeader.oldestVersion);
freeList.pushBack(pageID);
} else {
// Otherwise add it to the delayed free list
debug_printf("DWALPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n",
filename.c_str(),
toString(pageID).c_str(),
v,
lastCommittedHeader.oldestVersion);
delayedFreeList.pushBack({ v, pageID });
}
// A freed page is unlikely to be read again soon so prioritize its cache eviction
if (SERVER_KNOBS->REDWOOD_EVICT_UPDATED_PAGES) {
pageCache.prioritizeEviction(pageID);
}
}
LogicalPageID detachRemappedPage(LogicalPageID pageID, Version v) override {
auto i = remappedPages.find(pageID);
if (i == remappedPages.end()) {
// Page is not remapped
return invalidLogicalPageID;
}
// Get the page that id was most recently remapped to
auto iLast = i->second.rbegin();
LogicalPageID newID = iLast->second;
ASSERT(RemappedPage::getTypeOf(newID) == RemappedPage::REMAP);
// If the last change remap was also at v then change the remap to a delete, as it's essentially
// the same as the original page being deleted at that version and newID being used from then on.
if (iLast->first == v) {
debug_printf("DWALPager(%s) op=detachDelete originalID=%s newID=%s @%" PRId64 " oldestVersion=%" PRId64
"\n",
filename.c_str(),
toString(pageID).c_str(),
toString(newID).c_str(),
v,
lastCommittedHeader.oldestVersion);
iLast->second = invalidLogicalPageID;
remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID });
} else {
debug_printf("DWALPager(%s) op=detach originalID=%s newID=%s @%" PRId64 " oldestVersion=%" PRId64 "\n",
filename.c_str(),
toString(pageID).c_str(),
toString(newID).c_str(),
v,
lastCommittedHeader.oldestVersion);
// Mark id as converted to its last remapped location as of v
i->second[v] = 0;
remapQueue.pushBack(RemappedPage{ v, pageID, 0 });
}
return newID;
}
void freePage(LogicalPageID pageID, Version v) override {
// If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone,
// so queue it for later deletion during remap cleanup
auto i = remappedPages.find(pageID);
if (i != remappedPages.end()) {
debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n",
filename.c_str(),
toString(pageID).c_str(),
v,
lastCommittedHeader.oldestVersion);
remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID });
// A freed page is unlikely to be read again soon so prioritize its cache eviction
if (SERVER_KNOBS->REDWOOD_EVICT_UPDATED_PAGES) {
PhysicalPageID previousPhysicalPage = i->second.rbegin()->second;
pageCache.prioritizeEviction(previousPhysicalPage);
}
i->second[v] = invalidLogicalPageID;
return;
}
freeUnmappedPage(pageID, v);
};
ACTOR static void freeExtent_impl(DWALPager* self, LogicalPageID pageID) {
self->extentFreeList.pushBack(pageID);
Optional<ExtentUsedListEntry> freeExtent = wait(self->extentUsedList.pop());
// Optional<LogicalPageID> freeExtentPageID = wait(self->extentUsedList.pop());
if (freeExtent.present()) {
debug_printf("DWALPager(%s) freeExtentPageID() popped %s from used list\n",
self->filename.c_str(),
toString(freeExtent.get().extentID).c_str());
}
}
void freeExtent(LogicalPageID pageID) override { freeExtent_impl(this, pageID); }
ACTOR static UNCANCELLABLE Future<int> readPhysicalBlock(DWALPager* self,
Reference<ArenaPage> pageBuffer,
int pageOffset,
int blockSize,
int64_t offset,
int priority) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority)));
++g_redwoodMetrics.metric.pagerDiskRead;
int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset));
return bytes;
}
// Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock.
// If the user chosen physical page size is larger, then there will be a gap of unused space after the header pages
// and before the user-chosen sized pages.
ACTOR static Future<Reference<ArenaPage>> readPhysicalPage(DWALPager* self,
PhysicalPageID pageID,
int priority,
bool header) {
ASSERT(!self->memoryOnly);
state Reference<ArenaPage> page =
header ? makeReference<ArenaPage>(smallestPhysicalBlock, smallestPhysicalBlock) : self->newPageBuffer();
debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p header=%d\n",
self->filename.c_str(),
toString(pageID).c_str(),
page->rawData(),
header);
int readBytes =
wait(readPhysicalBlock(self, page, 0, page->rawSize(), (int64_t)pageID * page->rawSize(), priority));
debug_printf("DWALPager(%s) op=readPhysicalDiskReadComplete %s ptr=%p bytes=%d\n",
self->filename.c_str(),
toString(pageID).c_str(),
page->rawData(),
readBytes);
try {
page->postReadHeader(pageID);
if (page->isEncrypted()) {
ArenaPage::EncryptionKey k = wait(self->keyProvider->getEncryptionKey(page->getEncodingHeader()));
page->encryptionKey = k;
}
page->postReadPayload(pageID);
debug_printf("DWALPager(%s) op=readPhysicalVerified %s ptr=%p\n",
self->filename.c_str(),
toString(pageID).c_str(),
page->rawData());
} catch (Error& e) {
Error err = e;
if (g_network->isSimulated() && g_simulator->checkInjectedCorruption()) {
err = err.asInjectedFault();
}
// For header pages, event is a warning because the primary header could be read after an unsync'd write,
// but no other page can.
TraceEvent(header ? SevWarnAlways : SevError, "RedwoodPageError")
.error(err)
.detail("Filename", self->filename.c_str())
.detail("PageID", pageID)
.detail("PageSize", self->physicalPageSize)
.detail("Offset", pageID * self->physicalPageSize);
debug_printf("DWALPager(%s) postread failed for %s with %s\n",
self->filename.c_str(),
toString(pageID).c_str(),
err.what());
throw err;
}
return page;
}
ACTOR static Future<Reference<ArenaPage>> readPhysicalMultiPage(DWALPager* self,
Standalone<VectorRef<PhysicalPageID>> pageIDs,
int priority) {
ASSERT(!self->memoryOnly);
// if (g_network->getCurrentTask() > TaskPriority::DiskRead) {
// wait(delay(0, TaskPriority::DiskRead));
// }
state Reference<ArenaPage> page = self->newPageBuffer(pageIDs.size());
debug_printf("DWALPager(%s) op=readPhysicalMultiStart %s ptr=%p\n",
self->filename.c_str(),
toString(pageIDs).c_str(),
page->rawData());
// TODO: Could a dispatched read try to write to page after it has been destroyed if this actor is cancelled?
state int blockSize = self->physicalPageSize;
std::vector<Future<int>> reads;
for (int i = 0; i < pageIDs.size(); ++i) {
reads.push_back(
readPhysicalBlock(self, page, i * blockSize, blockSize, ((int64_t)pageIDs[i]) * blockSize, priority));
}
// wait for all the parallel read futures
wait(waitForAll(reads));
debug_printf("DWALPager(%s) op=readPhysicalMultiDiskReadsComplete %s ptr=%p bytes=%d\n",
self->filename.c_str(),
toString(pageIDs).c_str(),
page->rawData(),
pageIDs.size() * blockSize);
try {
page->postReadHeader(pageIDs.front());
if (page->isEncrypted()) {
ArenaPage::EncryptionKey k = wait(self->keyProvider->getEncryptionKey(page->getEncodingHeader()));
page->encryptionKey = k;
}
page->postReadPayload(pageIDs.front());
debug_printf("DWALPager(%s) op=readPhysicalVerified %s ptr=%p bytes=%d\n",
self->filename.c_str(),
toString(pageIDs).c_str(),
page->rawData(),
pageIDs.size() * blockSize);
} catch (Error& e) {
// For header pages, error is a warning because recovery may still be possible
TraceEvent(SevError, "RedwoodPageError")
.error(e)
.detail("Filename", self->filename.c_str())
.detail("PageIDs", pageIDs)
.detail("PageSize", self->physicalPageSize);
debug_printf("DWALPager(%s) postread failed for %s with %s\n",
self->filename.c_str(),
toString(pageIDs).c_str(),
e.what());
throw;
}
return page;
}
Future<Reference<ArenaPage>> readHeaderPage(PhysicalPageID pageID) {
debug_printf("DWALPager(%s) readHeaderPage %s\n", filename.c_str(), toString(pageID).c_str());
return readPhysicalPage(this, pageID, ioMaxPriority, true);
}
// Reads the most recent version of pageID, either previously committed or written using updatePage()
// in the current commit
Future<Reference<ArenaPage>> readPage(PagerEventReasons reason,
unsigned int level,
PhysicalPageID pageID,
int priority,
bool cacheable,
bool noHit) override {
// Use cached page if present, without triggering a cache hit.
// Otherwise, read the page and return it but don't add it to the cache
debug_printf("DWALPager(%s) op=read %s reason=%s noHit=%d\n",
filename.c_str(),
toString(pageID).c_str(),
PagerEventReasonsStrings[(int)reason],
noHit);
auto& eventReasons = g_redwoodMetrics.level(level).metrics.events;
eventReasons.addEventReason(PagerEvents::CacheLookup, reason);
if (!cacheable) {
debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageID).c_str());
PageCacheEntry* pCacheEntry = pageCache.getIfExists(pageID);
if (pCacheEntry != nullptr) {
++g_redwoodMetrics.metric.pagerProbeHit;
debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageID).c_str());
return pCacheEntry->readFuture;
}
++g_redwoodMetrics.metric.pagerProbeMiss;
debug_printf("DWALPager(%s) op=readUncachedMiss %s\n", filename.c_str(), toString(pageID).c_str());
return forwardError(readPhysicalPage(this, pageID, priority, false), errorPromise);
}
PageCacheEntry& cacheEntry = pageCache.get(pageID, physicalPageSize, noHit);
debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n",
filename.c_str(),
toString(pageID).c_str(),
cacheEntry.initialized(),
cacheEntry.initialized() && cacheEntry.reading(),
cacheEntry.initialized() && cacheEntry.writing(),
noHit);
if (!cacheEntry.initialized()) {
debug_printf("DWALPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageID).c_str());
cacheEntry.readFuture = forwardError(readPhysicalPage(this, pageID, priority, false), errorPromise);
cacheEntry.writeFuture = Void();
++g_redwoodMetrics.metric.pagerCacheMiss;
eventReasons.addEventReason(PagerEvents::CacheMiss, reason);
} else {
++g_redwoodMetrics.metric.pagerCacheHit;
eventReasons.addEventReason(PagerEvents::CacheHit, reason);
}
return cacheEntry.readFuture;
}
Future<Reference<ArenaPage>> readMultiPage(PagerEventReasons reason,
unsigned int level,
VectorRef<PhysicalPageID> pageIDs,
int priority,
bool cacheable,
bool noHit) override {
// Use cached page if present, without triggering a cache hit.
// Otherwise, read the page and return it but don't add it to the cache
debug_printf("DWALPager(%s) op=read %s reason=%s noHit=%d\n",
filename.c_str(),
toString(pageIDs).c_str(),
PagerEventReasonsStrings[(int)reason],
noHit);
auto& eventReasons = g_redwoodMetrics.level(level).metrics.events;
eventReasons.addEventReason(PagerEvents::CacheLookup, reason);
if (!cacheable) {
debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageIDs).c_str());
PageCacheEntry* pCacheEntry = pageCache.getIfExists(pageIDs.front());
if (pCacheEntry != nullptr) {
++g_redwoodMetrics.metric.pagerProbeHit;
debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageIDs).c_str());
return pCacheEntry->readFuture;
}
++g_redwoodMetrics.metric.pagerProbeMiss;
debug_printf("DWALPager(%s) op=readUncachedMiss %s\n", filename.c_str(), toString(pageIDs).c_str());
return forwardError(readPhysicalMultiPage(this, pageIDs, priority), errorPromise);
}
PageCacheEntry& cacheEntry = pageCache.get(pageIDs.front(), pageIDs.size() * physicalPageSize, noHit);
debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n",
filename.c_str(),
toString(pageIDs).c_str(),
cacheEntry.initialized(),
cacheEntry.initialized() && cacheEntry.reading(),
cacheEntry.initialized() && cacheEntry.writing(),
noHit);
if (!cacheEntry.initialized()) {
debug_printf("DWALPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageIDs).c_str());
cacheEntry.readFuture = forwardError(readPhysicalMultiPage(this, pageIDs, priority), errorPromise);
cacheEntry.writeFuture = Void();
++g_redwoodMetrics.metric.pagerCacheMiss;
eventReasons.addEventReason(PagerEvents::CacheMiss, reason);
} else {
++g_redwoodMetrics.metric.pagerCacheHit;
eventReasons.addEventReason(PagerEvents::CacheHit, reason);
}
return cacheEntry.readFuture;
}
PhysicalPageID getPhysicalPageID(LogicalPageID pageID, Version v) {
auto i = remappedPages.find(pageID);
if (i != remappedPages.end()) {
auto j = i->second.upper_bound(v);
if (j != i->second.begin()) {
--j;
debug_printf("DWALPager(%s) op=lookupRemapped %s @%" PRId64 " -> %s\n",
filename.c_str(),
toString(pageID).c_str(),
v,
toString(j->second).c_str());
pageID = j->second;
if (pageID == invalidLogicalPageID)
debug_printf(
"DWALPager(%s) remappedPagesMap: %s\n", filename.c_str(), toString(remappedPages).c_str());
ASSERT(pageID != invalidLogicalPageID);
}
} else {
debug_printf("DWALPager(%s) op=lookupNotRemapped %s @%" PRId64 " (not remapped)\n",
filename.c_str(),
toString(pageID).c_str(),
v);
}
return (PhysicalPageID)pageID;
}
Future<Reference<ArenaPage>> readPageAtVersion(PagerEventReasons reason,
unsigned int level,
LogicalPageID logicalID,
int priority,
Version v,
bool cacheable,
bool noHit) {
PhysicalPageID physicalID = getPhysicalPageID(logicalID, v);
return readPage(reason, level, physicalID, priority, cacheable, noHit);
}
void releaseExtentReadLock() override { concurrentExtentReads->release(); }
// Read the physical extent at given pageID
// NOTE that we use the same interface (<ArenaPage>) for the extent as the page
ACTOR static Future<Reference<ArenaPage>> readPhysicalExtent(DWALPager* self,
PhysicalPageID pageID,
int readSize = 0) {
// First take the concurrentExtentReads lock to avoid issuing too many reads concurrently
wait(self->concurrentExtentReads->take());
ASSERT(!self->memoryOnly);
if (g_network->getCurrentTask() > TaskPriority::DiskRead) {
wait(delay(0, TaskPriority::DiskRead));
}
// readSize may not be equal to the physical extent size (for the first and last extents)
// but otherwise use the full physical extent size
if (readSize == 0) {
readSize = self->physicalExtentSize;
}
state Reference<ArenaPage> extent = makeReference<ArenaPage>(readSize, readSize);
// physicalReadSize is the size of disk read we intend to issue
auto physicalReadSize = SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_READ_SIZE;
auto parallelReads = readSize / physicalReadSize;
auto lastReadSize = readSize % physicalReadSize;
debug_printf("DWALPager(%s) op=readPhysicalExtentStart %s readSize %d offset %" PRId64
" physicalReadSize %d parallelReads %d\n",
self->filename.c_str(),
toString(pageID).c_str(),
readSize,
(int64_t)pageID * (self->physicalPageSize),
physicalReadSize,
parallelReads);
// we split the extent read into a number of parallel disk reads based on the determined physical
// disk read size. All those reads are issued in parallel and their futures are stored into the following
// reads vector
std::vector<Future<int>> reads;
int i;
int64_t startOffset = (int64_t)pageID * (self->physicalPageSize);
int64_t currentOffset;
for (i = 0; i < parallelReads; i++) {
currentOffset = i * physicalReadSize;
debug_printf("DWALPager(%s) current offset %" PRId64 "\n", self->filename.c_str(), currentOffset);
++g_redwoodMetrics.metric.pagerDiskRead;
reads.push_back(self->readPhysicalBlock(
self, extent, currentOffset, physicalReadSize, startOffset + currentOffset, ioMaxPriority));
}
// Handle the last read separately as it may be smaller than physicalReadSize
if (lastReadSize) {
currentOffset = i * physicalReadSize;
debug_printf("DWALPager(%s) iter %d current offset %" PRId64 " lastReadSize %d\n",
self->filename.c_str(),
i,
currentOffset,
lastReadSize);
++g_redwoodMetrics.metric.pagerDiskRead;
reads.push_back(self->readPhysicalBlock(
self, extent, currentOffset, lastReadSize, startOffset + currentOffset, ioMaxPriority));
}
// wait for all the parallel read futures for the given extent
wait(waitForAll(reads));
debug_printf("DWALPager(%s) op=readPhysicalExtentComplete %s ptr=%p bytes=%d file offset=%d\n",
self->filename.c_str(),
toString(pageID).c_str(),
extent->rawData(),
readSize,
(pageID * self->physicalPageSize));
return extent;
}
Future<Reference<ArenaPage>> readExtent(LogicalPageID pageID) override {
debug_printf("DWALPager(%s) op=readExtent %s\n", filename.c_str(), toString(pageID).c_str());
PageCacheEntry* pCacheEntry = extentCache.getIfExists(pageID);
auto& eventReasons = g_redwoodMetrics.level(nonBtreeLevel).metrics.events;
if (pCacheEntry != nullptr) {
eventReasons.addEventReason(PagerEvents::CacheLookup, PagerEventReasons::MetaData);
debug_printf("DWALPager(%s) Cache Entry exists for %s\n", filename.c_str(), toString(pageID).c_str());
return pCacheEntry->readFuture;
}
eventReasons.addEventReason(PagerEvents::CacheLookup, PagerEventReasons::MetaData);
LogicalPageID headPageID = header.remapQueue.headPageID;
LogicalPageID tailPageID = header.remapQueue.tailPageID;
int readSize = physicalExtentSize;
bool headExt = false;
bool tailExt = false;
debug_printf("DWALPager(%s) #extentPages: %d, headPageID: %s, tailPageID: %s\n",
filename.c_str(),
pagesPerExtent,
toString(headPageID).c_str(),
toString(tailPageID).c_str());
if (headPageID >= pageID && ((headPageID - pageID) < pagesPerExtent))
headExt = true;
if ((tailPageID - pageID) < pagesPerExtent)
tailExt = true;
if (headExt && tailExt) {
readSize = (tailPageID - headPageID + 1) * physicalPageSize;
} else if (headExt) {
readSize = (pagesPerExtent - (headPageID - pageID)) * physicalPageSize;
} else if (tailExt) {
readSize = (tailPageID - pageID + 1) * physicalPageSize;
}
PageCacheEntry& cacheEntry = extentCache.get(pageID, 1);
if (!cacheEntry.initialized()) {
cacheEntry.writeFuture = Void();
cacheEntry.readFuture =
forwardError(readPhysicalExtent(this, (PhysicalPageID)pageID, readSize), errorPromise);
debug_printf("DWALPager(%s) Set the cacheEntry readFuture for page: %s\n",
filename.c_str(),
toString(pageID).c_str());
++g_redwoodMetrics.metric.pagerCacheMiss;
eventReasons.addEventReason(PagerEvents::CacheMiss, PagerEventReasons::MetaData);
eventReasons.addEventReason(PagerEvents::CacheLookup, PagerEventReasons::MetaData);
} else {
++g_redwoodMetrics.metric.pagerCacheHit;
eventReasons.addEventReason(PagerEvents::CacheHit, PagerEventReasons::MetaData);
eventReasons.addEventReason(PagerEvents::CacheLookup, PagerEventReasons::MetaData);
}
return cacheEntry.readFuture;
}
// Get snapshot as of the most recent committed version of the pager
Reference<IPagerSnapshot> getReadSnapshot(Version v) override;
void addSnapshot(Version version, KeyRef meta) {
if (snapshots.empty()) {
oldestSnapshotVersion = version;
} else {
ASSERT(snapshots.back().version != version);
}
snapshots.push_back({ version, makeReference<DWALPagerSnapshot>(this, meta, version) });
}
// Set the pending oldest versiont to keep as of the next commit
void setOldestReadableVersion(Version v) override {
ASSERT(v >= header.oldestVersion);
ASSERT(v <= header.committedVersion);
header.oldestVersion = v;
expireSnapshots(v);
};
// Get the oldest *readable* version, which is not the same as the oldest retained version as the version
// returned could have been set as the oldest version in the pending commit
Version getOldestReadableVersion() const override { return header.oldestVersion; };
// Calculate the *effective* oldest version, which can be older than the one set in the last commit since we
// are allowing active snapshots to temporarily delay page reuse.
Version effectiveOldestVersion() { return std::min(lastCommittedHeader.oldestVersion, oldestSnapshotVersion); }
ACTOR static Future<Void> removeRemapEntry(DWALPager* self, RemappedPage p, Version oldestRetainedVersion) {
// Get iterator to the versioned page map entry for the original page
state PageToVersionedMapT::iterator iPageMapPair = self->remappedPages.find(p.originalPageID);
// The iterator must be valid and not empty and its first page map entry must match p's version
ASSERT(iPageMapPair != self->remappedPages.end());
ASSERT(!iPageMapPair->second.empty());
state VersionToPageMapT::iterator iVersionPagePair = iPageMapPair->second.find(p.version);
ASSERT(iVersionPagePair != iPageMapPair->second.end());
RemappedPage::Type firstType = p.getType();
state RemappedPage::Type secondType;
bool secondAfterOldestRetainedVersion = false;
state bool deleteAtSameVersion = false;
if (p.newPageID == iVersionPagePair->second) {
auto nextEntry = iVersionPagePair;
++nextEntry;
if (nextEntry == iPageMapPair->second.end()) {
secondType = RemappedPage::NONE;
} else {
secondType = RemappedPage::getTypeOf(nextEntry->second);
secondAfterOldestRetainedVersion = nextEntry->first > oldestRetainedVersion;
}
} else {
ASSERT(iVersionPagePair->second == invalidLogicalPageID);
secondType = RemappedPage::FREE;
deleteAtSameVersion = true;
}
ASSERT(firstType == RemappedPage::REMAP || secondType == RemappedPage::NONE);
// Scenarios and actions to take:
//
// The first letter (firstType) is the type of the entry just popped from the remap queue.
// The second letter (secondType) is the type of the next item in the queue for the same
// original page ID, if present. If not present, secondType will be NONE.
//
// Since the next item can be arbitrarily ahead in the queue, secondType is determined by
// looking at the remappedPages structure.
//
// R == Remap F == Free D == Detach | == oldestRetainedVersion
//
// oldestRetainedVersion is the oldest version being maintained as readable, either because it is explicitly the
// oldest readable version set or because there is an active snapshot for the version even though it is older
// than the explicitly set oldest readable version.
//
// R R | free new ID
// R F | free new ID if R and D are at different versions
// R D | do nothing
// R | R copy new to original ID, free new ID
// R | F copy new to original ID, free new ID
// R | D copy new to original ID
// R | copy new to original ID, free new ID
// F | free original ID
// D | free original ID
//
// Note that
//
// Special case: Page is detached while it is being read in remapCopyAndFree()
// Initial state: R |
// Start remapCopyAndFree(), intending to copy new, ID to originalID and free newID
// New state: R | D
// Read of newID completes.
// Copy new contents over original, do NOT free new ID
// Later popped state: D |
// free original ID
//
state bool freeNewID =
(firstType == RemappedPage::REMAP && secondType != RemappedPage::DETACH && !deleteAtSameVersion);
state bool copyNewToOriginal = (firstType == RemappedPage::REMAP &&
(secondAfterOldestRetainedVersion || secondType == RemappedPage::NONE));
state bool freeOriginalID = (firstType == RemappedPage::FREE || firstType == RemappedPage::DETACH);
debug_printf("DWALPager(%s) remapCleanup %s secondType=%c mapEntry=%s oldestRetainedVersion=%" PRId64 " \n",
self->filename.c_str(),
p.toString().c_str(),
secondType,
::toString(*iVersionPagePair).c_str(),
oldestRetainedVersion);
if (copyNewToOriginal) {
if (g_network->isSimulated()) {
ASSERT(self->remapDestinationsSimOnly.count(p.originalPageID) == 0);
self->remapDestinationsSimOnly.insert(p.originalPageID);
}
debug_printf("DWALPager(%s) remapCleanup copy %s\n", self->filename.c_str(), p.toString().c_str());
// Read the data from the page that the original was mapped to
Reference<ArenaPage> data = wait(
self->readPage(PagerEventReasons::MetaData, nonBtreeLevel, p.newPageID, ioLeafPriority, false, true));
// Write the data to the original page so it can be read using its original pageID
self->updatePage(
PagerEventReasons::MetaData, nonBtreeLevel, VectorRef<LogicalPageID>(&p.originalPageID, 1), data);
++g_redwoodMetrics.metric.pagerRemapCopy;
} else if (firstType == RemappedPage::REMAP) {
++g_redwoodMetrics.metric.pagerRemapSkip;
}
// Now that the page contents have been copied to the original page, if the corresponding map entry
// represented the remap and there wasn't a delete later in the queue at p for the same version then
// erase the entry.
if (!deleteAtSameVersion) {
debug_printf(
"DWALPager(%s) remapCleanup deleting map entry %s\n", self->filename.c_str(), p.toString().c_str());
// Erase the entry and set iVersionPagePair to the next entry or end
iVersionPagePair = iPageMapPair->second.erase(iVersionPagePair);
// If the map is now empty, delete it
if (iPageMapPair->second.empty()) {
debug_printf(
"DWALPager(%s) remapCleanup deleting empty map %s\n", self->filename.c_str(), p.toString().c_str());
self->remappedPages.erase(iPageMapPair);
} else if (freeNewID && secondType == RemappedPage::NONE &&
iVersionPagePair != iPageMapPair->second.end() &&
RemappedPage::getTypeOf(iVersionPagePair->second) == RemappedPage::DETACH) {
// If we intend to free the new ID and there was no map entry, one could have been added during the wait
// above. If so, and if it was a detach operation, then we can't free the new page ID as its lifetime
// will be managed by the client starting at some later version.
freeNewID = false;
}
}
if (freeNewID) {
debug_printf("DWALPager(%s) remapCleanup freeNew %s %s\n",
self->filename.c_str(),
p.toString().c_str(),
toString(self->getLastCommittedVersion()).c_str());
// newID must be freed at the latest committed version to avoid a read race between caching and non-caching
// readers. It is possible that there are readers of newID in flight right now that either
// - Did not read through the page cache
// - Did read through the page cache but there was no entry for the page at the time, so one was created
// and the read future is still pending
// In either case the physical read of newID from disk can happen at some time after right now and after the
// current commit is finished.
//
// If newID is freed immediately, meaning as of the end of the current commit, then it could be reused in
// the next commit which could be before any reads fitting the above description have completed, causing
// those reads to the new write which is incorrect. Since such readers could be using pager snapshots at
// versions up to and including the latest committed version, newID must be freed *after* that version is no
// longer readable.
self->freeUnmappedPage(p.newPageID, self->getLastCommittedVersion() + 1);
++g_redwoodMetrics.metric.pagerRemapFree;
}
if (freeOriginalID) {
debug_printf("DWALPager(%s) remapCleanup freeOriginal %s\n", self->filename.c_str(), p.toString().c_str());
// originalID can be freed immediately because it is already the case that there are no readers at a version
// prior to oldestRetainedVersion so no reader will need originalID.
self->freeUnmappedPage(p.originalPageID, 0);
++g_redwoodMetrics.metric.pagerRemapFree;
}
return Void();
}
ACTOR static Future<Void> remapCleanup(DWALPager* self) {
state ActorCollection tasks(true);
state Promise<Void> signal;
tasks.add(signal.getFuture());
self->remapCleanupStop = false;
// The oldest retained version cannot change during the cleanup run as this would allow multiple read/copy
// operations with the same original page ID destination to be started and they could complete out of order.
state Version oldestRetainedVersion = self->effectiveOldestVersion();
// Cutoff is the version we can pop to
state RemappedPage cutoff(oldestRetainedVersion);
// Maximum number of remaining remap entries to keep before obeying stop command.
double toleranceRatio = BUGGIFY ? deterministicRandom()->randomInt(0, 10) / 100.0
: SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_TOLERANCE_RATIO;
// For simplicity, we assume each entry in the remap queue corresponds to one remapped page.
uint64_t remapCleanupWindowEntries =
static_cast<uint64_t>(self->remapCleanupWindowBytes / self->header.pageSize);
state uint64_t minRemapEntries = static_cast<uint64_t>(remapCleanupWindowEntries * (1.0 - toleranceRatio));
state uint64_t maxRemapEntries = static_cast<uint64_t>(remapCleanupWindowEntries * (1.0 + toleranceRatio));
debug_printf("DWALPager(%s) remapCleanup oldestRetainedVersion=%" PRId64 " remapCleanupWindowBytes=%" PRId64
" pageSize=%" PRIu32 " minRemapEntries=%" PRId64 " maxRemapEntries=%" PRId64 " items=%" PRId64
"\n",
self->filename.c_str(),
oldestRetainedVersion,
self->remapCleanupWindowBytes,
self->header.pageSize,
minRemapEntries,
maxRemapEntries,
self->remapQueue.numEntries);
if (g_network->isSimulated()) {
self->remapDestinationsSimOnly.clear();
}
state int sinceYield = 0;
loop {
// Stop if we have cleanup enough remap entries, or if the stop flag is set and the remaining remap
// entries are less than that allowed by the lag.
int64_t remainingEntries = self->remapQueue.numEntries;
if (remainingEntries <= minRemapEntries ||
(self->remapCleanupStop && remainingEntries <= maxRemapEntries)) {
debug_printf("DWALPager(%s) remapCleanup finished remainingEntries=%" PRId64 " minRemapEntries=%" PRId64
" maxRemapEntries=%" PRId64,
self->filename.c_str(),
remainingEntries,
minRemapEntries,
maxRemapEntries);
break;
}
state Optional<RemappedPage> p = wait(self->remapQueue.pop(cutoff));
debug_printf("DWALPager(%s) remapCleanup popped %s items=%" PRId64 "\n",
self->filename.c_str(),
::toString(p).c_str(),
self->remapQueue.numEntries);
// Stop if we have reached the cutoff version, which is the start of the cleanup coalescing window
if (!p.present()) {
debug_printf("DWALPager(%s) remapCleanup pop failed cutoffVer=%" PRId64 " items=%" PRId64 "\n",
self->filename.c_str(),
cutoff.version,
self->remapQueue.numEntries);
break;
}
Future<Void> task = removeRemapEntry(self, p.get(), oldestRetainedVersion);
if (!task.isReady()) {
tasks.add(task);
}
// Yield to prevent slow task in case no IO waits are encountered
if (++sinceYield >= 100) {
sinceYield = 0;
wait(yield());
}
}
debug_printf("DWALPager(%s) remapCleanup stopped stopSignal=%d remap=%" PRId64 " free=%" PRId64
" delayedFree=%" PRId64 "\n",
self->filename.c_str(),
self->remapCleanupStop,
self->remapQueue.numEntries,
self->freeList.numEntries,
self->delayedFreeList.numEntries);
signal.send(Void());
wait(tasks.getResult());
return Void();
}
// Flush all queues so they have no operations pending.
ACTOR static Future<Void> flushQueues(DWALPager* self) {
ASSERT(self->remapCleanupFuture.isReady());
// Flush remap queue and related queues separately, they are not involved in free page management
wait(self->remapQueue.flush());
wait(self->extentFreeList.flush());
wait(self->extentUsedList.flush());
// Flush the free list and delayed free list queues together as they are used by freePage() and newPageID()
// Since each queue's preFlush can create work for the other, we must see preflush return false for both
// twice in row.
state int clear = 0;
loop {
state bool freeBusy = wait(self->freeList.preFlush());
state bool delayedFreeBusy = wait(self->delayedFreeList.preFlush());
debug_printf("DWALPager(%s) flushQueues freeBusy=%d delayedFreeBusy=%d\n",
self->filename.c_str(),
freeBusy,
delayedFreeBusy);
// Once preFlush() returns false for both queues then there are no more operations pending
// on either queue. If preFlush() returns true for either queue in one loop execution then
// it could have generated new work for itself or the other queue.
if (!freeBusy && !delayedFreeBusy) {
if (++clear == 2) {
break;
}
} else {
clear = 0;
}
}
self->freeList.finishFlush();
self->delayedFreeList.finishFlush();
return Void();
}
ACTOR static Future<Void> commit_impl(DWALPager* self, Version v, Value commitRecord) {
debug_printf("DWALPager(%s) commit begin %s\n", self->filename.c_str(), ::toString(v).c_str());
// Write old committed header to Page 1
self->writeHeaderPage(backupHeaderPageID, self->lastCommittedHeaderPage);
// Trigger the remap eraser to stop and then wait for it.
self->remapCleanupStop = true;
wait(self->remapCleanupFuture);
wait(flushQueues(self));
self->header.committedVersion = v;
self->header.remapQueue = self->remapQueue.getState();
self->header.extentFreeList = self->extentFreeList.getState();
self->header.extentUsedList = self->extentUsedList.getState();
self->header.freeList = self->freeList.getState();
self->header.delayedFreeList = self->delayedFreeList.getState();
// Wait for all outstanding writes to complete
debug_printf("DWALPager(%s) waiting for outstanding writes\n", self->filename.c_str());
wait(waitForAll(self->operations));
self->operations.clear();
debug_printf("DWALPager(%s) Syncing\n", self->filename.c_str());
// Sync everything except the header
if (g_network->getCurrentTask() > TaskPriority::DiskWrite) {
wait(delay(0, TaskPriority::DiskWrite));
}
if (!self->memoryOnly) {
wait(self->pageFile->sync());
debug_printf("DWALPager(%s) commit version %" PRId64 " sync 1\n",
self->filename.c_str(),
self->header.committedVersion);
}
// Update new commit header to the primary header page
self->header.userCommitRecord = commitRecord;
self->updateHeaderPage();
// Update primary header page on disk and sync again.
wait(self->writeHeaderPage(primaryHeaderPageID, self->headerPage));
if (g_network->getCurrentTask() > TaskPriority::DiskWrite) {
wait(delay(0, TaskPriority::DiskWrite));
}
if (!self->memoryOnly) {
wait(self->pageFile->sync());
debug_printf("DWALPager(%s) commit version %" PRId64 " sync 2\n",
self->filename.c_str(),
self->header.committedVersion);
}
// Update the last committed header for use in the next commit.
self->updateLastCommittedHeader();
self->addSnapshot(v, self->header.userCommitRecord);
// Try to expire snapshots up to the oldest version, in case some were being kept around due to being in use,
// because maybe some are no longer in use.
self->expireSnapshots(self->header.oldestVersion);
// Start unmapping pages for expired versions
self->remapCleanupFuture = remapCleanup(self);
// If there are prioritized evictions queued, flush them to the regular eviction order.
self->pageCache.flushPrioritizedEvictions();
return Void();
}
Future<Void> commit(Version v, Value commitRecord) override {
// Can't have more than one commit outstanding.
ASSERT(commitFuture.isReady());
ASSERT(v > lastCommittedHeader.committedVersion);
commitFuture = forwardError(commit_impl(this, v, commitRecord), errorPromise);
return commitFuture;
}
Value getCommitRecord() const override { return lastCommittedHeader.userCommitRecord; }
ACTOR void shutdown(DWALPager* self, bool dispose) {
// Send to the error promise first and then delay(0) to give users a chance to cancel
// any outstanding operations
if (self->errorPromise.canBeSet()) {
debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str());
self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress
}
wait(delay(0));
// The next section explicitly cancels all pending operations held in the pager
debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str());
self->ioLock.kill();
debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str());
self->recoverFuture.cancel();
debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str());
self->commitFuture.cancel();
debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str());
self->remapCleanupFuture.cancel();
debug_printf("DWALPager(%s) shutdown kill file extension\n", self->filename.c_str());
self->fileExtension.cancel();
debug_printf("DWALPager(%s) shutdown cancel operations\n", self->filename.c_str());
for (auto& f : self->operations) {
f.cancel();
}
self->operations.clear();
debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str());
wait(self->extentCache.clear());
wait(self->pageCache.clear());
debug_printf("DWALPager(%s) shutdown remappedPagesMap: %s\n",
self->filename.c_str(),
toString(self->remappedPages).c_str());
// Unreference the file and clear
self->pageFile.clear();
if (dispose) {
if (!self->memoryOnly) {
debug_printf("DWALPager(%s) shutdown deleting file\n", self->filename.c_str());
wait(IAsyncFileSystem::filesystem()->incrementalDeleteFile(self->filename, true));
}
}
self->closedPromise.send(Void());
delete self;
}
void dispose() override { shutdown(this, true); }
void close() override { shutdown(this, false); }
Future<Void> getError() const override { return errorPromise.getFuture(); }
Future<Void> onClosed() const override { return closedPromise.getFuture(); }
StorageBytes getStorageBytes() const override {
int64_t free;
int64_t total;
if (memoryOnly) {
total = pageCache.evictor().sizeLimit;
free = pageCache.evictor().getSizeUsed();
} else {
g_network->getDiskBytes(parentDirectory(filename), free, total);
}
int64_t pagerSize = header.pageCount * physicalPageSize;
// It is not exactly known how many pages on the delayed free list are usable as of right now. It could be
// known, if each commit delayed entries that were freeable were shuffled from the delayed free queue to the
// free queue, but this doesn't seem necessary.
// Amount of space taken up by all of the items in the free lists
int64_t reusablePageSpace = (freeList.numEntries + delayedFreeList.numEntries) * physicalPageSize;
// Amount of space taken up by the free list queues themselves, as if we were to pop and use
// items on the free lists the space the items are stored in would also become usable
int64_t reusableQueueSpace = (freeList.numPages + delayedFreeList.numPages) * physicalPageSize;
int64_t reusable = reusablePageSpace + reusableQueueSpace;
// Space currently in used by old page versions have have not yet been freed due to the remap cleanup window.
int64_t temp = remapQueue.numEntries * physicalPageSize;
return StorageBytes(free, total, pagerSize - reusable, free + reusable, temp);
}
int64_t getPageCacheCount() override { return pageCache.getCount(); }
int64_t getPageCount() override { return header.pageCount; }
int64_t getExtentCacheCount() override { return extentCache.getCount(); }
ACTOR static Future<Void> getUserPageCount_cleanup(DWALPager* self) {
// Wait for the remap eraser to finish all of its work (not triggering stop)
wait(self->remapCleanupFuture);
// Flush queues so there are no pending freelist operations
wait(flushQueues(self));
debug_printf("DWALPager getUserPageCount_cleanup\n");
self->freeList.getState();
self->delayedFreeList.getState();
self->extentFreeList.getState();
self->extentUsedList.getState();
self->remapQueue.getState();
return Void();
}
// Get the number of pages in use by the pager's user
Future<int64_t> getUserPageCount() override {
return map(getUserPageCount_cleanup(this), [=](Void) {
int64_t userPages =
header.pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages -
delayedFreeList.numEntries - ((((remapQueue.numPages - 1) / pagesPerExtent) + 1) * pagesPerExtent) -
extentFreeList.numPages - (pagesPerExtent * extentFreeList.numEntries) - extentUsedList.numPages;
debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64
" freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64
" remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n",
filename.c_str(),
userPages,
header.pageCount,
freeList.numPages,
freeList.numEntries,
delayedFreeList.numPages,
delayedFreeList.numEntries,
remapQueue.numPages,
remapQueue.numEntries);
return userPages;
});
}
Future<Void> init() override { return recoverFuture; }
Version getLastCommittedVersion() const override { return lastCommittedHeader.committedVersion; }
private:
~DWALPager() {}
// Try to expire snapshots up to but not including v, but do not expire any snapshots that are in use.
void expireSnapshots(Version v);
// Header is the format of page 0 of the database
struct PagerCommitHeader {
constexpr static FileIdentifier file_identifier = 11836690;
constexpr static unsigned int FORMAT_VERSION = 10;
uint16_t formatVersion;
uint32_t queueCount;
uint32_t pageSize;
int64_t pageCount;
uint32_t extentSize;
Version committedVersion;
Version oldestVersion;
Value userCommitRecord;
FIFOQueue<LogicalPageID>::QueueState freeList;
FIFOQueue<LogicalPageID>::QueueState extentFreeList; // free list for extents
FIFOQueue<ExtentUsedListEntry>::QueueState extentUsedList; // in-use list for extents
FIFOQueue<DelayedFreePage>::QueueState delayedFreeList;
FIFOQueue<RemappedPage>::QueueState remapQueue;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar,
formatVersion,
queueCount,
pageSize,
pageCount,
extentSize,
committedVersion,
oldestVersion,
userCommitRecord,
freeList,
extentFreeList,
extentUsedList,
delayedFreeList,
remapQueue);
}
};
ACTOR static Future<Void> clearRemapQueue_impl(DWALPager* self) {
// Wait for outstanding commit.
wait(self->commitFuture);
// Set remap cleanup window to 0 to allow the remap queue to drain.
state int64_t remapCleanupWindowBytes = self->remapCleanupWindowBytes;
self->remapCleanupWindowBytes = 0;
// Try twice to commit and advance version. The first commit should trigger a remap cleanup actor, which picks
// up the new remap cleanup window being 0. The second commit waits for the remap cleanup actor to finish.
state int attempt = 0;
for (attempt = 0; attempt < 2; attempt++) {
self->setOldestReadableVersion(self->getLastCommittedVersion());
wait(self->commit(self->getLastCommittedVersion() + 1, self->getCommitRecord()));
}
ASSERT(self->remapQueue.numEntries == 0);
// Restore remap cleanup window.
if (remapCleanupWindowBytes != 0)
self->remapCleanupWindowBytes = remapCleanupWindowBytes;
TraceEvent e("RedwoodClearRemapQueue");
self->toTraceEvent(e);
e.log();
return Void();
}
Future<Void> clearRemapQueue() override { return clearRemapQueue_impl(this); }
private:
// Physical page sizes will always be a multiple of 4k because AsyncFileNonDurable requires
// this in simulation, and it also makes sense for current SSDs.
// Allowing a smaller 'logical' page size is very useful for testing.
constexpr static int smallestPhysicalBlock = 4096;
int physicalPageSize;
int logicalPageSize; // In simulation testing it can be useful to use a small logical page size
// Extents are multi-page blocks used by the FIFO queues
int physicalExtentSize;
int pagesPerExtent;
Reference<IPageEncryptionKeyProvider> keyProvider;
PriorityMultiLock ioLock;
int64_t pageCacheBytes;
// The header will be written to / read from disk as a smallestPhysicalBlock sized chunk.
Reference<ArenaPage> headerPage;
PagerCommitHeader header;
Reference<ArenaPage> lastCommittedHeaderPage;
PagerCommitHeader lastCommittedHeader;
// Pages - pages known to be in the file, truncations complete to that size
int64_t filePageCount;
// Pages that will be in file once fileExtension is ready
int64_t filePageCountPending;
// Future representing the end of all pending truncations
Future<Void> fileExtension;
int desiredPageSize;
int desiredExtentSize;
Version recoveryVersion;
std::string filename;
bool memoryOnly;
PageCacheT pageCache;
// The extent cache isn't a normal cache, it isn't allowed to evict things. It is populated
// during recovery with remap queue extents and then cleared.
PageCacheT::Evictor extentCacheDummyEvictor{ std::numeric_limits<int64_t>::max() };
PageCacheT extentCache{ &extentCacheDummyEvictor };
Promise<Void> closedPromise;
Promise<Void> errorPromise;
Future<Void> commitFuture;
// The operations vector is used to hold all disk writes made by the Pager, but could also hold
// other operations that need to be waited on before a commit can finish.
std::vector<Future<Void>> operations;
Future<Void> recoverFuture;
Future<Void> remapCleanupFuture;
bool remapCleanupStop;
Reference<IAsyncFile> pageFile;
LogicalPageQueueT freeList;
// The delayed free list will be approximately in Version order.
// TODO: Make this an ordered container some day.
DelayedFreePageQueueT delayedFreeList;
RemapQueueT remapQueue;
LogicalPageQueueT extentFreeList;
ExtentUsedListQueueT extentUsedList;
uint64_t remapCleanupWindowBytes;
Reference<FlowLock> concurrentExtentReads;
std::unordered_set<PhysicalPageID> remapDestinationsSimOnly;
struct SnapshotEntry {
Version version;
Reference<DWALPagerSnapshot> snapshot;
};
struct SnapshotEntryLessThanVersion {
bool operator()(Version v, const SnapshotEntry& snapshot) { return v < snapshot.version; }
bool operator()(const SnapshotEntry& snapshot, Version v) { return snapshot.version < v; }
};
// TODO: Better data structure
PageToVersionedMapT remappedPages;
// Readable snapshots in version order
std::deque<SnapshotEntry> snapshots;
Version oldestSnapshotVersion;
};
// Prevents pager from reusing freed pages from version until the snapshot is destroyed
class DWALPagerSnapshot : public IPagerSnapshot, public ReferenceCounted<DWALPagerSnapshot> {
public:
DWALPagerSnapshot(DWALPager* pager, Key meta, Version version) : pager(pager), version(version), metaKey(meta) {}
~DWALPagerSnapshot() override {}
Future<Reference<const ArenaPage>> getPhysicalPage(PagerEventReasons reason,
unsigned int level,
LogicalPageID pageID,
int priority,
bool cacheable,
bool noHit) override {
return map(pager->readPageAtVersion(reason, level, pageID, priority, version, cacheable, noHit),
[=](Reference<ArenaPage> p) { return Reference<const ArenaPage>(std::move(p)); });
}
Future<Reference<const ArenaPage>> getMultiPhysicalPage(PagerEventReasons reason,
unsigned int level,
VectorRef<PhysicalPageID> pageIDs,
int priority,
bool cacheable,
bool noHit) override {
return map(pager->readMultiPage(reason, level, pageIDs, priority, cacheable, noHit),
[=](Reference<ArenaPage> p) { return Reference<const ArenaPage>(std::move(p)); });
}
Key getMetaKey() const override { return metaKey; }
Version getVersion() const override { return version; }
void addref() override { ReferenceCounted<DWALPagerSnapshot>::addref(); }
void delref() override { ReferenceCounted<DWALPagerSnapshot>::delref(); }
DWALPager* pager;
Version version;
Key metaKey;
};
void DWALPager::expireSnapshots(Version v) {
debug_printf("DWALPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n",
filename.c_str(),
v,
(int)snapshots.size());
// While there is more than one snapshot and the front snapshot is older than v and has no other reference holders
while (snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) {
debug_printf("DWALPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n",
filename.c_str(),
snapshots.front().version,
snapshots.front().snapshot->isSoleOwner());
// Expire the snapshot and update the oldest snapshot version
snapshots.pop_front();
oldestSnapshotVersion = snapshots.front().version;
}
}
Reference<IPagerSnapshot> DWALPager::getReadSnapshot(Version v) {
auto i = std::upper_bound(snapshots.begin(), snapshots.end(), v, SnapshotEntryLessThanVersion());
if (i == snapshots.begin()) {
throw version_invalid();
}
--i;
return i->snapshot;
}
// TODO: Move this to a flow header once it is mature.
struct SplitStringRef {
StringRef a;
StringRef b;
SplitStringRef(StringRef a = StringRef(), StringRef b = StringRef()) : a(a), b(b) {}
SplitStringRef(Arena& arena, const SplitStringRef& toCopy) : a(toStringRef(arena)), b() {}
SplitStringRef prefix(int len) const {
if (len <= a.size()) {
return SplitStringRef(a.substr(0, len));
}
len -= a.size();
return SplitStringRef(a, b.substr(0, len));
}
StringRef toStringRef(Arena& arena) const {
StringRef c = makeString(size(), arena);
memcpy(mutateString(c), a.begin(), a.size());
memcpy(mutateString(c) + a.size(), b.begin(), b.size());
return c;
}
Standalone<StringRef> toStringRef() const {
Arena a;
return Standalone<StringRef>(toStringRef(a), a);
}
int size() const { return a.size() + b.size(); }
int expectedSize() const { return size(); }
std::string toString() const { return format("%s%s", a.toString().c_str(), b.toString().c_str()); }
std::string toHexString() const { return format("%s%s", a.toHexString().c_str(), b.toHexString().c_str()); }
struct const_iterator {
const uint8_t* ptr;
const uint8_t* end{ nullptr };
const uint8_t* next{ nullptr };
inline bool operator==(const const_iterator& rhs) const { return ptr == rhs.ptr; }
inline bool operator!=(const const_iterator& rhs) const { return !(*this == rhs); }
inline const_iterator& operator++() {
++ptr;
if (ptr == end) {
ptr = next;
}
return *this;
}
inline const_iterator& operator+(int n) {
ptr += n;
if (ptr >= end) {
ptr = next + (ptr - end);
}
return *this;
}
inline uint8_t operator*() const { return *ptr; }
};
inline const_iterator begin() const { return { a.begin(), a.end(), b.begin() }; }
inline const_iterator end() const { return { b.end() }; }
template <typename StringT>
int compare(const StringT& rhs) const {
auto j = begin();
auto k = rhs.begin();
auto jEnd = end();
auto kEnd = rhs.end();
while (j != jEnd && k != kEnd) {
int cmp = *j - *k;
if (cmp != 0) {
return cmp;
}
}
// If we've reached the end of *this, then values are equal if rhs is also exhausted, otherwise *this is less
// than rhs
if (j == jEnd) {
return k == kEnd ? 0 : -1;
}
return 1;
}
};
// A BTree node link is a list of LogicalPageID's whose contents should be concatenated together.
typedef VectorRef<LogicalPageID> BTreeNodeLinkRef;
typedef Standalone<BTreeNodeLinkRef> BTreeNodeLink;
constexpr LogicalPageID maxPageID = (LogicalPageID)-1;
std::string toString(BTreeNodeLinkRef id) {
return std::string("BTreePageID") + toString(id.begin(), id.end());
}
struct RedwoodRecordRef {
typedef uint8_t byte;
RedwoodRecordRef(KeyRef key = KeyRef(), Optional<ValueRef> value = {}) : key(key), value(value) {}
RedwoodRecordRef(Arena& arena, const RedwoodRecordRef& toCopy) : key(arena, toCopy.key) {
if (toCopy.value.present()) {
value = ValueRef(arena, toCopy.value.get());
}
}
typedef KeyRef Partial;
void updateCache(Optional<Partial>& cache, Arena& arena) const { cache = KeyRef(arena, key); }
KeyValueRef toKeyValueRef() const { return KeyValueRef(key, value.get()); }
// RedwoodRecordRefs are used for both internal and leaf pages of the BTree.
// Boundary records in internal pages are made from leaf records.
// These functions make creating and working with internal page records more convenient.
inline BTreeNodeLinkRef getChildPage() const {
ASSERT(value.present());
return BTreeNodeLinkRef((LogicalPageID*)value.get().begin(), value.get().size() / sizeof(LogicalPageID));
}
inline void setChildPage(BTreeNodeLinkRef id) {
value = ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID));
}
inline void setChildPage(Arena& arena, BTreeNodeLinkRef id) {
value = ValueRef(arena, (const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID));
}
inline RedwoodRecordRef withPageID(BTreeNodeLinkRef id) const {
return RedwoodRecordRef(key, ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)));
}
inline RedwoodRecordRef withoutValue() const { return RedwoodRecordRef(key); }
inline RedwoodRecordRef withMaxPageID() const {
return RedwoodRecordRef(key, StringRef((uint8_t*)&maxPageID, sizeof(maxPageID)));
}
// Truncate (key, version, part) tuple to len bytes.
void truncate(int len) {
ASSERT(len <= key.size());
key = key.substr(0, len);
}
// Find the common key prefix between two records, assuming that the first skipLen bytes are the same
inline int getCommonPrefixLen(const RedwoodRecordRef& other, int skipLen = 0) const {
return skipLen + commonPrefixLength(key, other.key, skipLen);
}
// Compares and orders by key, version, chunk.total, chunk.start, value
// This is the same order that delta compression uses for prefix borrowing
int compare(const RedwoodRecordRef& rhs, int skip = 0) const {
int keySkip = std::min(skip, key.size());
int cmp = key.compareSuffix(rhs.key, keySkip);
if (cmp == 0) {
cmp = value.compare(rhs.value);
}
return cmp;
}
bool sameUserKey(const StringRef& k, int skipLen) const {
// Keys are the same if the sizes are the same and either the skipLen is longer or the non-skipped suffixes are
// the same.
return (key.size() == k.size()) && (key.substr(skipLen) == k.substr(skipLen));
}
bool sameExceptValue(const RedwoodRecordRef& rhs, int skipLen = 0) const { return sameUserKey(rhs.key, skipLen); }
// TODO: Use SplitStringRef (unless it ends up being slower)
KeyRef key;
Optional<ValueRef> value;
int expectedSize() const { return key.expectedSize() + value.expectedSize(); }
int kvBytes() const { return expectedSize(); }
#pragma pack(push, 1)
struct Delta {
uint8_t flags;
// Four field sizing schemes ranging from 3 to 8 bytes, with 3 being the most common.
union {
struct {
uint8_t prefixLength;
uint8_t suffixLength;
uint8_t valueLength;
} LengthFormat0;
struct {
uint8_t prefixLength;
uint8_t suffixLength;
uint16_t valueLength;
} LengthFormat1;
struct {
uint8_t prefixLength;
uint8_t suffixLength;
uint32_t valueLength;
} LengthFormat2;
struct {
uint16_t prefixLength;
uint16_t suffixLength;
uint32_t valueLength;
} LengthFormat3;
};
constexpr static int LengthFormatSizes[] = { sizeof(LengthFormat0),
sizeof(LengthFormat1),
sizeof(LengthFormat2),
sizeof(LengthFormat3) };
// Serialized Format
//
// Flags - 1 byte
// 1 bit - borrow source is prev ancestor (otherwise next ancestor)
// 1 bit - item is deleted
// 1 bit - has value (different from a zero-length value, which is still a value)
// 3 unused bits
// 2 bits - length fields format
//
// Length fields using 3 to 8 bytes total depending on length fields format
//
// Byte strings
// Value bytes
// Key suffix bytes
enum EFlags {
PREFIX_SOURCE_PREV = 0x80,
IS_DELETED = 0x40,
HAS_VALUE = 0x20,
// 3 unused bits
LENGTHS_FORMAT = 0x03
};
// Figure out which length format must be used for the given lengths
static inline int determineLengthFormat(int prefixLength, int suffixLength, int valueLength) {
// Large prefix or suffix length, which should be rare, is format 3
if (prefixLength > 0xFF || suffixLength > 0xFF) {
return 3;
} else if (valueLength < 0x100) {
return 0;
} else if (valueLength < 0x10000) {
return 1;
} else {
return 2;
}
}
// Large prefix or suffix length, which should be rare, is format 3
byte* data() const {
switch (flags & LENGTHS_FORMAT) {
case 0:
return (byte*)(&LengthFormat0 + 1);
case 1:
return (byte*)(&LengthFormat1 + 1);
case 2:
return (byte*)(&LengthFormat2 + 1);
case 3:
default:
return (byte*)(&LengthFormat3 + 1);
}
}
int getKeyPrefixLength() const {
switch (flags & LENGTHS_FORMAT) {
case 0:
return LengthFormat0.prefixLength;
case 1:
return LengthFormat1.prefixLength;
case 2:
return LengthFormat2.prefixLength;
case 3:
default:
return LengthFormat3.prefixLength;
}
}
int getKeySuffixLength() const {
switch (flags & LENGTHS_FORMAT) {
case 0:
return LengthFormat0.suffixLength;
case 1:
return LengthFormat1.suffixLength;
case 2:
return LengthFormat2.suffixLength;
case 3:
default:
return LengthFormat3.suffixLength;
}
}
int getValueLength() const {
switch (flags & LENGTHS_FORMAT) {
case 0:
return LengthFormat0.valueLength;
case 1:
return LengthFormat1.valueLength;
case 2:
return LengthFormat2.valueLength;
case 3:
default:
return LengthFormat3.valueLength;
}
}
StringRef getKeySuffix() const { return StringRef(data() + getValueLength(), getKeySuffixLength()); }
StringRef getValue() const { return StringRef(data(), getValueLength()); }
bool hasValue() const { return flags & HAS_VALUE; }
void setPrefixSource(bool val) {
if (val) {
flags |= PREFIX_SOURCE_PREV;
} else {
flags &= ~PREFIX_SOURCE_PREV;
}
}
bool getPrefixSource() const { return flags & PREFIX_SOURCE_PREV; }
void setDeleted(bool val) {
if (val) {
flags |= IS_DELETED;
} else {
flags &= ~IS_DELETED;
}
}
bool getDeleted() const { return flags & IS_DELETED; }
// DeltaTree interface
RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const {
int keyPrefixLen = getKeyPrefixLength();
int keySuffixLen = getKeySuffixLength();
int valueLen = hasValue() ? getValueLength() : 0;
byte* pData = data();
StringRef k;
// If there is a key suffix, reconstitute the complete key into a contiguous string
if (keySuffixLen > 0) {
k = makeString(keyPrefixLen + keySuffixLen, arena);
memcpy(mutateString(k), base.key.begin(), keyPrefixLen);
memcpy(mutateString(k) + keyPrefixLen, pData + valueLen, keySuffixLen);
} else {
// Otherwise just reference the base key's memory
k = base.key.substr(0, keyPrefixLen);
}
return RedwoodRecordRef(k, hasValue() ? ValueRef(pData, valueLen) : Optional<ValueRef>());
}
// DeltaTree interface
RedwoodRecordRef apply(const Partial& cache) {
return RedwoodRecordRef(cache, hasValue() ? Optional<ValueRef>(getValue()) : Optional<ValueRef>());
}
RedwoodRecordRef apply(Arena& arena, const Partial& baseKey, Optional<Partial>& cache) {
int keyPrefixLen = getKeyPrefixLength();
int keySuffixLen = getKeySuffixLength();
int valueLen = hasValue() ? getValueLength() : 0;
byte* pData = data();
StringRef k;
// If there is a key suffix, reconstitute the complete key into a contiguous string
if (keySuffixLen > 0) {
k = makeString(keyPrefixLen + keySuffixLen, arena);
memcpy(mutateString(k), baseKey.begin(), keyPrefixLen);
memcpy(mutateString(k) + keyPrefixLen, pData + valueLen, keySuffixLen);
} else {
// Otherwise just reference the base key's memory
k = baseKey.substr(0, keyPrefixLen);
}
cache = k;
return RedwoodRecordRef(k, hasValue() ? ValueRef(pData, valueLen) : Optional<ValueRef>());
}
RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional<Partial>& cache) {
return apply(arena, base.key, cache);
}
int size() const {
int size = 1;
switch (flags & LENGTHS_FORMAT) {
case 0:
return size + sizeof(LengthFormat0) + LengthFormat0.suffixLength + LengthFormat0.valueLength;
case 1:
return size + sizeof(LengthFormat1) + LengthFormat1.suffixLength + LengthFormat1.valueLength;
case 2:
return size + sizeof(LengthFormat2) + LengthFormat2.suffixLength + LengthFormat2.valueLength;
case 3:
default:
return size + sizeof(LengthFormat3) + LengthFormat3.suffixLength + LengthFormat3.valueLength;
}
}
std::string toString() const {
std::string flagString = " ";
if (flags & PREFIX_SOURCE_PREV) {
flagString += "PrefixSource|";
}
if (flags & IS_DELETED) {
flagString += "IsDeleted|";
}
if (hasValue()) {
flagString += "HasValue|";
}
int lengthFormat = flags & LENGTHS_FORMAT;
int prefixLen = getKeyPrefixLength();
int keySuffixLen = getKeySuffixLength();
int valueLen = getValueLength();
return format("lengthFormat: %d totalDeltaSize: %d flags: %s prefixLen: %d keySuffixLen: %d "
"valueLen %d raw: %s",
lengthFormat,
size(),
flagString.c_str(),
prefixLen,
keySuffixLen,
valueLen,
StringRef((const uint8_t*)this, size()).toHexString().c_str());
}
};
// Using this class as an alternative for Delta enables reading a DeltaTree2<RecordRef> while only decoding
// its values, so the Reader does not require the original prev/next ancestors.
struct DeltaValueOnly : Delta {
RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const {
return RedwoodRecordRef(KeyRef(), hasValue() ? Optional<ValueRef>(getValue()) : Optional<ValueRef>());
}
RedwoodRecordRef apply(const Partial& cache) {
return RedwoodRecordRef(KeyRef(), hasValue() ? Optional<ValueRef>(getValue()) : Optional<ValueRef>());
}
RedwoodRecordRef apply(Arena& arena, const RedwoodRecordRef& base, Optional<Partial>& cache) {
cache = KeyRef();
return RedwoodRecordRef(KeyRef(), hasValue() ? Optional<ValueRef>(getValue()) : Optional<ValueRef>());
}
};
#pragma pack(pop)
bool operator==(const RedwoodRecordRef& rhs) const { return compare(rhs) == 0; }
bool operator!=(const RedwoodRecordRef& rhs) const { return compare(rhs) != 0; }
bool operator<(const RedwoodRecordRef& rhs) const { return compare(rhs) < 0; }
bool operator>(const RedwoodRecordRef& rhs) const { return compare(rhs) > 0; }
bool operator<=(const RedwoodRecordRef& rhs) const { return compare(rhs) <= 0; }
bool operator>=(const RedwoodRecordRef& rhs) const { return compare(rhs) >= 0; }
// Worst case overhead means to assume that either the prefix length or the suffix length
// could contain the full key size
int deltaSize(const RedwoodRecordRef& base, int skipLen, bool worstCaseOverhead) const {
int prefixLen = getCommonPrefixLen(base, skipLen);
int keySuffixLen = key.size() - prefixLen;
int valueLen = value.present() ? value.get().size() : 0;
int formatType;
if (worstCaseOverhead) {
formatType = Delta::determineLengthFormat(key.size(), key.size(), valueLen);
} else {
formatType = Delta::determineLengthFormat(prefixLen, keySuffixLen, valueLen);
}
return 1 + Delta::LengthFormatSizes[formatType] + keySuffixLen + valueLen;
}
// commonPrefix between *this and base can be passed if known
int writeDelta(Delta& d, const RedwoodRecordRef& base, int keyPrefixLen = -1) const {
d.flags = value.present() ? Delta::HAS_VALUE : 0;
if (keyPrefixLen < 0) {
keyPrefixLen = getCommonPrefixLen(base, 0);
}
StringRef keySuffix = key.substr(keyPrefixLen);
int valueLen = value.present() ? value.get().size() : 0;
int formatType = Delta::determineLengthFormat(keyPrefixLen, keySuffix.size(), valueLen);
d.flags |= formatType;
switch (formatType) {
case 0:
d.LengthFormat0.prefixLength = keyPrefixLen;
d.LengthFormat0.suffixLength = keySuffix.size();
d.LengthFormat0.valueLength = valueLen;
break;
case 1:
d.LengthFormat1.prefixLength = keyPrefixLen;
d.LengthFormat1.suffixLength = keySuffix.size();
d.LengthFormat1.valueLength = valueLen;
break;
case 2:
d.LengthFormat2.prefixLength = keyPrefixLen;
d.LengthFormat2.suffixLength = keySuffix.size();
d.LengthFormat2.valueLength = valueLen;
break;
case 3:
default:
d.LengthFormat3.prefixLength = keyPrefixLen;
d.LengthFormat3.suffixLength = keySuffix.size();
d.LengthFormat3.valueLength = valueLen;
break;
}
uint8_t* wptr = d.data();
// Write value bytes
if (valueLen > 0) {
wptr = value.get().copyTo(wptr);
}
// Write key suffix string
wptr = keySuffix.copyTo(wptr);
return wptr - (uint8_t*)&d;
}
static std::string kvformat(StringRef s, int hexLimit = -1) {
bool hex = false;
for (auto c : s) {
if (!isprint(c)) {
hex = true;
break;
}
}
return hex ? s.toHexString(hexLimit) : s.toString();
}
std::string toString(bool leaf = true) const {
std::string r;
r += format("'%s' => ", key.printable().c_str());
if (value.present()) {
if (leaf) {
r += format("'%s'", kvformat(value.get()).c_str());
} else {
r += format("[%s]", ::toString(getChildPage()).c_str());
}
} else {
r += "(absent)";
}
return r;
}
};
struct BTreePage {
typedef DeltaTree2<RedwoodRecordRef> BinaryTree;
typedef DeltaTree2<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly> ValueTree;
#pragma pack(push, 1)
struct {
// treeOffset allows for newer versions to have additional fields but older code to read them
uint8_t treeOffset;
uint8_t height;
uint32_t kvBytes;
};
#pragma pack(pop)
void init(unsigned int height, unsigned int kvBytes) {
treeOffset = sizeof(BTreePage);
this->height = height;
this->kvBytes = kvBytes;
}
int size() const { return treeOffset + tree()->size(); }
uint8_t* treeBuffer() const { return (uint8_t*)this + treeOffset; }
BinaryTree* tree() { return (BinaryTree*)treeBuffer(); }
BinaryTree* tree() const { return (BinaryTree*)treeBuffer(); }
ValueTree* valueTree() const { return (ValueTree*)treeBuffer(); }
bool isLeaf() const { return height == 1; }
std::string toString(const char* context,
BTreeNodeLinkRef id,
Version ver,
const RedwoodRecordRef& lowerBound,
const RedwoodRecordRef& upperBound) const {
std::string r;
r += format("BTreePage op=%s %s @%" PRId64
" ptr=%p height=%d count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n",
context,
::toString(id).c_str(),
ver,
this,
height,
(int)tree()->numItems,
(int)kvBytes,
lowerBound.toString(false).c_str(),
upperBound.toString(false).c_str());
try {
if (tree()->numItems > 0) {
// This doesn't use the cached reader for the page because it is only for debugging purposes,
// a cached reader may not exist
BinaryTree::Cursor c(makeReference<BinaryTree::DecodeCache>(lowerBound, upperBound), tree());
c.moveFirst();
ASSERT(c.valid());
do {
r += " ";
r += c.get().toString(height == 1);
// Out of range entries are annotated but can actually be valid, as they can be the result of
// subtree deletion followed by incremental insertions of records in the deleted range being added
// to an adjacent subtree which is logically expanded encompass the deleted range but still is using
// the original subtree boundaries as DeltaTree2 boundaries.
bool tooLow = c.get().withoutValue() < lowerBound.withoutValue();
bool tooHigh = c.get().withoutValue() >= upperBound.withoutValue();
if (tooLow || tooHigh) {
if (tooLow) {
r += " (below decode lower bound)";
}
if (tooHigh) {
r += " (at or above decode upper bound)";
}
}
r += "\n";
} while (c.moveNext());
}
} catch (Error& e) {
debug_printf("BTreePage::toString ERROR: %s\n", e.what());
debug_printf("BTreePage::toString partial result: %s\n", r.c_str());
throw;
}
// All appends to r end in a linefeed, remove the final one.
r.resize(r.size() - 1);
return r;
}
};
struct BoundaryRefAndPage {
Standalone<RedwoodRecordRef> lowerBound;
Reference<ArenaPage> firstPage;
std::vector<Reference<ArenaPage>> extPages;
std::string toString() const {
return format("[%s, %d pages]", lowerBound.toString().c_str(), extPages.size() + (firstPage ? 1 : 0));
}
};
// DecodeBoundaryVerifier provides simulation-only verification of DeltaTree boundaries between
// reads and writes by using a static structure to track boundaries used during DeltaTree generation
// for all writes and updates across cold starts and virtual process restarts.
class DecodeBoundaryVerifier {
struct DecodeBoundaries {
Key lower;
Key upper;
unsigned int height;
Optional<int64_t> domainId;
bool empty() const { return lower.empty() && upper.empty(); }
};
typedef std::map<Version, DecodeBoundaries> BoundariesByVersion;
std::unordered_map<LogicalPageID, BoundariesByVersion> boundariesByPageID;
int boundarySampleSize = 1000;
int boundaryPopulation = 0;
Reference<IPageEncryptionKeyProvider> keyProvider;
public:
std::vector<Key> boundarySamples;
// Sample rate of pages to be scanned to verify if all entries in the page meet domain prefix requirement.
double domainPrefixScanProbability = 0.01;
uint64_t domainPrefixScanCount = 0;
static DecodeBoundaryVerifier* getVerifier(std::string name) {
static std::map<std::string, DecodeBoundaryVerifier> verifiers;
// Only use verifier in a non-restarted simulation so that all page writes are captured
if (g_network->isSimulated() && !g_simulator->restarted) {
return &verifiers[name];
}
return nullptr;
}
void setKeyProvider(Reference<IPageEncryptionKeyProvider> kp) { keyProvider = kp; }
void sampleBoundary(Key b) {
if (boundaryPopulation <= boundarySampleSize) {
boundarySamples.push_back(b);
} else if (deterministicRandom()->random01() < ((double)boundarySampleSize / boundaryPopulation)) {
boundarySamples[deterministicRandom()->randomInt(0, boundarySampleSize)] = b;
}
++boundaryPopulation;
}
Key getSample() const {
if (boundarySamples.empty()) {
return Key();
}
return deterministicRandom()->randomChoice(boundarySamples);
}
bool update(BTreeNodeLinkRef id,
Version v,
Key lowerBound,
Key upperBound,
unsigned int height,
Optional<int64_t> domainId) {
sampleBoundary(lowerBound);
sampleBoundary(upperBound);
debug_printf("decodeBoundariesUpdate %s %s '%s' to '%s', %u, %s\n",
::toString(id).c_str(),
::toString(v).c_str(),
lowerBound.printable().c_str(),
upperBound.printable().c_str(),
height,
Traceable<decltype(domainId)>::toString(domainId).c_str());
if (domainId.present()) {
ASSERT(keyProvider && keyProvider->enableEncryptionDomain());
// Temporarily disabling the check, since if a tenant is removed, where the key provider
// would not find the domain, the data for the tenant may still be in Redwood and being read.
// TODO(yiwu): re-enable the check.
/*
if (domainId.get() != keyProvider->getDefaultEncryptionDomainId() &&
!keyProvider->keyFitsInDomain(domainId.get(), lowerBound, false)) {
fprintf(stderr,
"Page lower bound not in domain: %s %s, domain id %s, lower bound '%s'\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
lowerBound.printable().c_str());
return false;
}
*/
}
auto& b = boundariesByPageID[id.front()][v];
ASSERT(b.empty());
b = { lowerBound, upperBound, height, domainId };
return true;
}
bool verify(LogicalPageID id,
Version v,
Key lowerBound,
Key upperBound,
Optional<int64_t> domainId,
BTreePage::BinaryTree::Cursor& cursor) {
auto i = boundariesByPageID.find(id);
ASSERT(i != boundariesByPageID.end());
ASSERT(!i->second.empty());
auto b = i->second.upper_bound(v);
--b;
if (b->second.lower != lowerBound || b->second.upper != upperBound) {
fprintf(stderr,
"Boundary mismatch on %s %s\nUsing:\n\t'%s'\n\t'%s'\nWritten %s:\n\t'%s'\n\t'%s'\n",
::toString(id).c_str(),
::toString(v).c_str(),
lowerBound.printable().c_str(),
upperBound.printable().c_str(),
::toString(b->first).c_str(),
b->second.lower.printable().c_str(),
b->second.upper.printable().c_str());
return false;
}
if (!b->second.domainId.present()) {
ASSERT(!keyProvider || !keyProvider->enableEncryptionDomain());
ASSERT(!domainId.present());
} else {
ASSERT(keyProvider->enableEncryptionDomain());
if (b->second.domainId != domainId) {
fprintf(stderr,
"Page encrypted with incorrect domain: %s %s, using %s, written %s\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
::toString(b->second.domainId).c_str());
return false;
}
// Temporarily disabling the check, since if a tenant is removed, where the key provider
// would not find the domain, the data for the tenant may still be in Redwood and being read.
// TODO(yiwu): re-enable the check.
/*
ASSERT(domainId.present());
auto checkKeyFitsInDomain = [&]() -> bool {
if (!keyProvider->keyFitsInDomain(domainId.get(), cursor.get().key, b->second.height > 1)) {
fprintf(stderr,
"Encryption domain mismatch on %s, %s, domain: %s, key %s\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
cursor.get().key.printable().c_str());
return false;
}
return true;
};
if (domainId.get() != keyProvider->getDefaultEncryptionDomainId()) {
cursor.moveFirst();
if (cursor.valid() && !checkKeyFitsInDomain()) {
return false;
}
cursor.moveLast();
if (cursor.valid() && !checkKeyFitsInDomain()) {
return false;
}
} else {
if (deterministicRandom()->random01() < domainPrefixScanProbability) {
cursor.moveFirst();
while (cursor.valid()) {
if (!checkKeyFitsInDomain()) {
return false;
}
cursor.moveNext();
}
domainPrefixScanCount++;
}
}
*/
}
return true;
}
void updatePageId(Version v, LogicalPageID oldID, LogicalPageID newID) {
auto& old = boundariesByPageID[oldID];
ASSERT(!old.empty());
auto i = old.end();
--i;
boundariesByPageID[newID][v] = i->second;
debug_printf("decodeBoundariesUpdate copy %s %s to %s '%s' to '%s'\n",
::toString(v).c_str(),
::toString(oldID).c_str(),
::toString(newID).c_str(),
i->second.lower.printable().c_str(),
i->second.upper.printable().c_str());
}
void removeAfterVersion(Version version) {
auto i = boundariesByPageID.begin();
while (i != boundariesByPageID.end()) {
auto v = i->second.upper_bound(version);
while (v != i->second.end()) {
debug_printf("decodeBoundariesUpdate remove %s %s '%s' to '%s'\n",
::toString(v->first).c_str(),
::toString(i->first).c_str(),
v->second.lower.printable().c_str(),
v->second.upper.printable().c_str());
v = i->second.erase(v);
}
if (i->second.empty()) {
debug_printf("decodeBoundariesUpdate remove empty map for %s\n", ::toString(i->first).c_str());
i = boundariesByPageID.erase(i);
} else {
++i;
}
}
}
};
class VersionedBTree {
public:
// The first possible internal record possible in the tree
static RedwoodRecordRef dbBegin;
// A record which is greater than the last possible record in the tree
static RedwoodRecordRef dbEnd;
struct LazyClearQueueEntry {
uint8_t height;
Version version;
BTreeNodeLink pageID;
bool operator<(const LazyClearQueueEntry& rhs) const { return version < rhs.version; }
int readFromBytes(const uint8_t* src) {
height = *(uint8_t*)src;
src += sizeof(uint8_t);
version = *(Version*)src;
src += sizeof(Version);
int count = *src++;
pageID = BTreeNodeLinkRef((LogicalPageID*)src, count);
return bytesNeeded();
}
int bytesNeeded() const {
return sizeof(uint8_t) + sizeof(Version) + 1 + (pageID.size() * sizeof(LogicalPageID));
}
int writeToBytes(uint8_t* dst) const {
*(uint8_t*)dst = height;
dst += sizeof(uint8_t);
*(Version*)dst = version;
dst += sizeof(Version);
*dst++ = pageID.size();
memcpy(dst, pageID.begin(), pageID.size() * sizeof(LogicalPageID));
return bytesNeeded();
}
std::string toString() const { return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); }
};
typedef FIFOQueue<LazyClearQueueEntry> LazyClearQueueT;
struct ParentInfo {
ParentInfo() {
count = 0;
bits = 0;
}
void clear() {
count = 0;
bits = 0;
}
static uint32_t mask(LogicalPageID id) { return 1 << (id & 31); }
void pageUpdated(LogicalPageID child) {
auto m = mask(child);
if ((bits & m) == 0) {
bits |= m;
++count;
}
}
bool maybeUpdated(LogicalPageID child) const { return (mask(child) & bits) != 0; }
uint32_t bits;
int count;
};
typedef std::unordered_map<LogicalPageID, ParentInfo> ParentInfoMapT;
struct BTreeCommitHeader {
constexpr static FileIdentifier file_identifier = 10847329;
constexpr static unsigned int FORMAT_VERSION = 17;
// Maximum size of the root pointer
constexpr static int maxRootPointerSize = 3000 / sizeof(LogicalPageID);
// This serves as the format version for the entire tree, individual pages will not be versioned
uint32_t formatVersion;
EncodingType encodingType;
uint8_t height;
LazyClearQueueT::QueueState lazyDeleteQueue;
BTreeNodeLink root;
std::string toString() {
return format("{formatVersion=%d height=%d root=%s lazyDeleteQueue=%s}",
(int)formatVersion,
(int)height,
::toString(root).c_str(),
lazyDeleteQueue.toString().c_str());
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, formatVersion, encodingType, height, lazyDeleteQueue, root);
}
};
// All async opts on the btree are based on pager reads, writes, and commits, so
// we can mostly forward these next few functions to the pager
Future<Void> getError() const { return m_pager->getError(); }
Future<Void> onClosed() const { return m_pager->onClosed(); }
void close_impl(bool dispose) {
auto* pager = m_pager;
delete this;
if (dispose)
pager->dispose();
else
pager->close();
}
void dispose() { return close_impl(true); }
void close() { return close_impl(false); }
StorageBytes getStorageBytes() const { return m_pager->getStorageBytes(); }
// Set key to value as of the next commit
// The new value is not readable until after the next commit is completed.
void set(KeyValueRef keyValue) {
++m_mutationCount;
++g_redwoodMetrics.metric.opSet;
g_redwoodMetrics.metric.opSetKeyBytes += keyValue.key.size();
g_redwoodMetrics.metric.opSetValueBytes += keyValue.value.size();
m_pBuffer->insert(keyValue.key).mutation().setBoundaryValue(m_pBuffer->copyToArena(keyValue.value));
}
void clear(KeyRangeRef clearedRange) {
++m_mutationCount;
// Optimization for single key clears to create just one mutation boundary instead of two
if (clearedRange.begin.size() == clearedRange.end.size() - 1 &&
clearedRange.end[clearedRange.end.size() - 1] == 0 && clearedRange.end.startsWith(clearedRange.begin)) {
++g_redwoodMetrics.metric.opClear;
++g_redwoodMetrics.metric.opClearKey;
m_pBuffer->insert(clearedRange.begin).mutation().clearBoundary();
return;
}
++g_redwoodMetrics.metric.opClear;
MutationBuffer::iterator iBegin = m_pBuffer->insert(clearedRange.begin);
MutationBuffer::iterator iEnd = m_pBuffer->insert(clearedRange.end);
iBegin.mutation().clearAll();
++iBegin;
m_pBuffer->erase(iBegin, iEnd);
}
void setOldestReadableVersion(Version v) { m_newOldestVersion = v; }
Version getOldestReadableVersion() const { return m_pager->getOldestReadableVersion(); }
Version getLastCommittedVersion() const { return m_pager->getLastCommittedVersion(); }
// VersionedBTree takes ownership of pager
VersionedBTree(IPager2* pager,
std::string name,
EncodingType defaultEncodingType,
Reference<IPageEncryptionKeyProvider> keyProvider)
: m_pager(pager), m_encodingType(defaultEncodingType), m_enforceEncodingType(false), m_keyProvider(keyProvider),
m_pBuffer(nullptr), m_mutationCount(0), m_name(name) {
// For encrypted encoding types, enforce that BTree nodes read from disk use the default encoding type
// This prevents an attack where an encrypted page is replaced by an attacker with an unencrypted page
// or an encrypted page fabricated using a compromised scheme.
if (ArenaPage::isEncodingTypeEncrypted(m_encodingType)) {
ASSERT(keyProvider.isValid());
m_enforceEncodingType = true;
}
// If key provider isn't given, instantiate the null provider
if (!m_keyProvider) {
m_keyProvider = makeReference<NullKeyProvider>();
}
m_pBoundaryVerifier = DecodeBoundaryVerifier::getVerifier(name);
if (m_pBoundaryVerifier != nullptr) {
m_pBoundaryVerifier->setKeyProvider(m_keyProvider);
}
m_pDecodeCacheMemory = m_pager->getPageCachePenaltySource();
m_lazyClearActor = 0;
m_init = init_impl(this);
m_latestCommit = m_init;
}
ACTOR static Future<Reference<ArenaPage>> makeEmptyRoot(VersionedBTree* self) {
state Reference<ArenaPage> page = self->m_pager->newPageBuffer();
page->init(self->m_encodingType, PageType::BTreeNode, 1);
if (page->isEncrypted()) {
ArenaPage::EncryptionKey k = wait(self->m_keyProvider->getLatestDefaultEncryptionKey());
page->encryptionKey = k;
}
BTreePage* btpage = (BTreePage*)page->mutateData();
btpage->init(1, 0);
btpage->tree()->build(page->dataSize() - sizeof(BTreePage), nullptr, nullptr, nullptr, nullptr);
return page;
}
void toTraceEvent(TraceEvent& e) const {
m_pager->toTraceEvent(e);
m_lazyClearQueue.toTraceEvent(e, "LazyClearQueue");
}
ACTOR static Future<int> incrementalLazyClear(VersionedBTree* self) {
ASSERT(self->m_lazyClearActor.isReady());
self->m_lazyClearStop = false;
// TODO: Is it contractually okay to always to read at the latest version?
state Reference<IPagerSnapshot> snapshot =
self->m_pager->getReadSnapshot(self->m_pager->getLastCommittedVersion());
state int freedPages = 0;
loop {
state int toPop = SERVER_KNOBS->REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES;
state std::vector<std::pair<LazyClearQueueEntry, Future<Reference<const ArenaPage>>>> entries;
entries.reserve(toPop);
// Take up to batchSize pages from front of queue
while (toPop > 0) {
Optional<LazyClearQueueEntry> q = wait(self->m_lazyClearQueue.pop());
debug_printf("LazyClear: popped %s\n", toString(q).c_str());
if (!q.present()) {
break;
}
// Start reading the page, without caching
entries.emplace_back(q.get(),
self->readPage(self,
PagerEventReasons::LazyClear,
q.get().height,
snapshot.getPtr(),
q.get().pageID,
ioLeafPriority,
true,
false));
--toPop;
}
state int i;
for (i = 0; i < entries.size(); ++i) {
Reference<const ArenaPage> p = wait(entries[i].second);
const LazyClearQueueEntry& entry = entries[i].first;
const BTreePage& btPage = *(const BTreePage*)p->data();
ASSERT(btPage.height == entry.height);
auto& metrics = g_redwoodMetrics.level(entry.height).metrics;
debug_printf("LazyClear: processing %s\n", toString(entry).c_str());
// Level 1 (leaf) nodes should never be in the lazy delete queue
ASSERT(entry.height > 1);
// Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses
// RedwoodRecordRef::DeltaValueOnly as the delta type type to skip key decoding
BTreePage::ValueTree::Cursor c(makeReference<BTreePage::ValueTree::DecodeCache>(dbBegin, dbEnd),
btPage.valueTree());
ASSERT(c.moveFirst());
Version v = entry.version;
while (1) {
if (c.get().value.present()) {
BTreeNodeLinkRef btChildPageID = c.get().getChildPage();
// If this page is height 2, then the children are leaves so free them directly
if (entry.height == 2) {
debug_printf("LazyClear: freeing leaf child %s\n", toString(btChildPageID).c_str());
self->freeBTreePage(1, btChildPageID, v);
freedPages += btChildPageID.size();
metrics.lazyClearFree += 1;
metrics.lazyClearFreeExt += (btChildPageID.size() - 1);
} else {
// Otherwise, queue them for lazy delete.
debug_printf("LazyClear: queuing child %s\n", toString(btChildPageID).c_str());
self->m_lazyClearQueue.pushFront(
LazyClearQueueEntry{ (uint8_t)(entry.height - 1), v, btChildPageID });
metrics.lazyClearRequeue += 1;
metrics.lazyClearRequeueExt += (btChildPageID.size() - 1);
}
}
if (!c.moveNext()) {
break;
}
}
// Free the page, now that its children have either been freed or queued
debug_printf("LazyClear: freeing queue entry %s\n", toString(entry.pageID).c_str());
self->freeBTreePage(entry.height, entry.pageID, v);
freedPages += entry.pageID.size();
metrics.lazyClearFree += 1;
metrics.lazyClearFreeExt += entry.pageID.size() - 1;
}
// Stop if
// - the poppable items in the queue have already been exhausted
// - stop flag is set and we've freed the minimum number of pages required
// - maximum number of pages to free met or exceeded
if (toPop > 0 || (freedPages >= SERVER_KNOBS->REDWOOD_LAZY_CLEAR_MIN_PAGES && self->m_lazyClearStop) ||
(freedPages >= SERVER_KNOBS->REDWOOD_LAZY_CLEAR_MAX_PAGES)) {
break;
}
}
debug_printf("LazyClear: freed %d pages, %s has %" PRId64 " entries\n",
freedPages,
self->m_lazyClearQueue.name.c_str(),
self->m_lazyClearQueue.numEntries);
return freedPages;
}
ACTOR static Future<Void> init_impl(VersionedBTree* self) {
wait(self->m_pager->init());
self->m_pBuffer.reset(new MutationBuffer());
self->m_blockSize = self->m_pager->getLogicalPageSize();
self->m_newOldestVersion = self->m_pager->getOldestReadableVersion();
debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n",
self->getLastCommittedVersion(),
self->m_newOldestVersion);
// Clear any changes that occurred after the latest committed version
if (self->m_pBoundaryVerifier != nullptr) {
self->m_pBoundaryVerifier->removeAfterVersion(self->getLastCommittedVersion());
}
state Value btreeHeader = self->m_pager->getCommitRecord();
if (btreeHeader.size() == 0) {
// Create new BTree
self->m_header.formatVersion = BTreeCommitHeader::FORMAT_VERSION;
self->m_header.encodingType = self->m_encodingType;
self->m_header.height = 1;
LogicalPageID id = wait(self->m_pager->newPageID());
self->m_header.root = BTreeNodeLinkRef((LogicalPageID*)&id, 1);
debug_printf("new root %s\n", toString(self->m_header.root).c_str());
Reference<ArenaPage> page = wait(makeEmptyRoot(self));
// Newly allocated page so logical id = physical id and it's a new empty root so no parent
page->setLogicalPageInfo(self->m_header.root.front(), invalidLogicalPageID);
self->m_pager->updatePage(PagerEventReasons::MetaData, nonBtreeLevel, self->m_header.root, page);
LogicalPageID newQueuePage = wait(self->m_pager->newPageID());
self->m_lazyClearQueue.create(
self->m_pager, newQueuePage, "LazyClearQueue", self->m_pager->newLastQueueID(), false);
self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState();
debug_printf("BTree created (but not committed)\n");
} else {
self->m_header = ObjectReader::fromStringRef<BTreeCommitHeader>(btreeHeader, Unversioned());
if (self->m_header.formatVersion != BTreeCommitHeader::FORMAT_VERSION) {
Error e = unsupported_format_version();
TraceEvent(SevWarn, "RedwoodBTreeVersionUnsupported")
.error(e)
.detail("Version", self->m_header.formatVersion)
.detail("ExpectedVersion", BTreeCommitHeader::FORMAT_VERSION);
throw e;
}
self->m_lazyClearQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyClearQueueRecovered");
debug_printf("BTree recovered.\n");
if (ArenaPage::isEncodingTypeEncrypted(self->m_header.encodingType) &&
self->m_encodingType == EncodingType::XXHash64) {
// On restart the encryption config of the cluster could be unknown. In that case if we find the Redwood
// instance is encrypted, we should use the same encryption encoding.
self->m_encodingType = self->m_header.encodingType;
self->m_enforceEncodingType = true;
TraceEvent("RedwoodBTreeNodeForceEncryption")
.detail("InstanceName", self->m_pager->getName())
.detail("EncodingFound", self->m_header.encodingType)
.detail("EncodingDesired", self->m_encodingType);
}
if (self->m_header.encodingType != self->m_encodingType) {
TraceEvent(SevWarn, "RedwoodBTreeNodeEncodingMismatch")
.detail("InstanceName", self->m_pager->getName())
.detail("EncodingFound", self->m_header.encodingType)
.detail("EncodingDesired", self->m_encodingType);
}
}
self->m_lazyClearActor = 0;
TraceEvent e(SevInfo, "RedwoodRecoveredBTree");
e.detail("FileName", self->m_name);
e.detail("OpenedExisting", btreeHeader.size() != 0);
e.detail("LatestVersion", self->m_pager->getLastCommittedVersion());
self->m_lazyClearQueue.toTraceEvent(e, "LazyClearQueue");
e.log();
debug_printf("Recovered btree at version %" PRId64 ": %s\n",
self->m_pager->getLastCommittedVersion(),
self->m_header.toString().c_str());
return Void();
}
Future<Void> init() { return m_init; }
virtual ~VersionedBTree() {
// DecodeBoundaryVerifier objects outlive simulated processes.
// Thus, if we did not clear the key providers here, each DecodeBoundaryVerifier object might
// maintain references to untracked peers through its key provider. This would result in
// errors when FlowTransport::removePeerReference is called to remove a peer that is no
// longer tracked by FlowTransport::transport().
if (m_pBoundaryVerifier != nullptr) {
m_pBoundaryVerifier->setKeyProvider(Reference<IPageEncryptionKeyProvider>());
}
// This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe,
// it will cancel init and commit and leave the pager alive but with potentially an incomplete set of
// uncommitted writes so it should not be committed.
m_init.cancel();
m_latestCommit.cancel();
}
Future<Void> commit(Version v) {
// Replace latest commit with a new one which waits on the old one
m_latestCommit = commit_impl(this, v, m_latestCommit);
return m_latestCommit;
}
// Clear all btree data, allow pager remap to fully process its queue, and verify final
// page counts in pager and queues.
ACTOR static Future<Void> clearAllAndCheckSanity_impl(VersionedBTree* self) {
// Clear and commit
debug_printf("Clearing tree.\n");
self->clear(KeyRangeRef(dbBegin.key, dbEnd.key));
wait(self->commit(self->getLastCommittedVersion() + 1));
// Loop commits until the the lazy delete queue is completely processed.
loop {
wait(self->commit(self->getLastCommittedVersion() + 1));
// If the lazy delete queue is completely processed then the last time the lazy delete actor
// was started it, after the last commit, it would exist immediately and do no work, so its
// future would be ready and its value would be 0.
if (self->m_lazyClearActor.isReady() && self->m_lazyClearActor.get() == 0) {
break;
}
}
// The lazy delete queue should now be empty and contain only the new page to start writing to
// on the next commit.
LazyClearQueueT::QueueState s = self->m_lazyClearQueue.getState();
ASSERT(s.numEntries == 0);
ASSERT(s.numPages == 1);
// The btree should now be a single non-oversized root page.
ASSERT(self->m_header.height == 1);
ASSERT(self->m_header.root.size() == 1);
// Let pager do more commits to finish all cleanup of old pages
wait(self->m_pager->clearRemapQueue());
TraceEvent e("RedwoodDestructiveSanityCheck");
self->toTraceEvent(e);
e.log();
// From the pager's perspective the only pages that should be in use are the btree root and
// the previously mentioned lazy delete queue page.
int64_t userPageCount = wait(self->m_pager->getUserPageCount());
debug_printf("clearAllAndCheckSanity: userPageCount: %" PRId64 "\n", userPageCount);
ASSERT(userPageCount == 2);
return Void();
}
Future<Void> clearAllAndCheckSanity() { return clearAllAndCheckSanity_impl(this); }
private:
// Represents a change to a single key - set, clear, or atomic op
struct SingleKeyMutation {
// Clear
SingleKeyMutation() : op(MutationRef::ClearRange) {}
// Set
SingleKeyMutation(Value val) : op(MutationRef::SetValue), value(val) {}
// Atomic Op
SingleKeyMutation(MutationRef::Type op, Value val) : op(op), value(val) {}
MutationRef::Type op;
Value value;
inline bool isClear() const { return op == MutationRef::ClearRange; }
inline bool isSet() const { return op == MutationRef::SetValue; }
inline bool isAtomicOp() const { return !isSet() && !isClear(); }
inline bool equalToSet(ValueRef val) { return isSet() && value == val; }
inline RedwoodRecordRef toRecord(KeyRef userKey) const {
// No point in serializing an atomic op, it needs to be coalesced to a real value.
ASSERT(!isAtomicOp());
if (isClear())
return RedwoodRecordRef(userKey);
return RedwoodRecordRef(userKey, value);
}
std::string toString() const { return format("op=%d val='%s'", op, printable(value).c_str()); }
};
struct RangeMutation {
RangeMutation() : boundaryChanged(false), clearAfterBoundary(false) {}
bool boundaryChanged;
Optional<ValueRef> boundaryValue; // Not present means cleared
bool clearAfterBoundary;
bool boundaryCleared() const { return boundaryChanged && !boundaryValue.present(); }
bool boundarySet() const { return boundaryChanged && boundaryValue.present(); }
// Returns true if this RangeMutation doesn't actually mutate anything
bool noChanges() const { return !boundaryChanged && !clearAfterBoundary; }
void clearBoundary() {
boundaryChanged = true;
boundaryValue.reset();
}
void clearAll() {
clearBoundary();
clearAfterBoundary = true;
}
void setBoundaryValue(ValueRef v) {
boundaryChanged = true;
boundaryValue = v;
}
std::string toString() const {
return format("boundaryChanged=%d clearAfterBoundary=%d boundaryValue=%s",
boundaryChanged,
clearAfterBoundary,
::toString(boundaryValue).c_str());
}
};
public:
#include "fdbserver/ArtMutationBuffer.h"
struct MutationBufferStdMap {
MutationBufferStdMap() {
// Create range representing the entire keyspace. This reduces edge cases to applying mutations
// because now all existing keys are within some range in the mutation map.
mutations[dbBegin.key];
// Setting the dbEnd key to be cleared prevents having to treat a range clear to dbEnd as a special
// case in order to avoid traversing down the rightmost edge of the tree.
mutations[dbEnd.key].clearBoundary();
}
private:
typedef std::map<KeyRef, RangeMutation> MutationsT;
Arena arena;
MutationsT mutations;
public:
struct iterator : public MutationsT::iterator {
typedef MutationsT::iterator Base;
iterator() = default;
iterator(const MutationsT::iterator& i) : Base(i) {}
const KeyRef& key() { return (*this)->first; }
RangeMutation& mutation() { return (*this)->second; }
};
struct const_iterator : public MutationsT::const_iterator {
typedef MutationsT::const_iterator Base;
const_iterator() = default;
const_iterator(const MutationsT::const_iterator& i) : Base(i) {}
const_iterator(const MutationsT::iterator& i) : Base(i) {}
const KeyRef& key() { return (*this)->first; }
const RangeMutation& mutation() { return (*this)->second; }
};
// Return a T constructed in arena
template <typename T>
T copyToArena(const T& object) {
return T(arena, object);
}
const_iterator upper_bound(const KeyRef& k) const { return mutations.upper_bound(k); }
const_iterator lower_bound(const KeyRef& k) const { return mutations.lower_bound(k); }
// erase [begin, end) from the mutation map
void erase(const const_iterator& begin, const const_iterator& end) { mutations.erase(begin, end); }
// Find or create a mutation buffer boundary for bound and return an iterator to it
iterator insert(KeyRef boundary) {
// Find the first split point in buffer that is >= key
// Since the initial state of the mutation buffer contains the range '' through
// the maximum possible key, our search had to have found something so we
// can assume the iterator is valid.
iterator ib = mutations.lower_bound(boundary);
// If we found the boundary we are looking for, return its iterator
if (ib.key() == boundary) {
return ib;
}
// ib is our insert hint. Copy boundary into arena and insert boundary into buffer
boundary = KeyRef(arena, boundary);
ib = mutations.insert(ib, { boundary, RangeMutation() });
// ib is certainly > begin() because it is guaranteed that the empty string
// boundary exists and the only way to have found that is to look explicitly
// for it in which case we would have returned above.
iterator iPrevious = ib;
--iPrevious;
// If the range we just divided was being cleared, then the dividing boundary key and range after it must
// also be cleared
if (iPrevious.mutation().clearAfterBoundary) {
ib.mutation().clearAll();
}
return ib;
}
};
#define USE_ART_MUTATION_BUFFER 1
#ifdef USE_ART_MUTATION_BUFFER
typedef struct MutationBufferART MutationBuffer;
#else
typedef struct MutationBufferStdMap MutationBuffer;
#endif
private:
/* Mutation Buffer Overview
*
* This structure's organization is meant to put pending updates for the btree in an order
* that makes it efficient to query all pending mutations across all pending versions which are
* relevant to a particular subtree of the btree.
*
* At the top level, it is a map of the start of a range being modified to a RangeMutation.
* The end of the range is map key (which is the next range start in the map).
*
* - The buffer starts out with keys '' and endKVV.key already populated.
*
* - When a new key is inserted into the buffer map, it is by definition
* splitting an existing range so it should take on the rangeClearVersion of
* the immediately preceding key which is the start of that range
*
* - Keys are inserted into the buffer map for every individual operation (set/clear/atomic)
* key and for both the start and end of a range clear.
*
* - To apply a single clear, add it to the individual ops only if the last entry is not also a clear.
*
* - To apply a range clear, after inserting the new range boundaries do the following to the start
* boundary and all successive boundaries < end
* - set the range clear version if not already set
* - add a clear to the startKeyMutations if the final entry is not a clear.
*
* - Note that there are actually TWO valid ways to represent
* set c = val1 at version 1
* clear c\x00 to z at version 2
* with this model. Either
* c = { rangeClearVersion = 2, startKeyMutations = { 1 => val1 }
* z = { rangeClearVersion = <not present>, startKeyMutations = {}
* OR
* c = { rangeClearVersion = <not present>, startKeyMutations = { 1 => val1 }
* c\x00 = { rangeClearVersion = 2, startKeyMutations = { 2 => <not present> }
* z = { rangeClearVersion = <not present>, startKeyMutations = {}
*
* This is because the rangeClearVersion applies to a range begining with the first
* key AFTER the start key, so that the logic for reading the start key is more simple
* as it only involves consulting startKeyMutations. When adding a clear range, the
* boundary key insert/split described above is valid, and is what is currently done,
* but it would also be valid to see if the last key before startKey is equal to
* keyBefore(startKey), and if so that mutation buffer boundary key can be used instead
* without adding an additional key to the buffer.
* TODO: A possible optimization here could be to only use existing btree leaf page boundaries as keys,
* with mutation point keys being stored in an unsorted strucutre under those boundary map keys,
* to be sorted later just before being merged into the existing leaf page.
*/
IPager2* m_pager;
EncodingType m_encodingType;
bool m_enforceEncodingType;
Reference<IPageEncryptionKeyProvider> m_keyProvider;
// Counter to update with DecodeCache memory usage
int64_t* m_pDecodeCacheMemory = nullptr;
// The mutation buffer currently being written to
std::unique_ptr<MutationBuffer> m_pBuffer;
int64_t m_mutationCount;
DecodeBoundaryVerifier* m_pBoundaryVerifier;
struct CommitBatch {
Version readVersion;
Version writeVersion;
Version newOldestVersion;
std::unique_ptr<MutationBuffer> mutations;
int64_t mutationCount;
Reference<IPagerSnapshot> snapshot;
};
Version m_newOldestVersion;
Future<Void> m_latestCommit;
Future<Void> m_init;
std::string m_name;
int m_blockSize;
ParentInfoMapT childUpdateTracker;
BTreeCommitHeader m_header;
LazyClearQueueT m_lazyClearQueue;
Future<int> m_lazyClearActor;
bool m_lazyClearStop;
// Describes a range of a vector of records that should be built into a single BTreePage
struct PageToBuild {
PageToBuild(int index,
int blockSize,
EncodingType encodingType,
unsigned int height,
bool useEncryptionDomain,
bool splitByDomain,
IPageEncryptionKeyProvider* keyProvider)
: startIndex(index), count(0), pageSize(blockSize),
largeDeltaTree(pageSize > BTreePage::BinaryTree::SmallSizeLimit), blockSize(blockSize), blockCount(1),
kvBytes(0), encodingType(encodingType), height(height), useEncryptionDomain(useEncryptionDomain),
splitByDomain(splitByDomain), keyProvider(keyProvider) {
// Subtrace Page header overhead, BTreePage overhead, and DeltaTree (BTreePage::BinaryTree) overhead.
bytesLeft =
ArenaPage::getUsableSize(blockSize, encodingType) - sizeof(BTreePage) - sizeof(BTreePage::BinaryTree);
}
PageToBuild next() {
return PageToBuild(
endIndex(), blockSize, encodingType, height, useEncryptionDomain, splitByDomain, keyProvider);
}
int startIndex; // Index of the first record
int count; // Number of records added to the page
int pageSize; // Page or Multipage size required to hold a BTreePage of the added records, which is a multiple
// of blockSize
int bytesLeft; // Bytes in pageSize that are unused by the BTreePage so far
bool largeDeltaTree; // Whether or not the tree in the generated page is in the 'large' size range
int blockSize; // Base block size by which pageSize can be incremented
int blockCount; // The number of blocks in pageSize
int kvBytes; // The amount of user key/value bytes added to the page
EncodingType encodingType;
unsigned int height;
bool useEncryptionDomain;
bool splitByDomain;
IPageEncryptionKeyProvider* keyProvider;
Optional<int64_t> domainId = Optional<int64_t>();
size_t domainPrefixLength = 0;
bool canUseDefaultDomain = false;
// Number of bytes used by the generated/serialized BTreePage, including all headers
int usedBytes() const { return pageSize - bytesLeft; }
// Used fraction of pageSize bytes
double usedFraction() const { return (double)usedBytes() / pageSize; }
// Unused fraction of pageSize bytes
double slackFraction() const { return (double)bytesLeft / pageSize; }
// Fraction of PageSize in use by key or value string bytes, disregarding all overhead including string sizes
double kvFraction() const { return (double)kvBytes / pageSize; }
// Index of the last record to be included in this page
int lastIndex() const { return endIndex() - 1; }
// Index of the first record NOT included in this page
int endIndex() const { return startIndex + count; }
std::string toString() const {
return format("{start=%d count=%d used %d/%d bytes (%.2f%% slack) kvBytes=%d blocks=%d blockSize=%d "
"large=%d, domain=%s}",
startIndex,
count,
usedBytes(),
pageSize,
slackFraction() * 100,
kvBytes,
blockCount,
blockSize,
largeDeltaTree,
::toString(domainId).c_str());
}
// Move an item from a to b if a has 2 or more items and the item fits in b
// a and b must be consecutive pages from the same array of records
static bool shiftItem(PageToBuild& a, PageToBuild& b, int deltaSize, int kvBytes) {
if (a.count < 2) {
return false;
}
// Size of the nodes in A and B, respectively
int aNodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(a.largeDeltaTree);
int bNodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(b.largeDeltaTree);
if (b.bytesLeft < bNodeSize) {
return false;
}
--a.count;
++b.count;
--b.startIndex;
a.bytesLeft += aNodeSize;
b.bytesLeft -= bNodeSize;
a.kvBytes -= kvBytes;
b.kvBytes += kvBytes;
return true;
}
// Try to add a record of the given delta size to the page.
// If force is true, the page will be expanded to make the record fit if needed.
// Return value is whether or not the record was added to the page.
bool addRecord(const RedwoodRecordRef& rec, const RedwoodRecordRef& nextRecord, int deltaSize, bool force) {
int nodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(largeDeltaTree);
// If the record doesn't fit and the page can't be expanded then return false
if (nodeSize > bytesLeft && !force) {
return false;
}
if (useEncryptionDomain) {
int64_t defaultDomainId = keyProvider->getDefaultEncryptionDomainId();
int64_t currentDomainId;
size_t prefixLength;
if (count == 0 || (splitByDomain && count > 0)) {
std::tie(currentDomainId, prefixLength) = keyProvider->getEncryptionDomain(rec.key, domainId);
}
if (count == 0) {
domainId = currentDomainId;
domainPrefixLength = prefixLength;
canUseDefaultDomain =
(height > 1 && (currentDomainId == defaultDomainId || prefixLength == rec.key.size()));
} else if (splitByDomain) {
ASSERT(domainId.present());
if (domainId == currentDomainId) {
// The new record falls in the same domain as the rest of the page.
// Since this is not the first record, the key must contain a non-prefix portion,
// so we cannot use the default domain the encrypt the page (unless domainId is the default
// domain).
if (domainId != defaultDomainId) {
ASSERT(prefixLength < rec.key.size());
canUseDefaultDomain = false;
}
} else if (canUseDefaultDomain &&
(currentDomainId == defaultDomainId ||
(prefixLength == rec.key.size() &&
(nextRecord.key.empty() ||
!nextRecord.key.startsWith(rec.key.substr(0, prefixLength)))))) {
// The new record meets one of the following conditions:
// 1. it falls in the default domain, or
// 2. its key contain only the domain prefix, and
// 2a. the following record doesn't fall in the same domain.
// In this case switch to use the default domain to encrypt the page.
// Condition 2a is needed, because if there are multiple records from the same domain,
// they need to form their own page(s).
domainId = defaultDomainId;
domainPrefixLength = 0;
} else {
// The new record doesn't fit in the same domain as the existing page.
return false;
}
} else {
ASSERT(domainPrefixLength < rec.key.size());
canUseDefaultDomain = false;
}
}
++count;
bytesLeft -= nodeSize;
kvBytes += rec.kvBytes();
// If needed, expand page so that record fits.
// This is a loop because the first expansion may increase per-node overhead which could
// then require a second expansion.
while (bytesLeft < 0) {
int newBlocks = (-bytesLeft + blockSize - 1) / blockSize;
int extraSpace = newBlocks * blockSize;
blockCount += newBlocks;
bytesLeft += extraSpace;
pageSize += extraSpace;
// If size has moved into the "large" range then every node has gotten bigger so adjust bytesLeft
if (!largeDeltaTree && pageSize > BTreePage::BinaryTree::SmallSizeLimit) {
largeDeltaTree = true;
bytesLeft -= (count * BTreePage::BinaryTree::LargeTreePerNodeExtraOverhead);
}
}
return true;
}
void finish() {
if (useEncryptionDomain && canUseDefaultDomain) {
domainId = keyProvider->getDefaultEncryptionDomainId();
}
}
};
// Scans a vector of records and decides on page split points, returning a vector of 1+ pages to build
std::vector<PageToBuild> splitPages(const RedwoodRecordRef* lowerBound,
const RedwoodRecordRef* upperBound,
int prefixLen,
VectorRef<RedwoodRecordRef> records,
unsigned int height) {
debug_printf("splitPages height=%d records=%d\n\tlowerBound=%s\n\tupperBound=%s\n",
height,
records.size(),
lowerBound->toString(false).c_str(),
upperBound->toString(false).c_str());
ASSERT(!records.empty());
// Leaves can have just one record if it's large, but internal pages should have at least 4
int minRecords = height == 1 ? 1 : 4;
double maxSlack = SERVER_KNOBS->REDWOOD_PAGE_REBUILD_MAX_SLACK;
RedwoodRecordRef emptyRecord;
std::vector<PageToBuild> pages;
// Whether encryption is used and we need to set encryption domain for a page.
bool useEncryptionDomain =
ArenaPage::isEncodingTypeEncrypted(m_encodingType) && m_keyProvider->enableEncryptionDomain();
// Whether we may need to split by encryption domain. It is mean to be an optimization to avoid
// unnecessary domain check and may not be exhaust all cases.
bool splitByDomain = false;
if (useEncryptionDomain && records.size() > 1) {
int64_t firstDomain = std::get<0>(m_keyProvider->getEncryptionDomain(records[0].key));
int64_t lastDomain = std::get<0>(m_keyProvider->getEncryptionDomain(records[records.size() - 1].key));
// If the two record falls in the same non-default domain, we know all the records fall in the
// same domain. Otherwise we may need to split pages by domain.
if (firstDomain != lastDomain || firstDomain == m_keyProvider->getDefaultEncryptionDomainId()) {
splitByDomain = true;
}
}
// deltaSizes contains pair-wise delta sizes for [lowerBound, records..., upperBound]
std::vector<int> deltaSizes(records.size() + 1);
deltaSizes.front() = records.front().deltaSize(*lowerBound, prefixLen, true);
deltaSizes.back() = records.back().deltaSize(*upperBound, prefixLen, true);
for (int i = 1; i < records.size(); ++i) {
deltaSizes[i] = records[i].deltaSize(records[i - 1], prefixLen, true);
}
PageToBuild p(
0, m_blockSize, m_encodingType, height, useEncryptionDomain, splitByDomain, m_keyProvider.getPtr());
for (int i = 0; i < records.size();) {
bool force = p.count < minRecords || p.slackFraction() > maxSlack;
if (i == 0 || p.count > 0) {
debug_printf(" before addRecord i=%d records=%d deltaSize=%d kvSize=%d force=%d pageToBuild=%s "
"record=%s",
i,
records.size(),
deltaSizes[i],
records[i].kvBytes(),
force,
p.toString().c_str(),
records[i].toString(height == 1).c_str());
}
if (!p.addRecord(records[i], i + 1 < records.size() ? records[i + 1] : emptyRecord, deltaSizes[i], force)) {
p.finish();
pages.push_back(p);
p = p.next();
} else {
i++;
}
}
if (p.count > 0) {
p.finish();
pages.push_back(p);
}
debug_printf(" Before shift: %s\n", ::toString(pages).c_str());
// If page count is > 1, try to balance slack between last two pages
// In simulation, disable this balance half the time to create more edge cases
// of underfilled pages
if (pages.size() > 1 && !(g_network->isSimulated() && deterministicRandom()->coinflip())) {
PageToBuild& a = pages[pages.size() - 2];
PageToBuild& b = pages.back();
// We can rebalance the two pages only if they are in the same encryption domain.
ASSERT(!useEncryptionDomain || (a.domainId.present() && b.domainId.present()));
if (!useEncryptionDomain || a.domainId.get() == b.domainId.get()) {
// While the last page page has too much slack and the second to last page
// has more than the minimum record count, shift a record from the second
// to last page to the last page.
while (b.slackFraction() > maxSlack && a.count > minRecords) {
int i = a.lastIndex();
if (!PageToBuild::shiftItem(a, b, deltaSizes[i], records[i].kvBytes())) {
break;
}
debug_printf(" After shifting i=%d: a=%s b=%s\n", i, a.toString().c_str(), b.toString().c_str());
}
}
}
return pages;
}
// Writes entries to 1 or more pages and return a vector of boundary keys with their ArenaPage(s)
ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> writePages(VersionedBTree* self,
const RedwoodRecordRef* lowerBound,
const RedwoodRecordRef* upperBound,
VectorRef<RedwoodRecordRef> entries,
unsigned int height,
Version v,
BTreeNodeLinkRef previousID,
LogicalPageID parentID) {
ASSERT(entries.size() > 0);
state Standalone<VectorRef<RedwoodRecordRef>> records;
// All records share the prefix shared by the lower and upper boundaries
state int prefixLen = lowerBound->getCommonPrefixLen(*upperBound);
// Whether encryption is used and we need to set encryption domain for a page.
state bool useEncryptionDomain =
ArenaPage::isEncodingTypeEncrypted(self->m_encodingType) && self->m_keyProvider->enableEncryptionDomain();
state std::vector<PageToBuild> pagesToBuild =
self->splitPages(lowerBound, upperBound, prefixLen, entries, height);
ASSERT(pagesToBuild.size() > 0);
debug_printf("splitPages returning %s\n", toString(pagesToBuild).c_str());
// Lower bound of the page being added to
state RedwoodRecordRef pageLowerBound = lowerBound->withoutValue();
state RedwoodRecordRef pageUpperBound;
state int sinceYield = 0;
state int pageIndex;
if (useEncryptionDomain) {
ASSERT(pagesToBuild[0].domainId.present());
int64_t domainId = pagesToBuild[0].domainId.get();
// We need to make sure we use the domain prefix as the page lower bound, for the first page
// of a non-default domain on a level. That way we ensure that pages for a domain form a full subtree
// (i.e. have a single root) in the B-tree.
if (domainId != self->m_keyProvider->getDefaultEncryptionDomainId() &&
!self->m_keyProvider->keyFitsInDomain(domainId, pageLowerBound.key, false)) {
pageLowerBound = RedwoodRecordRef(entries[0].key.substr(0, pagesToBuild[0].domainPrefixLength));
}
}
for (pageIndex = 0; pageIndex < pagesToBuild.size(); ++pageIndex) {
debug_printf("building page %d of %zu %s\n",
pageIndex + 1,
pagesToBuild.size(),
pagesToBuild[pageIndex].toString().c_str());
ASSERT(pagesToBuild[pageIndex].count != 0);
// Use the next entry as the upper bound, or upperBound if there are no more entries beyond this page
int endIndex = pagesToBuild[pageIndex].endIndex();
bool lastPage = endIndex == entries.size();
pageUpperBound = lastPage ? upperBound->withoutValue() : entries[endIndex].withoutValue();
if (!lastPage) {
PageToBuild& p = pagesToBuild[pageIndex];
PageToBuild& nextPage = pagesToBuild[pageIndex + 1];
if (height == 1) {
// If this is a leaf page, and not the last one to be written, shorten the upper boundary)
int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[endIndex - 1], prefixLen);
pageUpperBound.truncate(commonPrefix + 1);
}
if (useEncryptionDomain) {
ASSERT(p.domainId.present());
ASSERT(nextPage.domainId.present());
if (p.domainId.get() != nextPage.domainId.get() &&
nextPage.domainId.get() != self->m_keyProvider->getDefaultEncryptionDomainId()) {
pageUpperBound =
RedwoodRecordRef(entries[nextPage.startIndex].key.substr(0, nextPage.domainPrefixLength));
}
}
}
// For internal pages, skip first entry if child link is null. Such links only exist
// to maintain a borrow-able prefix for the previous subtree after a subtree deletion.
// If the null link falls on a new page post-split, then the pageLowerBound of the page
// being built now will serve as the previous subtree's upper boundary as it is the same
// key as entries[p.startIndex] and there is no need to actually store the null link in
// the new page.
if (height != 1 && !entries[pagesToBuild[pageIndex].startIndex].value.present()) {
auto& p = pagesToBuild[pageIndex];
p.kvBytes -= entries[p.startIndex].key.size();
++p.startIndex;
--p.count;
debug_printf("Skipping first null record, new count=%d\n", p.count);
// In case encryption or encryption domain is not enabled, if the page is now empty then it must be the
// last page in pagesToBuild, otherwise there would be more than 1 item since internal pages need to
// have multiple children. In case encryption and encryption domain is enabled, however, because of the
// page split by encryption domain, it may not be the last page.
//
// Either way, a record must be added to the output set because the upper boundary of the last
// page built does not match the upper boundary of the original page that this call to writePages() is
// replacing. Put another way, the upper boundary of the rightmost page of the page set that was just
// built does not match the upper boundary of the original page that the page set is replacing, so
// adding the extra null link fixes this.
if (p.count == 0) {
ASSERT(useEncryptionDomain || lastPage);
records.push_back_deep(records.arena(), pageLowerBound);
pageLowerBound = pageUpperBound;
continue;
}
}
// Create and init page here otherwise many variables must become state vars
state Reference<ArenaPage> page = self->m_pager->newPageBuffer(pagesToBuild[pageIndex].blockCount);
page->init(self->m_encodingType,
(pagesToBuild[pageIndex].blockCount == 1) ? PageType::BTreeNode : PageType::BTreeSuperNode,
height);
if (page->isEncrypted()) {
ArenaPage::EncryptionKey k =
wait(useEncryptionDomain
? self->m_keyProvider->getLatestEncryptionKey(pagesToBuild[pageIndex].domainId.get())
: self->m_keyProvider->getLatestDefaultEncryptionKey());
page->encryptionKey = k;
}
auto& p = pagesToBuild[pageIndex];
BTreePage* btPage = (BTreePage*)page->mutateData();
btPage->init(height, p.kvBytes);
g_redwoodMetrics.kvSizeWritten->sample(p.kvBytes);
debug_printf("Building tree for %s\nlower: %s\nupper: %s\n",
p.toString().c_str(),
pageLowerBound.toString(false).c_str(),
pageUpperBound.toString(false).c_str());
int deltaTreeSpace = page->dataSize() - sizeof(BTreePage);
debug_printf("Building tree at %p deltaTreeSpace %d p.usedBytes=%d\n",
btPage->tree(),
deltaTreeSpace,
p.usedBytes());
state int written = btPage->tree()->build(
deltaTreeSpace, &entries[p.startIndex], &entries[p.endIndex()], &pageLowerBound, &pageUpperBound);
if (written > deltaTreeSpace) {
debug_printf("ERROR: Wrote %d bytes to page %s deltaTreeSpace=%d\n",
written,
p.toString().c_str(),
deltaTreeSpace);
TraceEvent(SevError, "RedwoodDeltaTreeOverflow")
.detail("PageSize", p.pageSize)
.detail("BytesWritten", written);
ASSERT(false);
}
auto& metrics = g_redwoodMetrics.level(height);
metrics.metrics.pageBuild += 1;
metrics.metrics.pageBuildExt += p.blockCount - 1;
metrics.buildFillPctSketch->samplePercentage(p.usedFraction());
metrics.buildStoredPctSketch->samplePercentage(p.kvFraction());
metrics.buildItemCountSketch->sampleRecordCounter(p.count);
// Write this btree page, which is made of 1 or more pager pages.
state BTreeNodeLinkRef childPageID;
// If we are only writing 1 BTree node and its block count is 1 and the original node also had 1 block
// then try to update the page atomically so its logical page ID does not change
if (pagesToBuild.size() == 1 && p.blockCount == 1 && previousID.size() == 1) {
page->setLogicalPageInfo(previousID.front(), parentID);
LogicalPageID id = wait(
self->m_pager->atomicUpdatePage(PagerEventReasons::Commit, height, previousID.front(), page, v));
childPageID.push_back(records.arena(), id);
} else {
// Either the original node is being split, or only a single node was produced but
// its size is > 1 block.
//
// If the node was split, then the parent must be updated anyways to add
// any new children so there is no reason to do an atomic update on this node to
// preserve any page IDs.
//
// If the node was not split, then there is still good reason to update the parent
// node to point to new page IDs rather than use atomic update on this multi block
// node to preserve its page IDs. The parent node is almost certainly 1 block, so
// updating it will be 1 or 2 writes depending on whether or not its second write
// can be avoided. Updating this N>=2 block node atomically, however, will be N
// writes plus possibly another N writes if the second writes cannot be avoided.
//
// Therefore, assuming the parent is 1 block which is almost certain, the worst
// case number of writes for updating the parent is equal to the best case number
// of writes for doing an atomic update of this multi-block node.
//
// Additionally, if we did choose to atomically update this mult-block node here
// then the "multiple sibling updates -> single parent update" conversion optimization
// would likely convert the changes to a parent update anyways, skipping the second writes
// on this multi-block node. It is more efficient to just take this path directly.
// Free the old IDs, but only once (before the first output record is added).
if (records.empty()) {
self->freeBTreePage(height, previousID, v);
}
childPageID.resize(records.arena(), p.blockCount);
state int i = 0;
for (i = 0; i < childPageID.size(); ++i) {
LogicalPageID id = wait(self->m_pager->newPageID());
childPageID[i] = id;
}
debug_printf("writePages: newPages %s", toString(childPageID).c_str());
// Newly allocated page so logical id = physical id
page->setLogicalPageInfo(childPageID.front(), parentID);
self->m_pager->updatePage(PagerEventReasons::Commit, height, childPageID, page);
}
if (self->m_pBoundaryVerifier != nullptr) {
ASSERT(self->m_pBoundaryVerifier->update(
childPageID, v, pageLowerBound.key, pageUpperBound.key, height, pagesToBuild[pageIndex].domainId));
}
if (++sinceYield > 100) {
sinceYield = 0;
wait(yield());
}
if (REDWOOD_DEBUG) {
auto& p = pagesToBuild[pageIndex];
debug_printf("Wrote %s %s original=%s deltaTreeSize=%d for %s\nlower: %s\nupper: %s\n",
toString(v).c_str(),
toString(childPageID).c_str(),
toString(previousID).c_str(),
written,
p.toString().c_str(),
pageLowerBound.toString(false).c_str(),
pageUpperBound.toString(false).c_str());
for (int j = p.startIndex; j < p.endIndex(); ++j) {
debug_printf(" %3d: %s\n", j, entries[j].toString(height == 1).c_str());
}
ASSERT(pageLowerBound.key <= pageUpperBound.key);
}
// Push a new record onto the results set, without the child page, copying it into the records arena
records.push_back_deep(records.arena(), pageLowerBound.withoutValue());
// Set the child page value of the inserted record to childPageID, which has already been allocated in
// records.arena() above
records.back().setChildPage(childPageID);
pageLowerBound = pageUpperBound;
}
return records;
}
// Takes a list of records commitSubtree() on the root and builds new root pages
// until there is only 1 root records which is small enough to fit in the BTree commit header.
ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> buildNewRootsIfNeeded(
VersionedBTree* self,
Version version,
Standalone<VectorRef<RedwoodRecordRef>> records,
unsigned int height) {
debug_printf("buildNewRoot start version %" PRId64 ", %d records\n", version, records.size());
// While there are multiple child records or there is only one but it is too large to fit in the BTree
// commit record, build a new root page and update records to be a link to that new page.
// Root pointer size is limited because the pager commit header is limited to smallestPhysicalBlock in
// size.
//
// There's another case. When encryption domain is enabled, we want to make sure the root node is encrypted
// using the default encryption domain. An indication that's not true is when the first record is not using
// dbBegin as key.
while (records.size() > 1 ||
records.front().getChildPage().size() > (BUGGIFY ? 1 : BTreeCommitHeader::maxRootPointerSize) ||
records[0].key != dbBegin.key) {
CODE_PROBE(records.size() == 1, "Writing a new root because the current root pointer would be too large");
if (records[0].key != dbBegin.key) {
ASSERT(self->m_keyProvider.isValid() && self->m_keyProvider->enableEncryption() &&
self->m_keyProvider->enableEncryptionDomain());
int64_t domainId;
size_t prefixLength;
std::tie(domainId, prefixLength) = self->m_keyProvider->getEncryptionDomain(records[0].key);
ASSERT(domainId != self->m_keyProvider->getDefaultEncryptionDomainId());
ASSERT(records[0].key.size() == prefixLength);
CODE_PROBE(true,
"Writing a new root because the current root is encrypted with non-default encryption "
"domain cipher key");
}
self->m_header.height = ++height;
ASSERT(height < std::numeric_limits<int8_t>::max());
Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait(
writePages(self, &dbBegin, &dbEnd, records, height, version, BTreeNodeLinkRef(), invalidLogicalPageID));
debug_printf("Wrote a new root level at version %" PRId64 " height %d size %d pages\n",
version,
height,
newRecords.size());
records = newRecords;
}
return records;
}
ACTOR static Future<Reference<const ArenaPage>> readPage(VersionedBTree* self,
PagerEventReasons reason,
unsigned int level,
IPagerSnapshot* snapshot,
BTreeNodeLinkRef id,
int priority,
bool forLazyClear,
bool cacheable) {
debug_printf("readPage() op=read%s %s @%" PRId64 "\n",
forLazyClear ? "ForDeferredClear" : "",
toString(id).c_str(),
snapshot->getVersion());
state Reference<const ArenaPage> page;
if (id.size() == 1) {
Reference<const ArenaPage> p =
wait(snapshot->getPhysicalPage(reason, level, id.front(), priority, cacheable, false));
page = std::move(p);
} else {
ASSERT(!id.empty());
Reference<const ArenaPage> p =
wait(snapshot->getMultiPhysicalPage(reason, level, id, priority, cacheable, false));
page = std::move(p);
}
debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion());
const BTreePage* btPage = (const BTreePage*)page->data();
auto& metrics = g_redwoodMetrics.level(btPage->height).metrics;
metrics.pageRead += 1;
metrics.pageReadExt += (id.size() - 1);
// If BTree encryption is enabled, pages read must be encrypted using the desired encryption type
if (self->m_enforceEncodingType && (page->getEncodingType() != self->m_encodingType)) {
Error e = unexpected_encoding_type();
TraceEvent(SevError, "RedwoodBTreeUnexpectedNodeEncoding")
.error(e)
.detail("PhysicalPageID", page->getPhysicalPageID())
.detail("IsEncrypted", page->isEncrypted())
.detail("EncodingTypeFound", page->getEncodingType())
.detail("EncodingTypeExpected", self->m_encodingType);
throw e;
}
return std::move(page);
}
// Get cursor into a BTree node, creating decode cache from boundaries if needed
inline BTreePage::BinaryTree::Cursor getCursor(const ArenaPage* page,
const RedwoodRecordRef& lowerBound,
const RedwoodRecordRef& upperBound) {
Reference<BTreePage::BinaryTree::DecodeCache> cache;
if (page->extra.valid()) {
cache = page->extra.getReference<BTreePage::BinaryTree::DecodeCache>();
} else {
cache = makeReference<BTreePage::BinaryTree::DecodeCache>(lowerBound, upperBound, m_pDecodeCacheMemory);
debug_printf("Created DecodeCache for ptr=%p lower=%s upper=%s %s\n",
page->data(),
lowerBound.toString(false).c_str(),
upperBound.toString(false).c_str(),
((BTreePage*)page->data())
->toString("cursor",
lowerBound.value.present() ? lowerBound.getChildPage() : BTreeNodeLinkRef(),
-1,
lowerBound,
upperBound)
.c_str());
// Store decode cache into page based on height
if (((BTreePage*)page->data())->height >= SERVER_KNOBS->REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT) {
page->extra = cache;
}
}
return BTreePage::BinaryTree::Cursor(cache, ((BTreePage*)page->mutateData())->tree());
}
// Get cursor into a BTree node from a child link
inline BTreePage::BinaryTree::Cursor getCursor(const ArenaPage* page, const BTreePage::BinaryTree::Cursor& link) {
if (!page->extra.valid()) {
return getCursor(page, link.get(), link.next().getOrUpperBound());
}
return BTreePage::BinaryTree::Cursor(page->extra.getReference<BTreePage::BinaryTree::DecodeCache>(),
((BTreePage*)page->mutateData())->tree());
}
static void preLoadPage(IPagerSnapshot* snapshot, BTreeNodeLinkRef pageIDs, int priority) {
g_redwoodMetrics.metric.btreeLeafPreload += 1;
g_redwoodMetrics.metric.btreeLeafPreloadExt += (pageIDs.size() - 1);
if (pageIDs.size() == 1) {
snapshot->getPhysicalPage(
PagerEventReasons::RangePrefetch, nonBtreeLevel, pageIDs.front(), priority, true, true);
} else {
snapshot->getMultiPhysicalPage(
PagerEventReasons::RangePrefetch, nonBtreeLevel, pageIDs, priority, true, true);
}
}
void freeBTreePage(int height, BTreeNodeLinkRef btPageID, Version v) {
// Free individual pages at v
for (LogicalPageID id : btPageID) {
m_pager->freePage(id, v);
}
// Stop tracking child updates for deleted internal nodes
if (height > 1 && !btPageID.empty()) {
childUpdateTracker.erase(btPageID.front());
}
}
// Write new version of pageID at version v using page as its data.
// If oldID size is 1, attempts to keep logical page ID via an atomic page update.
// Returns resulting BTreePageID which might be the same as the input
// updateBTreePage is only called from commitSubTree function so write reason is always btree commit
ACTOR static Future<BTreeNodeLinkRef> updateBTreePage(VersionedBTree* self,
BTreeNodeLinkRef oldID,
LogicalPageID parentID,
Arena* arena,
Reference<ArenaPage> page,
Version writeVersion) {
state BTreeNodeLinkRef newID;
newID.resize(*arena, oldID.size());
if (REDWOOD_DEBUG) {
const BTreePage* btPage = (const BTreePage*)page->mutateData();
BTreePage::BinaryTree::DecodeCache* cache = page->extra.getPtr<BTreePage::BinaryTree::DecodeCache>();
debug_printf_always(
"updateBTreePage(%s, %s) start, page:\n%s\n",
::toString(oldID).c_str(),
::toString(writeVersion).c_str(),
cache == nullptr
? "<noDecodeCache>"
: btPage->toString("updateBTreePage", oldID, writeVersion, cache->lowerBound, cache->upperBound)
.c_str());
}
state unsigned int height = (unsigned int)((const BTreePage*)page->data())->height;
if (oldID.size() == 1) {
page->setLogicalPageInfo(oldID.front(), parentID);
LogicalPageID id = wait(
self->m_pager->atomicUpdatePage(PagerEventReasons::Commit, height, oldID.front(), page, writeVersion));
newID.front() = id;
return newID;
}
state int i = 0;
for (i = 0; i < oldID.size(); ++i) {
LogicalPageID id = wait(self->m_pager->newPageID());
newID[i] = id;
}
debug_printf("updateBTreePage(%s, %s): newPages %s",
::toString(oldID).c_str(),
::toString(writeVersion).c_str(),
toString(newID).c_str());
// Newly allocated page so logical id = physical id
page->setLogicalPageInfo(newID.front(), parentID);
self->m_pager->updatePage(PagerEventReasons::Commit, height, newID, page);
if (self->m_pBoundaryVerifier != nullptr) {
self->m_pBoundaryVerifier->updatePageId(writeVersion, oldID.front(), newID.front());
}
self->freeBTreePage(height, oldID, writeVersion);
return newID;
}
// Copy page to a new page which shares the same DecodeCache with the old page
static Reference<ArenaPage> clonePageForUpdate(Reference<const ArenaPage> page) {
Reference<ArenaPage> newPage = page->clone();
if (page->extra.valid()) {
newPage->extra = page->extra.getReference<BTreePage::BinaryTree::DecodeCache>();
}
debug_printf("cloneForUpdate(%p -> %p size=%d\n", page->data(), newPage->data(), page->dataSize());
return newPage;
}
// Each call to commitSubtree() will pass most of its arguments via a this structure because the caller
// will need access to these parameters after commitSubtree() is done.
struct InternalPageSliceUpdate : public FastAllocated<InternalPageSliceUpdate> {
// The logical range for the subtree's contents. Due to subtree clears, these boundaries may not match
// the lower/upper bounds needed to decode the page.
// Subtree clears can cause the boundaries for decoding the page to be more restrictive than the subtree's
// logical boundaries. When a subtree is fully cleared, the link to it is replaced with a null link, but
// the key boundary remains in tact to support decoding of the previous subtree.
RedwoodRecordRef subtreeLowerBound;
RedwoodRecordRef subtreeUpperBound;
// The lower/upper bound for decoding the root of the subtree
RedwoodRecordRef decodeLowerBound;
RedwoodRecordRef decodeUpperBound;
// Returns true of BTree logical boundaries and DeltaTree decoding boundaries are the same.
bool boundariesNormal() const {
// Often these strings will refer to the same memory so same() is used as a faster way of determining
// equality in thec common case, but if it does not match a string comparison is needed as they can
// still be the same. This can happen for the first remaining subtree of an internal page
// after all prior subtree(s) were cleared.
return (
(decodeUpperBound.key.same(subtreeUpperBound.key) || decodeUpperBound.key == subtreeUpperBound.key) &&
(decodeLowerBound.key.same(subtreeLowerBound.key) || decodeLowerBound.key == subtreeLowerBound.key));
}
// The record range of the subtree slice is cBegin to cEnd
// cBegin.get().getChildPage() is guaranteed to be valid
// cEnd can be
// - the next record which also has a child page
// - the next-next record which has a child page because the next record does not and
// only existed to provide the correct upper bound for decoding cBegin's child page
// - a later record with a valid child page, because this slice represents a range of
// multiple subtrees that are either all unchanged or all cleared.
// - invalid, because cBegin is the last child entry in the page or because the range
// being cleared or unchanged extends to the end of the page's entries
BTreePage::BinaryTree::Cursor cBegin;
BTreePage::BinaryTree::Cursor cEnd;
// The prefix length common to the entire logical subtree. Might be shorter than the length common to all
// actual items in the page.
int skipLen;
// Members below this point are "output" members, set by function calls from commitSubtree() once it decides
// what is happening with this slice of the tree.
// If true, present, the contents of newLinks should replace [cBegin, cEnd)
bool childrenChanged;
Standalone<VectorRef<RedwoodRecordRef>> newLinks;
// The upper boundary expected, if any, by the last child in either [cBegin, cEnd) or newLinks
// If the last record in the range has a null link then this will be null.
Optional<RedwoodRecordRef> expectedUpperBound;
bool inPlaceUpdate;
// CommitSubtree will call one of the following three functions based on its exit path
// Subtree was cleared.
void cleared() {
inPlaceUpdate = false;
childrenChanged = true;
expectedUpperBound.reset();
}
// Page was updated in-place through edits and written to maybeNewID
void updatedInPlace(BTreeNodeLinkRef maybeNewID, BTreePage* btPage, int capacity) {
inPlaceUpdate = true;
auto& metrics = g_redwoodMetrics.level(btPage->height);
metrics.metrics.pageModify += 1;
metrics.metrics.pageModifyExt += (maybeNewID.size() - 1);
metrics.modifyFillPctSketch->samplePercentage((double)btPage->size() / capacity);
metrics.modifyStoredPctSketch->samplePercentage((double)btPage->kvBytes / capacity);
metrics.modifyItemCountSketch->sampleRecordCounter(btPage->tree()->numItems);
g_redwoodMetrics.kvSizeWritten->sample(btPage->kvBytes);
// The boundaries can't have changed, but the child page link may have.
if (maybeNewID != decodeLowerBound.getChildPage()) {
// Add page's decode lower bound to newLinks set without its child page, intially
newLinks.push_back_deep(newLinks.arena(), decodeLowerBound.withoutValue());
// Set the child page ID, which has already been allocated in result.arena()
newLinks.back().setChildPage(maybeNewID);
childrenChanged = true;
expectedUpperBound = decodeUpperBound;
} else {
childrenChanged = false;
}
// Expected upper bound remains unchanged.
}
// writePages() was used to build 1 or more replacement pages.
void rebuilt(Standalone<VectorRef<RedwoodRecordRef>> newRecords) {
inPlaceUpdate = false;
newLinks = newRecords;
childrenChanged = true;
// If the replacement records ended on a non-null child page, then the expect upper bound is
// the subtree upper bound since that is what would have been used for the page(s) rebuild,
// otherwise it is null.
expectedUpperBound = newLinks.back().value.present() ? subtreeUpperBound : Optional<RedwoodRecordRef>();
}
// Get the first record for this range AFTER applying whatever changes were made
const RedwoodRecordRef* getFirstBoundary() const {
if (childrenChanged) {
if (newLinks.empty()) {
return nullptr;
}
return &newLinks.front();
}
return &decodeLowerBound;
}
std::string toString() const {
std::string s;
s += format("SubtreeSlice: addr=%p skipLen=%d subtreeCleared=%d childrenChanged=%d inPlaceUpdate=%d\n",
this,
skipLen,
childrenChanged && newLinks.empty(),
childrenChanged,
inPlaceUpdate);
s += format("SubtreeLower: %s\n", subtreeLowerBound.toString(false).c_str());
s += format(" DecodeLower: %s\n", decodeLowerBound.toString(false).c_str());
s += format(" DecodeUpper: %s\n", decodeUpperBound.toString(false).c_str());
s += format("SubtreeUpper: %s\n", subtreeUpperBound.toString(false).c_str());
s += format("expectedUpperBound: %s\n",
expectedUpperBound.present() ? expectedUpperBound.get().toString(false).c_str() : "(null)");
s += format("newLinks:\n");
for (int i = 0; i < newLinks.size(); ++i) {
s += format(" %i: %s\n", i, newLinks[i].toString(false).c_str());
}
s.resize(s.size() - 1);
return s;
}
};
struct InternalPageModifier {
InternalPageModifier() {}
InternalPageModifier(Reference<const ArenaPage> p,
bool alreadyCloned,
bool updating,
ParentInfo* parentInfo,
Reference<IPageEncryptionKeyProvider> keyProvider,
Optional<int64_t> pageDomainId)
: updating(updating), page(p), clonedPage(alreadyCloned), changesMade(false), parentInfo(parentInfo),
keyProvider(keyProvider), pageDomainId(pageDomainId) {}
// Whether updating the existing page is allowed
bool updating;
Reference<const ArenaPage> page;
// Whether or not page has been cloned for update
bool clonedPage;
Standalone<VectorRef<RedwoodRecordRef>> rebuild;
// Whether there are any changes to the page, either made in place or staged in rebuild
bool changesMade;
ParentInfo* parentInfo;
Reference<IPageEncryptionKeyProvider> keyProvider;
Optional<int64_t> pageDomainId;
BTreePage* btPage() const { return (BTreePage*)page->mutateData(); }
bool empty() const {
if (updating) {
return btPage()->tree()->numItems == 0;
} else {
return rebuild.empty();
}
}
void cloneForUpdate() {
if (!clonedPage) {
page = clonePageForUpdate(page);
clonedPage = true;
}
}
// end is the cursor position of the first record of the unvisited child link range, which
// is needed if the insert requires switching from update to rebuild mode.
void insert(BTreePage::BinaryTree::Cursor end, const VectorRef<RedwoodRecordRef>& recs) {
int i = 0;
if (updating) {
cloneForUpdate();
// Update must be done in the new tree, not the original tree where the end cursor will be from
end.switchTree(btPage()->tree());
// TODO: insert recs in a random order to avoid new subtree being entirely right child links
while (i != recs.size()) {
const RedwoodRecordRef& rec = recs[i];
debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str());
// Fail if the inserted record does not belong to the same encryption domain as the existing page
// data.
bool canInsert = true;
if (page->isEncrypted() && keyProvider->enableEncryptionDomain()) {
ASSERT(keyProvider && pageDomainId.present());
canInsert = keyProvider->keyFitsInDomain(pageDomainId.get(), rec.key, true);
}
if (canInsert) {
canInsert = end.insert(rec);
}
if (!canInsert) {
debug_printf("internal page: failed to insert %s, switching to rebuild\n",
rec.toString(false).c_str());
// Update failed, so populate rebuild vector with everything up to but not including end, which
// may include items from recs that were already added.
auto c = end;
if (c.moveFirst()) {
rebuild.reserve(rebuild.arena(), c.tree->numItems);
while (c != end) {
debug_printf(" internal page rebuild: add %s\n", c.get().toString(false).c_str());
rebuild.push_back(rebuild.arena(), c.get());
c.moveNext();
}
}
updating = false;
break;
}
btPage()->kvBytes += rec.kvBytes();
++i;
}
}
// Not updating existing page so just add recs to rebuild vector
if (!updating) {
rebuild.reserve(rebuild.arena(), rebuild.size() + recs.size());
while (i != recs.size()) {
const RedwoodRecordRef& rec = recs[i];
debug_printf("internal page (rebuilding) insert: %s\n", rec.toString(false).c_str());
rebuild.push_back(rebuild.arena(), rec);
++i;
}
}
}
void keep(BTreePage::BinaryTree::Cursor begin, BTreePage::BinaryTree::Cursor end) {
if (!updating) {
while (begin != end) {
debug_printf("internal page (rebuilding) keeping: %s\n", begin.get().toString(false).c_str());
rebuild.push_back(rebuild.arena(), begin.get());
begin.moveNext();
}
} else if (REDWOOD_DEBUG) {
while (begin != end) {
debug_printf("internal page (updating) keeping: %s\n", begin.get().toString(false).c_str());
begin.moveNext();
}
}
}
// This must be called for each of the InternalPageSliceUpdates in sorted order.
void applyUpdate(InternalPageSliceUpdate& u, const RedwoodRecordRef* nextBoundary) {
debug_printf("applyUpdate nextBoundary=(%p) %s\n",
nextBoundary,
(nextBoundary != nullptr) ? nextBoundary->toString(false).c_str() : "");
debug_print(addPrefix("applyUpdate", u.toString()));
// If the children changed, replace [cBegin, cEnd) with newLinks
if (u.childrenChanged) {
cloneForUpdate();
if (updating) {
auto c = u.cBegin;
// must point c to the tree to erase from
c.switchTree(btPage()->tree());
while (c != u.cEnd) {
debug_printf("applyUpdate (updating) erasing: %s\n", c.get().toString(false).c_str());
btPage()->kvBytes -= c.get().kvBytes();
c.erase();
}
// [cBegin, cEnd) is now erased, and cBegin is invalid, so cEnd represents the end
// of the range that comes before any part of newLinks that can't be added if there
// is not enough space.
insert(u.cEnd, u.newLinks);
} else {
// Already in rebuild mode so the cursor parameter is meaningless
insert({}, u.newLinks);
}
// cBegin has been erased so interating from the first entry forward will never see cBegin to use as an
// endpoint.
changesMade = true;
} else {
// If this was an in-place update, where the child page IDs do not change, notify the
// parentInfo that those pages have been updated so it can possibly eliminate their
// second writes later.
if (u.inPlaceUpdate) {
for (auto id : u.decodeLowerBound.getChildPage()) {
parentInfo->pageUpdated(id);
}
}
keep(u.cBegin, u.cEnd);
}
// If there is an expected upper boundary for the next range start after u
if (u.expectedUpperBound.present()) {
// Then if it does not match the next boundary then insert a dummy record
if (nextBoundary == nullptr || (nextBoundary != &u.expectedUpperBound.get() &&
!nextBoundary->sameExceptValue(u.expectedUpperBound.get()))) {
RedwoodRecordRef rec = u.expectedUpperBound.get().withoutValue();
debug_printf("applyUpdate adding dummy record %s\n", rec.toString(false).c_str());
cloneForUpdate();
insert(u.cEnd, { &rec, 1 });
changesMade = true;
}
}
}
};
ACTOR static Future<Void> buildNewSubtree(VersionedBTree* self,
Version version,
LogicalPageID parentID,
unsigned int height,
MutationBuffer::const_iterator mBegin,
MutationBuffer::const_iterator mEnd,
InternalPageSliceUpdate* update) {
ASSERT(height > 1);
debug_printf(
"buildNewSubtree start version %" PRId64 ", height %u, %s\n'", version, height, update->toString().c_str());
state Standalone<VectorRef<RedwoodRecordRef>> records;
while (mBegin != mEnd && mBegin.key() < update->subtreeLowerBound.key) {
++mBegin;
}
while (mBegin != mEnd) {
if (mBegin.mutation().boundarySet()) {
RedwoodRecordRef rec(mBegin.key(), mBegin.mutation().boundaryValue.get());
records.push_back_deep(records.arena(), rec);
if (REDWOOD_DEBUG) {
debug_printf(" Added %s", rec.toString().c_str());
}
}
++mBegin;
}
if (records.empty()) {
update->cleared();
} else {
state unsigned int h = 1;
debug_printf("buildNewSubtree at level %u\n", h);
while (h < height) {
// Only the parentID at the root is known as we are building the subtree bottom-up.
// We use the parentID for all levels, since the parentID is currently used for
// debug use only.
Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait(writePages(self,
&update->subtreeLowerBound,
&update->subtreeUpperBound,
records,
h,
version,
BTreeNodeLinkRef(),
parentID));
records = newRecords;
h++;
}
update->rebuilt(records);
}
return Void();
}
ACTOR static Future<Void> commitSubtree(
VersionedBTree* self,
CommitBatch* batch,
BTreeNodeLinkRef rootID,
LogicalPageID parentID,
unsigned int height,
MutationBuffer::const_iterator mBegin, // greatest mutation boundary <= subtreeLowerBound->key
MutationBuffer::const_iterator mEnd, // least boundary >= subtreeUpperBound->key
InternalPageSliceUpdate* update) {
state std::string context;
if (REDWOOD_DEBUG) {
context = format("CommitSubtree(root=%s+%d %s): ",
toString(rootID.front()).c_str(),
rootID.size() - 1,
::toString(batch->writeVersion).c_str());
}
debug_printf("%s rootID=%s\n", context.c_str(), toString(rootID).c_str());
debug_print(addPrefix(context, update->toString()));
if (REDWOOD_DEBUG) {
int c = 0;
auto i = mBegin;
while (1) {
debug_printf("%s Mutation %4d '%s': %s\n",
context.c_str(),
c,
printable(i.key()).c_str(),
i.mutation().toString().c_str());
if (i == mEnd) {
break;
}
++c;
++i;
}
}
state Reference<const ArenaPage> page = wait(
readPage(self, PagerEventReasons::Commit, height, batch->snapshot.getPtr(), rootID, height, false, true));
// If the page exists in the cache, it must be copied before modification.
// That copy will be referenced by pageCopy, as page must stay in scope in case anything references its
// memory and it gets evicted from the cache.
// If the page is not in the cache, then no copy is needed so we will initialize pageCopy to page
state Reference<const ArenaPage> pageCopy;
state BTreePage* btPage = (BTreePage*)page->mutateData();
ASSERT(height == btPage->height);
++g_redwoodMetrics.level(height).metrics.pageCommitStart;
// TODO: Decide if it is okay to update if the subtree boundaries are expanded. It can result in
// records in a DeltaTree being outside its decode boundary range, which isn't actually invalid
// though it is awkward to reason about.
// TryToUpdate indicates insert and erase operations should be tried on the existing page first
state bool tryToUpdate = btPage->tree()->numItems > 0 && update->boundariesNormal();
state bool useEncryptionDomain = page->isEncrypted() && self->m_keyProvider->enableEncryptionDomain();
state Optional<int64_t> pageDomainId;
if (useEncryptionDomain) {
pageDomainId = page->getEncryptionDomainId();
}
debug_printf("%s tryToUpdate=%d\n", context.c_str(), tryToUpdate);
debug_print(addPrefix(context,
btPage->toString("commitSubtreeStart",
rootID,
batch->snapshot->getVersion(),
update->decodeLowerBound,
update->decodeUpperBound)));
state BTreePage::BinaryTree::Cursor cursor = update->cBegin.valid()
? self->getCursor(page.getPtr(), update->cBegin)
: self->getCursor(page.getPtr(), dbBegin, dbEnd);
if (self->m_pBoundaryVerifier != nullptr) {
if (update->cBegin.valid()) {
ASSERT(self->m_pBoundaryVerifier->verify(rootID.front(),
batch->snapshot->getVersion(),
update->cBegin.get().key,
update->cBegin.next().getOrUpperBound().key,
pageDomainId,
cursor));
}
}
// Leaf Page
if (btPage->isLeaf()) {
// When true, we are modifying the existing DeltaTree
// When false, we are accumulating retained and added records in merged vector to build pages from them.
bool updatingDeltaTree = tryToUpdate;
bool changesMade = false;
// Copy page for modification if not already copied
auto copyForUpdate = [&]() {
if (!pageCopy.isValid()) {
pageCopy = clonePageForUpdate(page);
btPage = (BTreePage*)pageCopy->mutateData();
cursor.switchTree(btPage->tree());
}
};
state Standalone<VectorRef<RedwoodRecordRef>> merged;
// The first mutation buffer boundary has a key <= the first key in the page.
cursor.moveFirst();
debug_printf("%s Leaf page, applying changes.\n", context.c_str());
// Now, process each mutation range and merge changes with existing data.
bool firstMutationBoundary = true;
constexpr int maxHeightAllowed = 8;
while (mBegin != mEnd) {
// Apply the change to the mutation buffer start boundary key only if
// - there actually is a change (clear or set to new value)
// - either this is not the first boundary or it is but its key matches our lower bound key
bool applyBoundaryChange = mBegin.mutation().boundaryChanged &&
(!firstMutationBoundary || mBegin.key() == update->subtreeLowerBound.key);
bool boundaryExists = cursor.valid() && cursor.get().key == mBegin.key();
debug_printf("%s New mutation boundary: '%s': %s applyBoundaryChange=%d boundaryExists=%d "
"updatingDeltaTree=%d\n",
context.c_str(),
printable(mBegin.key()).c_str(),
mBegin.mutation().toString().c_str(),
applyBoundaryChange,
boundaryExists,
updatingDeltaTree);
firstMutationBoundary = false;
if (applyBoundaryChange) {
// If the boundary is being set to a value, the new KV record will be inserted
bool shouldInsertBoundary = mBegin.mutation().boundarySet();
// Optimization: In-place value update of new same-sized value
// If the boundary exists in the page and we're in update mode and the boundary is being set to a
// new value of the same length as the old value then just update the value bytes.
if (boundaryExists && updatingDeltaTree && shouldInsertBoundary &&
mBegin.mutation().boundaryValue.get().size() == cursor.get().value.get().size()) {
changesMade = true;
shouldInsertBoundary = false;
debug_printf("%s In-place value update for %s [existing, boundary start]\n",
context.c_str(),
cursor.get().toString().c_str());
copyForUpdate();
memcpy((uint8_t*)cursor.get().value.get().begin(),
mBegin.mutation().boundaryValue.get().begin(),
cursor.get().value.get().size());
cursor.moveNext();
} else if (boundaryExists) {
// An in place update can't be done, so if the boundary exists then erase or skip the record
changesMade = true;
// If updating, erase from the page, otherwise do not add to the output set
if (updatingDeltaTree) {
debug_printf("%s Erasing %s [existing, boundary start]\n",
context.c_str(),
cursor.get().toString().c_str());
copyForUpdate();
btPage->kvBytes -= cursor.get().kvBytes();
cursor.erase();
} else {
debug_printf("%s Skipped %s [existing, boundary start]\n",
context.c_str(),
cursor.get().toString().c_str());
cursor.moveNext();
}
}
// If the boundary value is being set and we must insert it, add it to the page or the output set
if (shouldInsertBoundary) {
RedwoodRecordRef rec(mBegin.key(), mBegin.mutation().boundaryValue.get());
changesMade = true;
// If updating, first try to add the record to the page
if (updatingDeltaTree) {
bool canInsert = true;
if (useEncryptionDomain) {
ASSERT(pageDomainId.present());
canInsert = self->m_keyProvider->keyFitsInDomain(pageDomainId.get(), rec.key, false);
}
if (canInsert) {
copyForUpdate();
canInsert = cursor.insert(rec, update->skipLen, maxHeightAllowed);
}
if (canInsert) {
btPage->kvBytes += rec.kvBytes();
debug_printf("%s Inserted %s [mutation, boundary start]\n",
context.c_str(),
rec.toString().c_str());
} else {
debug_printf("%s Insert failed for %s [mutation, boundary start]\n",
context.c_str(),
rec.toString().c_str());
// Since the insert failed we must switch to a linear merge of existing data and
// mutations, accumulating the new record set in the merge vector and build new pages
// from it. First, we must populate the merged vector with all the records up to but not
// including the current mutation boundary key.
auto c = cursor;
c.moveFirst();
while (c != cursor) {
debug_printf(
"%s catch-up adding %s\n", context.c_str(), c.get().toString().c_str());
merged.push_back(merged.arena(), c.get());
c.moveNext();
}
updatingDeltaTree = false;
}
}
// If not updating, possibly due to insert failure above, then add record to the output set
if (!updatingDeltaTree) {
merged.push_back(merged.arena(), rec);
debug_printf(
"%s Added %s [mutation, boundary start]\n", context.c_str(), rec.toString().c_str());
}
}
} else if (boundaryExists) {
// If the boundary exists in the page but there is no pending change,
// then if updating move past it, otherwise add it to the output set.
if (updatingDeltaTree) {
cursor.moveNext();
} else {
merged.push_back(merged.arena(), cursor.get());
debug_printf("%s Added %s [existing, boundary start]\n",
context.c_str(),
cursor.get().toString().c_str());
cursor.moveNext();
}
}
// Before advancing the iterator, get whether or not the records in the following range must be removed
bool remove = mBegin.mutation().clearAfterBoundary;
// Advance to the next boundary because we need to know the end key for the current range.
++mBegin;
if (mBegin == mEnd) {
update->skipLen = 0;
}
debug_printf("%s Mutation range end: '%s'\n", context.c_str(), printable(mBegin.key()).c_str());
// Now handle the records up through but not including the next mutation boundary key
RedwoodRecordRef end(mBegin.key());
// If the records are being removed and we're not doing an in-place update
// OR if we ARE doing an update but the records are NOT being removed, then just skip them.
if (remove != updatingDeltaTree) {
// If not updating, then the records, if any exist, are being removed. We don't know if there
// actually are any but we must assume there are.
if (!updatingDeltaTree) {
changesMade = true;
}
debug_printf("%s Seeking forward to next boundary (remove=%d updating=%d) %s\n",
context.c_str(),
remove,
updatingDeltaTree,
mBegin.key().toString().c_str());
cursor.seekGreaterThanOrEqual(end, update->skipLen);
} else {
// Otherwise we must visit the records. If updating, the visit is to erase them, and if doing a
// linear merge than the visit is to add them to the output set.
while (cursor.valid() && cursor.get().compare(end, update->skipLen) < 0) {
if (updatingDeltaTree) {
debug_printf("%s Erasing %s [existing, boundary start]\n",
context.c_str(),
cursor.get().toString().c_str());
copyForUpdate();
btPage->kvBytes -= cursor.get().kvBytes();
cursor.erase();
changesMade = true;
} else {
merged.push_back(merged.arena(), cursor.get());
debug_printf(
"%s Added %s [existing, middle]\n", context.c_str(), merged.back().toString().c_str());
cursor.moveNext();
}
}
}
}
// If there are still more records, they have the same key as the end boundary
if (cursor.valid()) {
// If the end boundary is changing, we must remove the remaining records in this page
bool remove = mEnd.mutation().boundaryChanged;
if (remove) {
changesMade = true;
}
// If we don't have to remove the records and we are updating, do nothing.
// If we do have to remove the records and we are not updating, do nothing.
if (remove != updatingDeltaTree) {
debug_printf("%s Ignoring remaining records, remove=%d updatingDeltaTree=%d\n",
context.c_str(),
remove,
updatingDeltaTree);
} else {
// If updating and the key is changing, we must visit the records to erase them.
// If not updating and the key is not changing, we must visit the records to add them to the output
// set.
while (cursor.valid()) {
if (updatingDeltaTree) {
debug_printf(
"%s Erasing %s and beyond [existing, matches changed upper mutation boundary]\n",
context.c_str(),
cursor.get().toString().c_str());
copyForUpdate();
btPage->kvBytes -= cursor.get().kvBytes();
cursor.erase();
} else {
merged.push_back(merged.arena(), cursor.get());
debug_printf(
"%s Added %s [existing, tail]\n", context.c_str(), merged.back().toString().c_str());
cursor.moveNext();
}
}
}
} else {
debug_printf("%s No records matching mutation buffer end boundary key\n", context.c_str());
}
// No changes were actually made. This could happen if the only mutations are clear ranges which do not
// match any records.
if (!changesMade) {
debug_printf("%s No changes were made during mutation merge, returning slice:\n", context.c_str());
debug_print(addPrefix(context, update->toString()));
return Void();
} else {
debug_printf(
"%s Changes were made, writing, but subtree may still be unchanged from parent's perspective.\n",
context.c_str());
}
if (updatingDeltaTree) {
// If the tree is now empty, delete the page
if (cursor.tree->numItems == 0) {
update->cleared();
self->freeBTreePage(height, rootID, batch->writeVersion);
debug_printf("%s Page updates cleared all entries, returning slice:\n", context.c_str());
debug_print(addPrefix(context, update->toString()));
} else {
// Otherwise update it.
BTreeNodeLinkRef newID = wait(self->updateBTreePage(self,
rootID,
parentID,
&update->newLinks.arena(),
pageCopy.castTo<ArenaPage>(),
batch->writeVersion));
debug_printf("%s Leaf node updated in-place at version %s, new contents:\n",
context.c_str(),
toString(batch->writeVersion).c_str());
debug_print(addPrefix(context,
btPage->toString("updateLeafNode",
newID,
batch->snapshot->getVersion(),
update->decodeLowerBound,
update->decodeUpperBound)));
update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize);
debug_printf("%s Leaf node updated in-place, returning slice:\n", context.c_str());
debug_print(addPrefix(context, update->toString()));
}
return Void();
}
// If everything in the page was deleted then this page should be deleted as of the new version
if (merged.empty()) {
update->cleared();
self->freeBTreePage(height, rootID, batch->writeVersion);
debug_printf("%s All leaf page contents were cleared, returning slice:\n", context.c_str());
debug_print(addPrefix(context, update->toString()));
return Void();
}
// Rebuild new page(s).
state Standalone<VectorRef<RedwoodRecordRef>> entries = wait(writePages(self,
&update->subtreeLowerBound,
&update->subtreeUpperBound,
merged,
height,
batch->writeVersion,
rootID,
parentID));
// Put new links into update and tell update that pages were rebuilt
update->rebuilt(entries);
debug_printf("%s Merge complete, returning slice:\n", context.c_str());
debug_print(addPrefix(context, update->toString()));
return Void();
} else {
// Internal Page
std::vector<Future<Void>> recursions;
state std::vector<std::unique_ptr<InternalPageSliceUpdate>> slices;
cursor.moveFirst();
bool first = true;
if (useEncryptionDomain && cursor.valid() && update->subtreeLowerBound.key < cursor.get().key) {
mEnd = batch->mutations->lower_bound(cursor.get().key);
first = false;
if (mBegin != mEnd) {
slices.emplace_back(new InternalPageSliceUpdate());
InternalPageSliceUpdate& u = *slices.back();
u.cBegin = cursor;
u.cEnd = cursor;
u.subtreeLowerBound = update->subtreeLowerBound;
u.decodeLowerBound = u.subtreeLowerBound;
u.subtreeUpperBound = cursor.get();
u.decodeUpperBound = u.subtreeUpperBound;
u.expectedUpperBound = u.subtreeUpperBound;
u.skipLen = 0;
recursions.push_back(
self->buildNewSubtree(self, batch->writeVersion, parentID, height, mBegin, mEnd, &u));
}
}
while (cursor.valid()) {
slices.emplace_back(new InternalPageSliceUpdate());
InternalPageSliceUpdate& u = *slices.back();
// At this point we should never be at a null child page entry because the first entry of a page
// can't be null and this loop will skip over null entries that come after non-null entries.
ASSERT(cursor.get().value.present());
// Subtree lower boundary is this page's subtree lower bound or cursor
u.cBegin = cursor;
u.decodeLowerBound = cursor.get();
if (first) {
u.subtreeLowerBound = update->subtreeLowerBound;
first = false;
// mbegin is already the first mutation that could affect this subtree described by update
} else {
u.subtreeLowerBound = u.decodeLowerBound;
mBegin = mEnd;
// mBegin is either at or greater than subtreeLowerBound->key, which was the subtreeUpperBound->key
// for the previous subtree slice. But we need it to be at or *before* subtreeLowerBound->key
// so if mBegin.key() is not exactly the subtree lower bound key then decrement it.
if (mBegin.key() != u.subtreeLowerBound.key) {
--mBegin;
}
}
BTreeNodeLinkRef pageID = cursor.get().getChildPage();
ASSERT(!pageID.empty());
// The decode upper bound is always the next key after the child link, or the decode upper bound for
// this page
if (cursor.moveNext()) {
u.decodeUpperBound = cursor.get();
// If cursor record has a null child page then it exists only to preserve a previous
// subtree boundary that is now needed for reading the subtree at cBegin.
if (!cursor.get().value.present()) {
// If the upper bound is provided by a dummy record in [cBegin, cEnd) then there is no
// requirement on the next subtree range or the parent page to have a specific upper boundary
// for decoding the subtree. The expected upper bound has not yet been set so it can remain
// empty.
cursor.moveNext();
// If there is another record after the null child record, it must have a child page value
ASSERT(!cursor.valid() || cursor.get().value.present());
} else {
u.expectedUpperBound = u.decodeUpperBound;
}
} else {
u.decodeUpperBound = update->decodeUpperBound;
u.expectedUpperBound = update->decodeUpperBound;
}
u.subtreeUpperBound = cursor.valid() ? cursor.get() : update->subtreeUpperBound;
u.cEnd = cursor;
u.skipLen = 0; // TODO: set this
// Find the mutation buffer range that includes all changes to the range described by u
mEnd = batch->mutations->lower_bound(u.subtreeUpperBound.key);
// If the mutation range described by mBegin extends to mEnd, then see if the part of that range
// that overlaps with u's subtree range is being fully cleared or fully unchanged.
auto next = mBegin;
++next;
if (next == mEnd) {
// Check for uniform clearedness or unchangedness for the range mutation where it overlaps u's
// subtree
const KeyRef& mutationBoundaryKey = mBegin.key();
const RangeMutation& range = mBegin.mutation();
bool uniform;
if (range.clearAfterBoundary) {
// If the mutation range after the boundary key is cleared, then the mutation boundary key must
// be cleared or must be different than the subtree lower bound key so that it doesn't matter
uniform = range.boundaryCleared() || mutationBoundaryKey != u.subtreeLowerBound.key;
} else {
// If the mutation range after the boundary key is unchanged, then the mutation boundary key
// must be also unchanged or must be different than the subtree lower bound key so that it
// doesn't matter
uniform = !range.boundaryChanged || mutationBoundaryKey != u.subtreeLowerBound.key;
}
// If u's subtree is either all cleared or all unchanged
if (uniform) {
// We do not need to recurse to this subtree. Next, let's see if we can embiggen u's range to
// include sibling subtrees also covered by (mBegin, mEnd) so we can not recurse to those, too.
// If the cursor is valid, u.subtreeUpperBound is the cursor's position, which is >= mEnd.key().
// If equal, no range expansion is possible.
if (cursor.valid() && mEnd.key() != u.subtreeUpperBound.key) {
// TODO: If cursor hints are available, use (cursor, 1)
cursor.seekLessThanOrEqual(mEnd.key(), update->skipLen);
// If this seek moved us ahead, to something other than cEnd, then update subtree range
// boundaries
if (cursor != u.cEnd) {
// If the cursor is at a record with a null child, back up one step because it is in the
// middle of the next logical subtree, as null child records are not subtree boundaries.
ASSERT(cursor.valid());
if (!cursor.get().value.present()) {
cursor.movePrev();
}
u.cEnd = cursor;
u.subtreeUpperBound = cursor.get();
u.skipLen = 0; // TODO: set this
// The new decode upper bound is either cEnd or the record before it if it has no child
// link
auto c = u.cEnd;
c.movePrev();
ASSERT(c.valid());
if (!c.get().value.present()) {
u.decodeUpperBound = c.get();
u.expectedUpperBound.reset();
} else {
u.decodeUpperBound = u.subtreeUpperBound;
u.expectedUpperBound = u.subtreeUpperBound;
}
}
}
// The subtree range is either fully cleared or unchanged.
if (range.clearAfterBoundary) {
// Cleared
u.cleared();
auto c = u.cBegin;
while (c != u.cEnd) {
RedwoodRecordRef rec = c.get();
if (rec.value.present()) {
if (height == 2) {
debug_printf("%s freeing child page in cleared subtree range: %s\n",
context.c_str(),
::toString(rec.getChildPage()).c_str());
self->freeBTreePage(height, rec.getChildPage(), batch->writeVersion);
} else {
debug_printf("%s queuing subtree deletion cleared subtree range: %s\n",
context.c_str(),
::toString(rec.getChildPage()).c_str());
self->m_lazyClearQueue.pushBack(LazyClearQueueEntry{
(uint8_t)(height - 1), batch->writeVersion, rec.getChildPage() });
}
}
c.moveNext();
}
} else {
// Subtree range unchanged
}
debug_printf("%s Not recursing, one mutation range covers this slice:\n", context.c_str());
debug_print(addPrefix(context, u.toString()));
// u has already been initialized with the correct result, no recursion needed, so restart the
// loop.
continue;
}
}
// If this page has height of 2 then its children are leaf nodes
debug_printf("%s Recursing for %s\n", context.c_str(), toString(pageID).c_str());
debug_print(addPrefix(context, u.toString()));
recursions.push_back(
self->commitSubtree(self, batch, pageID, rootID.front(), height - 1, mBegin, mEnd, &u));
}
debug_printf("%s Recursions from internal page started. pageSize=%d level=%d children=%d slices=%zu "
"recursions=%zu\n",
context.c_str(),
btPage->size(),
btPage->height,
btPage->tree()->numItems,
slices.size(),
recursions.size());
wait(waitForAll(recursions));
debug_printf("%s Recursions done, processing slice updates.\n", context.c_str());
// ParentInfo could be invalid after a wait and must be re-initialized.
// All uses below occur before waits so no reinitialization is done.
state ParentInfo* parentInfo = &self->childUpdateTracker[rootID.front()];
// InternalPageModifier takes the results of the recursive commitSubtree() calls in order
// and makes changes to page as needed, copying as needed, and generating an array from
// which to build new page(s) if modification is not possible or not allowed.
// If pageCopy is already set it was initialized to page above so the modifier doesn't need
// to copy it
state InternalPageModifier modifier(
page, pageCopy.isValid(), tryToUpdate, parentInfo, self->m_keyProvider, pageDomainId);
// Apply the possible changes for each subtree range recursed to, except the last one.
// For each range, the expected next record, if any, is checked against the first boundary
// of the next range, if any.
for (int i = 0, iEnd = slices.size() - 1; i < iEnd; ++i) {
modifier.applyUpdate(*slices[i], slices[i + 1]->getFirstBoundary());
}
// The expected next record for the final range is checked against one of the upper boundaries passed to
// this commitSubtree() instance. If changes have already been made, then the subtree upper boundary is
// passed, so in the event a different upper boundary is needed it will be added to the already-modified
// page. Otherwise, the decode boundary is used which will prevent this page from being modified for the
// sole purpose of adding a dummy upper bound record.
debug_printf("%s Applying final child range update. changesMade=%d\nSubtree Root Update:\n",
context.c_str(),
modifier.changesMade);
debug_print(addPrefix(context, update->toString()));
// TODO(yiwu): check whether we can pass decodeUpperBound as nextBoundary when the last slice
// have childenChanged=true.
modifier.applyUpdate(*slices.back(),
modifier.changesMade || slices.back()->childrenChanged ? &update->subtreeUpperBound
: &update->decodeUpperBound);
state bool detachChildren = (parentInfo->count > 2);
state bool forceUpdate = false;
// If no changes were made, but we should rewrite it to point directly to remapped child pages
if (!modifier.changesMade && detachChildren) {
debug_printf(
"%s Internal page forced rewrite because at least %d children have been updated in-place.\n",
context.c_str(),
parentInfo->count);
forceUpdate = true;
modifier.updating = true;
// Make sure the modifier cloned the page so we can update the child links in-place below.
modifier.cloneForUpdate();
++g_redwoodMetrics.level(height).metrics.forceUpdate;
}
// If the modifier cloned the page for updating, then update our local pageCopy, btPage, and cursor
if (modifier.clonedPage) {
pageCopy = modifier.page;
btPage = modifier.btPage();
cursor.switchTree(btPage->tree());
}
// If page contents have changed
if (modifier.changesMade || forceUpdate) {
if (modifier.empty()) {
update->cleared();
debug_printf(
"%s All internal page children were deleted so deleting this page too. Returning slice:\n",
context.c_str());
debug_print(addPrefix(context, update->toString()));
self->freeBTreePage(height, rootID, batch->writeVersion);
} else {
if (modifier.updating) {
// Page was updated in place (or being forced to be updated in place to update child page ids)
debug_printf(
"%s Internal page modified in-place tryToUpdate=%d forceUpdate=%d detachChildren=%d\n",
context.c_str(),
tryToUpdate,
forceUpdate,
detachChildren);
if (detachChildren) {
int detached = 0;
cursor.moveFirst();
auto& stats = g_redwoodMetrics.level(height);
while (cursor.valid()) {
if (cursor.get().value.present()) {
BTreeNodeLinkRef child = cursor.get().getChildPage();
// Nodes larger than 1 page are never remapped.
if (child.size() == 1) {
LogicalPageID& p = child.front();
if (parentInfo->maybeUpdated(p)) {
PhysicalPageID newID =
self->m_pager->detachRemappedPage(p, batch->writeVersion);
if (newID != invalidPhysicalPageID) {
debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
if (self->m_pBoundaryVerifier != nullptr) {
self->m_pBoundaryVerifier->updatePageId(
batch->writeVersion, p, newID);
}
p = newID;
++stats.metrics.detachChild;
++detached;
}
}
}
}
cursor.moveNext();
}
parentInfo->clear();
if (forceUpdate && detached == 0) {
debug_printf("%s No children detached during forced update, returning slice:\n",
context.c_str());
debug_print(addPrefix(context, update->toString()));
return Void();
}
}
BTreeNodeLinkRef newID = wait(self->updateBTreePage(self,
rootID,
parentID,
&update->newLinks.arena(),
pageCopy.castTo<ArenaPage>(),
batch->writeVersion));
debug_printf(
"%s commitSubtree(): Internal node updated in-place at version %s, new contents:\n",
context.c_str(),
toString(batch->writeVersion).c_str());
debug_print(addPrefix(context,
btPage->toString("updateInternalNode",
newID,
batch->snapshot->getVersion(),
update->decodeLowerBound,
update->decodeUpperBound)));
update->updatedInPlace(newID, btPage, newID.size() * self->m_blockSize);
debug_printf("%s Internal node updated in-place, returning slice:\n", context.c_str());
debug_print(addPrefix(context, update->toString()));
} else {
// Page was rebuilt, possibly split.
debug_printf("%s Internal page could not be modified, rebuilding replacement(s).\n",
context.c_str());
if (detachChildren) {
auto& stats = g_redwoodMetrics.level(height);
for (auto& rec : modifier.rebuild) {
if (rec.value.present()) {
BTreeNodeLinkRef oldPages = rec.getChildPage();
// Nodes larger than 1 page are never remapped.
if (oldPages.size() == 1) {
LogicalPageID p = oldPages.front();
if (parentInfo->maybeUpdated(p)) {
PhysicalPageID newID =
self->m_pager->detachRemappedPage(p, batch->writeVersion);
if (newID != invalidPhysicalPageID) {
// Records in the rebuild vector reference original page memory so make
// a new child link in the rebuild arena
BTreeNodeLinkRef newPages = BTreeNodeLinkRef(
modifier.rebuild.arena(), BTreeNodeLinkRef(&newID, 1));
rec.setChildPage(newPages);
debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
if (self->m_pBoundaryVerifier != nullptr) {
self->m_pBoundaryVerifier->updatePageId(
batch->writeVersion, p, newID);
}
++stats.metrics.detachChild;
}
}
}
}
}
parentInfo->clear();
}
Standalone<VectorRef<RedwoodRecordRef>> newChildEntries =
wait(writePages(self,
&update->subtreeLowerBound,
&update->subtreeUpperBound,
modifier.rebuild,
height,
batch->writeVersion,
rootID,
parentID));
update->rebuilt(newChildEntries);
debug_printf("%s Internal page rebuilt, returning slice:\n", context.c_str());
debug_print(addPrefix(context, update->toString()));
}
}
} else {
debug_printf("%s Page has no changes, returning slice:\n", context.c_str());
debug_print(addPrefix(context, update->toString()));
}
return Void();
}
}
ACTOR static Future<Void> commit_impl(VersionedBTree* self, Version writeVersion, Future<Void> previousCommit) {
// Take ownership of the current mutation buffer and make a new one
state CommitBatch batch;
batch.mutations = std::move(self->m_pBuffer);
self->m_pBuffer.reset(new MutationBuffer());
batch.mutationCount = self->m_mutationCount;
self->m_mutationCount = 0;
batch.writeVersion = writeVersion;
batch.newOldestVersion = self->m_newOldestVersion;
// Wait for the latest commit to be finished.
wait(previousCommit);
// If the write version has not advanced then there can be no changes pending.
// If there are no changes, then the commit is a no-op.
if (writeVersion == self->m_pager->getLastCommittedVersion()) {
ASSERT(batch.mutationCount == 0);
debug_printf("%s: Empty commit at repeat version %" PRId64 "\n", self->m_name.c_str(), batch.writeVersion);
return Void();
}
// For this commit, use the latest snapshot that was just committed.
batch.readVersion = self->m_pager->getLastCommittedVersion();
self->m_pager->setOldestReadableVersion(batch.newOldestVersion);
debug_printf("%s: Beginning commit of version %" PRId64 ", read version %" PRId64
", new oldest version set to %" PRId64 "\n",
self->m_name.c_str(),
batch.writeVersion,
batch.readVersion,
batch.newOldestVersion);
batch.snapshot = self->m_pager->getReadSnapshot(batch.readVersion);
state BTreeNodeLink rootNodeLink = self->m_header.root;
state InternalPageSliceUpdate all;
state RedwoodRecordRef rootLink = dbBegin.withPageID(rootNodeLink);
all.subtreeLowerBound = rootLink;
all.decodeLowerBound = rootLink;
all.subtreeUpperBound = dbEnd;
all.decodeUpperBound = dbEnd;
all.skipLen = 0;
MutationBuffer::const_iterator mBegin = batch.mutations->upper_bound(all.subtreeLowerBound.key);
--mBegin;
MutationBuffer::const_iterator mEnd = batch.mutations->lower_bound(all.subtreeUpperBound.key);
wait(
commitSubtree(self, &batch, rootNodeLink, invalidLogicalPageID, self->m_header.height, mBegin, mEnd, &all));
// If the old root was deleted, write a new empty tree root node and free the old roots
if (all.childrenChanged) {
if (all.newLinks.empty()) {
debug_printf("Writing new empty root.\n");
LogicalPageID newRootID = wait(self->m_pager->newPageID());
rootNodeLink = BTreeNodeLinkRef((LogicalPageID*)&newRootID, 1);
self->m_header.height = 1;
Reference<ArenaPage> page = wait(makeEmptyRoot(self));
// Newly allocated page so logical id = physical id and there is no parent as this is a new root
page->setLogicalPageInfo(rootNodeLink.front(), invalidLogicalPageID);
self->m_pager->updatePage(PagerEventReasons::Commit, self->m_header.height, rootNodeLink, page);
} else {
Standalone<VectorRef<RedwoodRecordRef>> newRootRecords(all.newLinks, all.newLinks.arena());
// Build new root levels if there are multiple new root records or if the root pointer is too large
Standalone<VectorRef<RedwoodRecordRef>> newRootPage =
wait(buildNewRootsIfNeeded(self, batch.writeVersion, newRootRecords, self->m_header.height));
rootNodeLink = newRootPage.front().getChildPage();
}
}
debug_printf("new root %s\n", toString(rootNodeLink).c_str());
self->m_header.root = rootNodeLink;
self->m_lazyClearStop = true;
wait(success(self->m_lazyClearActor));
debug_printf("Lazy delete freed %u pages\n", self->m_lazyClearActor.get());
wait(self->m_lazyClearQueue.flush());
self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState();
debug_printf("%s: Committing pager %" PRId64 "\n", self->m_name.c_str(), writeVersion);
wait(self->m_pager->commit(writeVersion, ObjectWriter::toValue(self->m_header, Unversioned())));
debug_printf("%s: Committed version %" PRId64 "\n", self->m_name.c_str(), writeVersion);
++g_redwoodMetrics.metric.opCommit;
self->m_lazyClearActor = incrementalLazyClear(self);
return Void();
}
public:
// Cursor into BTree which enables seeking and iteration in the BTree as a whole, or
// iteration within a specific page and movement across levels for more efficient access.
// Cursor record's memory is only guaranteed to be valid until cursor moves to a different page.
class BTreeCursor {
public:
struct PathEntry {
Reference<const ArenaPage> page;
BTreePage::BinaryTree::Cursor cursor;
#if REDWOOD_DEBUG
BTreeNodeLink id;
#endif
const BTreePage* btPage() const { return (const BTreePage*)page->data(); };
};
private:
PagerEventReasons reason;
Optional<ReadOptions> options;
VersionedBTree* btree;
Reference<IPagerSnapshot> pager;
bool valid;
std::vector<PathEntry> path;
public:
BTreeCursor() : reason(PagerEventReasons::MAXEVENTREASONS) {}
bool intialized() const { return pager.isValid(); }
bool isValid() const { return valid; }
// path entries at dumpHeight or below will have their entire pages printed
std::string toString(int dumpHeight = 0) const {
std::string r = format("{ptr=%p reason=%s %s ",
this,
PagerEventReasonsStrings[(int)reason],
::toString(pager->getVersion()).c_str());
for (int i = 0; i < path.size(); ++i) {
std::string id = "<debugOnly>";
#if REDWOOD_DEBUG
id = ::toString(path[i].id);
#endif
int height = path[i].btPage()->height;
r += format("\n\t[Level=%d ID=%s ptr=%p Cursor=%s] ",
height,
id.c_str(),
path[i].page->data(),
path[i].cursor.valid() ? path[i].cursor.get().toString(path[i].btPage()->isLeaf()).c_str()
: "<invalid>");
if (height <= dumpHeight) {
BTreePage::BinaryTree::Cursor c = path[i].cursor;
c.moveFirst();
int i = 0;
while (c.valid()) {
r += format("\n\t\%7d: %s", ++i, c.get().toString(height == 1).c_str());
c.moveNext();
}
}
}
r += "\n";
if (!valid) {
r += " (invalid) ";
}
r += "}";
return r;
}
const RedwoodRecordRef get() { return path.back().cursor.get(); }
bool inRoot() const { return path.size() == 1; }
// To enable more efficient range scans, caller can read the lowest page
// of the cursor and pop it.
PathEntry& back() { return path.back(); }
void popPath() { path.pop_back(); }
Future<Void> pushPage(const BTreePage::BinaryTree::Cursor& link) {
debug_printf("pushPage(link=%s)\n", link.get().toString(false).c_str());
return map(readPage(btree,
reason,
path.back().btPage()->height - 1,
pager.getPtr(),
link.get().getChildPage(),
ioMaxPriority,
false,
!options.present() || options.get().cacheResult || path.back().btPage()->height != 2),
[=](Reference<const ArenaPage> p) {
BTreePage::BinaryTree::Cursor cursor = btree->getCursor(p.getPtr(), link);
#if REDWOOD_DEBUG
path.push_back({ p, cursor, link.get().getChildPage() });
#else
path.push_back({ p, cursor });
#endif
if (btree->m_pBoundaryVerifier != nullptr) {
Optional<int64_t> domainId;
if (p->isEncrypted() && btree->m_keyProvider->enableEncryptionDomain()) {
domainId = p->getEncryptionDomainId();
}
ASSERT(btree->m_pBoundaryVerifier->verify(link.get().getChildPage().front(),
pager->getVersion(),
link.get().key,
link.next().getOrUpperBound().key,
domainId,
cursor));
}
return Void();
});
}
Future<Void> pushPage(BTreeNodeLinkRef id) {
debug_printf("pushPage(root=%s)\n", ::toString(id).c_str());
return map(readPage(btree, reason, btree->m_header.height, pager.getPtr(), id, ioMaxPriority, false, true),
[=](Reference<const ArenaPage> p) {
#if REDWOOD_DEBUG
path.push_back({ p, btree->getCursor(p.getPtr(), dbBegin, dbEnd), id });
#else
path.push_back({ p, btree->getCursor(p.getPtr(), dbBegin, dbEnd) });
#endif
return Void();
});
}
// Initialize or reinitialize cursor
Future<Void> init(VersionedBTree* btree_in,
PagerEventReasons reason_in,
Optional<ReadOptions> options_in,
Reference<IPagerSnapshot> pager_in,
BTreeNodeLink root) {
btree = btree_in;
reason = reason_in;
options = options_in;
pager = pager_in;
path.clear();
path.reserve(6);
valid = false;
return root.empty() ? Void() : pushPage(root);
}
// Seeks cursor to query if it exists, the record before or after it, or an undefined and invalid
// position between those records
// If 0 is returned, then
// If the cursor is valid then it points to query
// If the cursor is not valid then the cursor points to some place in the btree such that
// If there is a record in the tree < query then movePrev() will move to it, and
// If there is a record in the tree > query then moveNext() will move to it.
// If non-zero is returned then the cursor is valid and the return value is logically equivalent
// to query.compare(cursor.get())
ACTOR Future<int> seek_impl(BTreeCursor* self, RedwoodRecordRef query) {
state RedwoodRecordRef internalPageQuery = query.withMaxPageID();
self->path.resize(1);
debug_printf("seek(%s) start cursor = %s\n", query.toString().c_str(), self->toString().c_str());
loop {
auto& entry = self->path.back();
if (entry.btPage()->isLeaf()) {
int cmp = entry.cursor.seek(query);
self->valid = entry.cursor.valid() && !entry.cursor.isErased();
debug_printf("seek(%s) loop exit cmp=%d cursor=%s\n",
query.toString().c_str(),
cmp,
self->toString().c_str());
return self->valid ? cmp : 0;
}
// Internal page, so seek to the branch where query must be
// Currently, after a subtree deletion internal page boundaries are still strictly adhered
// to and will be updated if anything is inserted into the cleared range, so if the seek fails
// or it finds an entry with a null child page then query does not exist in the BTree.
if (entry.cursor.seekLessThan(internalPageQuery) && entry.cursor.get().value.present()) {
debug_printf(
"seek(%s) loop seek success cursor=%s\n", query.toString().c_str(), self->toString().c_str());
Future<Void> f = self->pushPage(entry.cursor);
wait(f);
} else {
self->valid = false;
debug_printf(
"seek(%s) loop exit cmp=0 cursor=%s\n", query.toString().c_str(), self->toString().c_str());
return 0;
}
}
}
Future<int> seek(RedwoodRecordRef query) { return path.empty() ? 0 : seek_impl(this, query); }
ACTOR Future<Void> seekGTE_impl(BTreeCursor* self, RedwoodRecordRef query) {
debug_printf("seekGTE(%s) start\n", query.toString().c_str());
int cmp = wait(self->seek(query));
if (cmp > 0 || (cmp == 0 && !self->isValid())) {
wait(self->moveNext());
}
return Void();
}
Future<Void> seekGTE(RedwoodRecordRef query) { return seekGTE_impl(this, query); }
// Start fetching sibling nodes in the forward or backward direction, stopping after recordLimit or byteLimit
void prefetch(KeyRef rangeEnd, bool directionForward, int recordLimit, int byteLimit) {
// Prefetch scans level 2 so if there are less than 2 nodes in the path there is no level 2
if (path.size() < 2) {
return;
}
auto firstLeaf = path.back().btPage();
// We know the first leaf's record count, so assume they are all relevant to the query,
// even though some may not be.
int recordsRead = firstLeaf->tree()->numItems;
// We can't know for sure how many records are in a node without reading it, so just guess
// that siblings have about the same record count as the first leaf.
int estRecordsPerPage = recordsRead;
// Use actual KVBytes stored for the first leaf, but use node capacity for siblings below
int bytesRead = firstLeaf->kvBytes;
// Cursor for moving through siblings.
// Note that only immediate siblings under the same parent are considered for prefetch so far.
BTreePage::BinaryTree::Cursor c = path[path.size() - 2].cursor;
ASSERT(path[path.size() - 2].btPage()->height == 2);
// The loop conditions are split apart into different if blocks for readability.
// While query limits are not exceeded
while (recordsRead < recordLimit && bytesRead < byteLimit) {
// If prefetching right siblings
if (directionForward) {
// If there is no right sibling or its lower boundary is greater
// or equal to than the range end then stop.
if (!c.moveNext() || c.get().key >= rangeEnd) {
break;
}
} else {
// Prefetching left siblings
// If the current leaf lower boundary is less than or equal to the range end
// or there is no left sibling then stop
if (c.get().key <= rangeEnd || !c.movePrev()) {
break;
}
}
// Prefetch the sibling if the link is not null
if (c.get().value.present()) {
BTreeNodeLinkRef childPage = c.get().getChildPage();
if (childPage.size() > 0)
preLoadPage(pager.getPtr(), childPage, ioLeafPriority);
recordsRead += estRecordsPerPage;
// Use sibling node capacity as an estimate of bytes read.
bytesRead += childPage.size() * this->btree->m_blockSize;
}
}
}
ACTOR Future<Void> seekLT_impl(BTreeCursor* self, RedwoodRecordRef query) {
debug_printf("seekLT(%s) start\n", query.toString().c_str());
int cmp = wait(self->seek(query));
if (cmp <= 0) {
wait(self->movePrev());
}
return Void();
}
Future<Void> seekLT(RedwoodRecordRef query) { return seekLT_impl(this, query); }
ACTOR Future<Void> move_impl(BTreeCursor* self, bool forward) {
// Try to the move cursor at the end of the path in the correct direction
debug_printf("move%s() start cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
while (1) {
debug_printf("move%s() first loop cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
auto& entry = self->path.back();
bool success;
if (entry.cursor.valid()) {
success = forward ? entry.cursor.moveNext() : entry.cursor.movePrev();
} else {
success = forward ? entry.cursor.moveFirst() : false;
}
// Skip over internal page entries that do not link to child pages. There should never be two in a row.
if (success && !entry.btPage()->isLeaf() && !entry.cursor.get().value.present()) {
success = forward ? entry.cursor.moveNext() : entry.cursor.movePrev();
ASSERT(!success || entry.cursor.get().value.present());
}
// Stop if successful
if (success) {
break;
}
if (self->path.size() == 1) {
self->valid = false;
debug_printf("move%s() exit cursor=%s\n", forward ? "Next" : "Prev", self->toString(1).c_str());
return Void();
}
// Move to parent
self->path.pop_back();
}
// While not on a leaf page, move down to get to one.
while (1) {
debug_printf("move%s() second loop cursor=%s\n", forward ? "Next" : "Prev", self->toString().c_str());
auto& entry = self->path.back();
if (entry.btPage()->isLeaf()) {
break;
}
// The last entry in an internal page could be a null link, if so move back
if (!forward && !entry.cursor.get().value.present()) {
UNSTOPPABLE_ASSERT(entry.cursor.movePrev());
UNSTOPPABLE_ASSERT(entry.cursor.get().value.present());
}
wait(self->pushPage(entry.cursor));
auto& newEntry = self->path.back();
UNSTOPPABLE_ASSERT(forward ? newEntry.cursor.moveFirst() : newEntry.cursor.moveLast());
}
self->valid = true;
debug_printf("move%s() exit cursor=%s\n", forward ? "Next" : "Prev", self->toString(1).c_str());
return Void();
}
Future<Void> moveNext() { return path.empty() ? Void() : move_impl(this, true); }
Future<Void> movePrev() { return path.empty() ? Void() : move_impl(this, false); }
};
Future<Void> initBTreeCursor(BTreeCursor* cursor,
Version snapshotVersion,
PagerEventReasons reason,
Optional<ReadOptions> options = Optional<ReadOptions>()) {
Reference<IPagerSnapshot> snapshot = m_pager->getReadSnapshot(snapshotVersion);
BTreeNodeLinkRef root;
// Parse the metakey just once to get the root pointer and store it as the extra object
if (!snapshot->extra.valid()) {
KeyRef m = snapshot->getMetaKey();
if (!m.empty()) {
BTreeCommitHeader h = ObjectReader::fromStringRef<VersionedBTree::BTreeCommitHeader>(m, Unversioned());
root = h.root;
// Copy the BTreeNodeLink but keep the same arena and BTreeNodeLinkRef
snapshot->extra = new BTreeNodeLink(h.root, h.root.arena());
}
} else {
root = *snapshot->extra.getPtr<BTreeNodeLink>();
}
return cursor->init(this, reason, options, snapshot, root);
}
};
#include "fdbserver/art_impl.h"
RedwoodRecordRef VersionedBTree::dbBegin(""_sr);
RedwoodRecordRef VersionedBTree::dbEnd("\xff\xff\xff\xff\xff"_sr);
class KeyValueStoreRedwood : public IKeyValueStore {
public:
KeyValueStoreRedwood(std::string filename, UID logID, Reference<IPageEncryptionKeyProvider> encryptionKeyProvider)
: m_filename(filename), prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) {
int pageSize =
BUGGIFY ? deterministicRandom()->randomInt(1000, 4096 * 4) : SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE;
int extentSize = SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_SIZE;
int64_t pageCacheBytes =
g_network->isSimulated()
? (BUGGIFY ? deterministicRandom()->randomInt(pageSize, FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K)
: FLOW_KNOBS->SIM_PAGE_CACHE_4K)
: FLOW_KNOBS->PAGE_CACHE_4K;
// Rough size of pages to keep in remap cleanup queue before being cleanup.
int64_t remapCleanupWindowBytes =
g_network->isSimulated()
? (BUGGIFY ? (deterministicRandom()->coinflip()
? deterministicRandom()->randomInt64(0, 100 * 1024) // small window
: deterministicRandom()->randomInt64(0, 100 * 1024 * 1024)) // large window
: 100 * 1024 * 1024) // 100M
: SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_WINDOW_BYTES;
EncodingType encodingType = EncodingType::XXHash64;
// When reopening Redwood on restart, the cluser encryption config could be unknown at this point,
// for which shouldEnableEncryption will return false. In that case, if the Redwood instance was encrypted
// before, the encoding type in the header page will be used instead.
//
// TODO(yiwu): When the cluster encryption config is available later, fail if the cluster is configured to
// enable encryption, but the Redwood instance is unencrypted.
if (encryptionKeyProvider && encryptionKeyProvider->enableEncryption()) {
ASSERT(encryptionKeyProvider->expectedEncodingType() == EncodingType::AESEncryptionV1);
encodingType = EncodingType::AESEncryptionV1;
m_keyProvider = encryptionKeyProvider;
} else if (g_network->isSimulated() && logID.hash() % 2 == 0) {
// Simulation only. Deterministically enable encryption based on uid
encodingType = EncodingType::XOREncryption_TestOnly;
m_keyProvider = makeReference<XOREncryptionKeyProvider_TestOnly>(filename);
}
IPager2* pager = new DWALPager(pageSize,
extentSize,
filename,
pageCacheBytes,
remapCleanupWindowBytes,
SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS,
false,
m_keyProvider,
m_error);
m_tree = new VersionedBTree(pager, filename, encodingType, m_keyProvider);
m_init = catchError(init_impl(this));
}
Future<Void> init() override { return m_init; }
ACTOR Future<Void> init_impl(KeyValueStoreRedwood* self) {
TraceEvent(SevInfo, "RedwoodInit").detail("FilePrefix", self->m_filename);
wait(self->m_tree->init());
TraceEvent(SevInfo, "RedwoodInitComplete")
.detail("Filename", self->m_filename)
.detail("Version", self->m_tree->getLastCommittedVersion());
self->m_nextCommitVersion = self->m_tree->getLastCommittedVersion() + 1;
return Void();
}
ACTOR void shutdown(KeyValueStoreRedwood* self, bool dispose) {
TraceEvent(SevInfo, "RedwoodShutdown").detail("Filename", self->m_filename).detail("Dispose", dispose);
g_redwoodMetrics.ioLock = nullptr;
// In simulation, if the instance is being disposed of then sometimes run destructive sanity check.
if (g_network->isSimulated() && dispose && BUGGIFY) {
// Only proceed if the last commit is a success, but don't throw if it's not because shutdown
// should not throw.
wait(ready(self->m_lastCommit));
if (!self->m_lastCommit.isError()) {
// Run the destructive sanity check, but don't throw.
ErrorOr<Void> err = wait(errorOr(self->m_tree->clearAllAndCheckSanity()));
// If the test threw an error, it must be an injected fault or something has gone wrong.
ASSERT(!err.isError() || err.getError().isInjectedFault());
}
} else {
// The KVS user shouldn't be holding a commit future anymore so self shouldn't either.
self->m_lastCommit = Void();
}
if (self->m_error.canBeSet()) {
self->m_error.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress
}
self->m_init.cancel();
Future<Void> closedFuture = self->m_tree->onClosed();
if (dispose)
self->m_tree->dispose();
else
self->m_tree->close();
wait(closedFuture);
self->m_closed.send(Void());
TraceEvent(SevInfo, "RedwoodShutdownComplete").detail("Filename", self->m_filename).detail("Dispose", dispose);
delete self;
}
void close() override { shutdown(this, false); }
void dispose() override { shutdown(this, true); }
Future<Void> onClosed() const override { return m_closed.getFuture(); }
Future<Void> commit(bool sequential = false) override {
m_lastCommit = catchError(m_tree->commit(m_nextCommitVersion));
// Currently not keeping history
m_tree->setOldestReadableVersion(m_nextCommitVersion);
++m_nextCommitVersion;
return m_lastCommit;
}
KeyValueStoreType getType() const override { return KeyValueStoreType::SSD_REDWOOD_V1; }
StorageBytes getStorageBytes() const override { return m_tree->getStorageBytes(); }
Future<Void> getError() const override { return delayed(m_error.getFuture()); };
void clear(KeyRangeRef range,
const StorageServerMetrics* storageMetrics = nullptr,
const Arena* arena = 0) override {
debug_printf("CLEAR %s\n", printable(range).c_str());
m_tree->clear(range);
}
void set(KeyValueRef keyValue, const Arena* arena = nullptr) override {
debug_printf("SET %s\n", printable(keyValue).c_str());
m_tree->set(keyValue);
}
Future<RangeResult> readRange(KeyRangeRef keys,
int rowLimit,
int byteLimit,
Optional<ReadOptions> options) override {
debug_printf("READRANGE %s\n", printable(keys).c_str());
return catchError(readRange_impl(this, keys, rowLimit, byteLimit, options));
}
ACTOR static Future<RangeResult> readRange_impl(KeyValueStoreRedwood* self,
KeyRange keys,
int rowLimit,
int byteLimit,
Optional<ReadOptions> options) {
state PagerEventReasons reason = PagerEventReasons::RangeRead;
state VersionedBTree::BTreeCursor cur;
if (options.present() && options.get().type == ReadType::FETCH) {
reason = PagerEventReasons::FetchRange;
}
wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion(), reason, options));
state PriorityMultiLock::Lock lock;
state Future<Void> f;
++g_redwoodMetrics.metric.opGetRange;
state RangeResult result;
state int accumulatedBytes = 0;
ASSERT(byteLimit > 0);
if (rowLimit == 0) {
return result;
}
if (rowLimit > 0) {
f = cur.seekGTE(keys.begin);
if (f.isReady()) {
CODE_PROBE(true, "Cached forward range read seek");
f.get();
} else {
CODE_PROBE(true, "Uncached forward range read seek");
wait(f);
}
if (self->prefetch) {
cur.prefetch(keys.end, true, rowLimit, byteLimit);
}
while (cur.isValid()) {
// Read leaf page contents without using waits by using the leaf page cursor directly
// and advancing it until it is no longer valid
BTreePage::BinaryTree::Cursor& leafCursor = cur.back().cursor;
// we can bypass the bounds check for each key in the leaf if the entire leaf is in range
// > because both query end and page upper bound are exclusive of the query results and page contents,
// respectively
bool checkBounds = leafCursor.cache->upperBound > keys.end;
// Whether or not any results from this page were added to results
bool usedPage = false;
while (leafCursor.valid()) {
KeyValueRef kv = leafCursor.get().toKeyValueRef();
if (checkBounds && kv.key.compare(keys.end) >= 0) {
break;
}
accumulatedBytes += kv.expectedSize();
result.push_back(result.arena(), kv);
usedPage = true;
if (--rowLimit == 0 || accumulatedBytes >= byteLimit) {
break;
}
leafCursor.moveNext();
}
// If the page was used, results must depend on the ArenaPage arena and the Mirror arena.
// This must be done after visiting all the results in case the Mirror arena changes.
if (usedPage) {
result.arena().dependsOn(leafCursor.cache->arena);
result.arena().dependsOn(cur.back().page->getArena());
}
// Stop if the leaf cursor is still valid which means we hit a key or size limit or
// if the cursor is in the root page, in which case there are no more pages.
if (leafCursor.valid() || cur.inRoot()) {
break;
}
cur.popPath();
wait(cur.moveNext());
}
} else {
f = cur.seekLT(keys.end);
if (f.isReady()) {
CODE_PROBE(true, "Cached reverse range read seek");
f.get();
} else {
CODE_PROBE(true, "Uncached reverse range read seek");
wait(f);
}
if (self->prefetch) {
cur.prefetch(keys.begin, false, -rowLimit, byteLimit);
}
while (cur.isValid()) {
// Read leaf page contents without using waits by using the leaf page cursor directly
// and advancing it until it is no longer valid
BTreePage::BinaryTree::Cursor& leafCursor = cur.back().cursor;
// we can bypass the bounds check for each key in the leaf if the entire leaf is in range
// < because both query begin and page lower bound are inclusive of the query results and page contents,
// respectively
bool checkBounds = leafCursor.cache->lowerBound < keys.begin;
// Whether or not any results from this page were added to results
bool usedPage = false;
while (leafCursor.valid()) {
KeyValueRef kv = leafCursor.get().toKeyValueRef();
if (checkBounds && kv.key.compare(keys.begin) < 0) {
break;
}
accumulatedBytes += kv.expectedSize();
result.push_back(result.arena(), kv);
usedPage = true;
if (++rowLimit == 0 || accumulatedBytes >= byteLimit) {
break;
}
leafCursor.movePrev();
}
// If the page was used, results must depend on the ArenaPage arena and the Mirror arena.
// This must be done after visiting all the results in case the Mirror arena changes.
if (usedPage) {
result.arena().dependsOn(leafCursor.cache->arena);
result.arena().dependsOn(cur.back().page->getArena());
}
// Stop if the leaf cursor is still valid which means we hit a key or size limit or
// if we started in the root page
if (leafCursor.valid() || cur.inRoot()) {
break;
}
cur.popPath();
wait(cur.movePrev());
}
}
result.more = rowLimit == 0 || accumulatedBytes >= byteLimit;
if (result.more) {
ASSERT(result.size() > 0);
result.readThrough = result[result.size() - 1].key;
}
g_redwoodMetrics.kvSizeReadByGetRange->sample(accumulatedBytes);
return result;
}
ACTOR static Future<Optional<Value>> readValue_impl(KeyValueStoreRedwood* self,
Key key,
Optional<ReadOptions> options) {
state VersionedBTree::BTreeCursor cur;
wait(self->m_tree->initBTreeCursor(
&cur, self->m_tree->getLastCommittedVersion(), PagerEventReasons::PointRead, options));
++g_redwoodMetrics.metric.opGet;
wait(cur.seekGTE(key));
if (cur.isValid() && cur.get().key == key) {
// Return a Value whose arena depends on the source page arena
Value v;
v.arena().dependsOn(cur.back().page->getArena());
v.contents() = cur.get().value.get();
g_redwoodMetrics.kvSizeReadByGet->sample(cur.get().kvBytes());
return v;
}
return Optional<Value>();
}
Future<Optional<Value>> readValue(KeyRef key, Optional<ReadOptions> options) override {
return catchError(readValue_impl(this, key, options));
}
Future<Optional<Value>> readValuePrefix(KeyRef key, int maxLength, Optional<ReadOptions> options) override {
return catchError(map(readValue_impl(this, key, options), [maxLength](Optional<Value> v) {
if (v.present() && v.get().size() > maxLength) {
v.get().contents() = v.get().substr(0, maxLength);
}
return v;
}));
}
~KeyValueStoreRedwood() override{};
private:
std::string m_filename;
VersionedBTree* m_tree;
Future<Void> m_init;
Promise<Void> m_closed;
Promise<Void> m_error;
bool prefetch;
Version m_nextCommitVersion;
Reference<IPageEncryptionKeyProvider> m_keyProvider;
Future<Void> m_lastCommit = Void();
template <typename T>
inline Future<T> catchError(Future<T> f) {
return forwardError(f, m_error);
}
};
IKeyValueStore* keyValueStoreRedwoodV1(std::string const& filename,
UID logID,
Reference<IPageEncryptionKeyProvider> encryptionKeyProvider) {
return new KeyValueStoreRedwood(filename, logID, encryptionKeyProvider);
}
int randomSize(int max) {
int n = pow(deterministicRandom()->random01(), 3) * max;
return n;
}
StringRef randomString(Arena& arena, int len, char firstChar = 'a', char lastChar = 'z') {
++lastChar;
StringRef s = makeString(len, arena);
for (int i = 0; i < len; ++i) {
*(uint8_t*)(s.begin() + i) = (uint8_t)deterministicRandom()->randomInt(firstChar, lastChar);
}
return s;
}
Standalone<StringRef> randomString(int len, char firstChar = 'a', char lastChar = 'z') {
Standalone<StringRef> s;
(StringRef&)s = randomString(s.arena(), len, firstChar, lastChar);
return s;
}
KeyValue randomKV(int maxKeySize = 10, int maxValueSize = 5) {
int kLen = randomSize(1 + maxKeySize);
int vLen = maxValueSize > 0 ? randomSize(maxValueSize) : 0;
KeyValue kv;
kv.key = randomString(kv.arena(), kLen, 'a', 'm');
for (int i = 0; i < kLen; ++i)
mutateString(kv.key)[i] = (uint8_t)deterministicRandom()->randomInt('a', 'm');
if (vLen > 0) {
kv.value = randomString(kv.arena(), vLen, 'n', 'z');
for (int i = 0; i < vLen; ++i)
mutateString(kv.value)[i] = (uint8_t)deterministicRandom()->randomInt('o', 'z');
}
return kv;
}
// Verify a range using a BTreeCursor.
// Assumes that the BTree holds a single data version and the version is 0.
ACTOR Future<Void> verifyRangeBTreeCursor(VersionedBTree* btree,
Key start,
Key end,
Version v,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written,
int64_t* pRecordsRead) {
if (end <= start)
end = keyAfter(start);
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator i =
written->lower_bound(std::make_pair(start.toString(), 0));
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iEnd =
written->upper_bound(std::make_pair(end.toString(), initialVersion));
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iLast;
state VersionedBTree::BTreeCursor cur;
wait(btree->initBTreeCursor(&cur, v, PagerEventReasons::RangeRead));
debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Start\n", v, start.printable().c_str(), end.printable().c_str());
// Randomly use the cursor for something else first.
if (deterministicRandom()->coinflip()) {
state Key randomKey = randomKV().key;
debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n",
v,
start.printable().c_str(),
end.printable().c_str(),
randomKey.toString().c_str());
wait(success(cur.seek(randomKey)));
}
debug_printf(
"VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.printable().c_str(), end.printable().c_str());
wait(cur.seekGTE(start));
state Standalone<VectorRef<KeyValueRef>> results;
while (cur.isValid() && cur.get().key < end) {
// Find the next written kv pair that would be present at this version
while (1) {
// Since the written map grows, range scans become less efficient so count all records written
// at any version against the records read count
++*pRecordsRead;
iLast = i;
if (i == iEnd)
break;
++i;
if (iLast->first.second <= v && iLast->second.present() &&
(i == iEnd || i->first.first != iLast->first.first || i->first.second > v)) {
debug_printf("VerifyRange(@%" PRId64 ", %s, %s) Found key in written map: %s\n",
v,
start.printable().c_str(),
end.printable().c_str(),
iLast->first.first.c_str());
break;
}
}
if (iLast == iEnd) {
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR:BTree key '%s' vs nothing in written map.\n",
v,
start.printable().c_str(),
end.printable().c_str(),
cur.get().key.toString().c_str());
ASSERT(false);
}
if (cur.get().key != iLast->first.first) {
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR:BTree key '%s' but expected '%s'\n",
v,
start.printable().c_str(),
end.printable().c_str(),
cur.get().key.toString().c_str(),
iLast->first.first.c_str());
ASSERT(false);
}
if (cur.get().value.get() != iLast->second.get()) {
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR:BTree key '%s' has tree value '%s' but expected '%s'\n",
v,
start.printable().c_str(),
end.printable().c_str(),
cur.get().key.toString().c_str(),
cur.get().value.get().toString().c_str(),
iLast->second.get().c_str());
ASSERT(false);
}
results.push_back(results.arena(), cur.get().toKeyValueRef());
results.arena().dependsOn(cur.back().cursor.cache->arena);
results.arena().dependsOn(cur.back().page->getArena());
wait(cur.moveNext());
}
// Make sure there are no further written kv pairs that would be present at this version.
while (1) {
iLast = i;
if (i == iEnd)
break;
++i;
if (iLast->first.second <= v && iLast->second.present() &&
(i == iEnd || i->first.first != iLast->first.first || i->first.second > v))
break;
}
if (iLast != iEnd) {
printf("VerifyRange(@%" PRId64 ", %s, %s) ERROR: BTree range ended but written has @%" PRId64 " '%s'\n",
v,
start.printable().c_str(),
end.printable().c_str(),
iLast->first.second,
iLast->first.first.c_str());
ASSERT(false);
}
debug_printf(
"VerifyRangeReverse(@%" PRId64 ", %s, %s): start\n", v, start.printable().c_str(), end.printable().c_str());
// Randomly use a new cursor at the same version for the reverse range read, if the version is still available for
// opening new cursors
if (v >= btree->getOldestReadableVersion() && deterministicRandom()->coinflip()) {
cur = VersionedBTree::BTreeCursor();
wait(btree->initBTreeCursor(&cur, v, PagerEventReasons::RangeRead));
}
// Now read the range from the tree in reverse order and compare to the saved results
wait(cur.seekLT(end));
state std::reverse_iterator<const KeyValueRef*> r = results.rbegin();
while (cur.isValid() && cur.get().key >= start) {
++*pRecordsRead;
if (r == results.rend()) {
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR:BTree key '%s' vs nothing in written map.\n",
v,
start.printable().c_str(),
end.printable().c_str(),
cur.get().key.toString().c_str());
ASSERT(false);
}
if (cur.get().key != r->key) {
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR:BTree key '%s' but expected '%s'\n",
v,
start.printable().c_str(),
end.printable().c_str(),
cur.get().key.toString().c_str(),
r->key.toString().c_str());
ASSERT(false);
}
if (cur.get().value.get() != r->value) {
printf("VerifyRangeReverse(@%" PRId64
", %s, %s) ERROR:BTree key '%s' has tree value '%s' but expected '%s'\n",
v,
start.printable().c_str(),
end.printable().c_str(),
cur.get().key.toString().c_str(),
cur.get().value.get().toString().c_str(),
r->value.toString().c_str());
ASSERT(false);
}
++r;
wait(cur.movePrev());
}
if (r != results.rend()) {
printf("VerifyRangeReverse(@%" PRId64 ", %s, %s) ERROR: BTree range ended but written has '%s'\n",
v,
start.printable().c_str(),
end.printable().c_str(),
r->key.toString().c_str());
ASSERT(false);
}
return Void();
}
// Verify the result of point reads for every set or cleared key change made at exactly v
ACTOR Future<Void> seekAllBTreeCursor(VersionedBTree* btree,
Version v,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written,
int64_t* pRecordsRead) {
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator i = written->cbegin();
state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iEnd = written->cend();
state VersionedBTree::BTreeCursor cur;
wait(btree->initBTreeCursor(&cur, v, PagerEventReasons::RangeRead));
while (i != iEnd) {
// Since the written map gets larger and takes longer to scan each time, count visits to all written recs
++*pRecordsRead;
state std::string key = i->first.first;
state Version ver = i->first.second;
if (ver == v) {
state Optional<std::string> val = i->second;
debug_printf("Verifying @%" PRId64 " '%s'\n", ver, key.c_str());
state Arena arena;
wait(cur.seekGTE(RedwoodRecordRef(KeyRef(arena, key))));
bool foundKey = cur.isValid() && cur.get().key == key;
bool hasValue = foundKey && cur.get().value.present();
if (val.present()) {
bool valueMatch = hasValue && cur.get().value.get() == val.get();
if (!foundKey || !hasValue || !valueMatch) {
if (!foundKey) {
printf("Verify ERROR: key_not_found: '%s' -> '%s' @%" PRId64 "\n",
key.c_str(),
val.get().c_str(),
ver);
} else if (!hasValue) {
printf("Verify ERROR: value_not_found: '%s' -> '%s' @%" PRId64 "\n",
key.c_str(),
val.get().c_str(),
ver);
} else if (!valueMatch) {
printf("Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%" PRId64 "\n",
key.c_str(),
cur.get().value.get().toString().c_str(),
val.get().c_str(),
ver);
}
ASSERT(false);
}
} else if (foundKey && hasValue) {
printf("Verify ERROR: cleared_key_found: '%s' -> '%s' @%" PRId64 "\n",
key.c_str(),
cur.get().value.get().toString().c_str(),
ver);
ASSERT(false);
}
}
++i;
}
return Void();
}
ACTOR Future<Void> verify(VersionedBTree* btree,
FutureStream<Version> vStream,
std::map<std::pair<std::string, Version>, Optional<std::string>>* written,
int64_t* pRecordsRead,
bool serial) {
// Queue of committed versions still readable from btree
state std::deque<Version> committedVersions;
try {
loop {
state Version v = waitNext(vStream);
committedVersions.push_back(v);
// Remove expired versions
while (!committedVersions.empty() && committedVersions.front() < btree->getOldestReadableVersion()) {
committedVersions.pop_front();
}
// Continue if the versions list is empty, which won't wait until it reaches the oldest readable
// btree version which will already be in vStream.
if (committedVersions.empty()) {
continue;
}
// Choose a random committed version.
v = committedVersions[deterministicRandom()->randomInt(0, committedVersions.size())];
debug_printf("Using committed version %" PRId64 "\n", v);
// Get a cursor at v so that v doesn't get expired between the possibly serial steps below.
state VersionedBTree::BTreeCursor cur;
wait(btree->initBTreeCursor(&cur, v, PagerEventReasons::RangeRead));
debug_printf("Verifying entire key range at version %" PRId64 "\n", v);
state Future<Void> fRangeAll =
verifyRangeBTreeCursor(btree, ""_sr, "\xff\xff"_sr, v, written, pRecordsRead);
if (serial) {
wait(fRangeAll);
}
Key begin = randomKV().key;
Key end = randomKV().key;
debug_printf(
"Verifying range (%s, %s) at version %" PRId64 "\n", toString(begin).c_str(), toString(end).c_str(), v);
state Future<Void> fRangeRandom = verifyRangeBTreeCursor(btree, begin, end, v, written, pRecordsRead);
if (serial) {
wait(fRangeRandom);
}
debug_printf("Verifying seeks to each changed key at version %" PRId64 "\n", v);
state Future<Void> fSeekAll = seekAllBTreeCursor(btree, v, written, pRecordsRead);
if (serial) {
wait(fSeekAll);
}
wait(fRangeAll && fRangeRandom && fSeekAll);
printf("Verified version %" PRId64 "\n", v);
}
} catch (Error& e) {
if (e.code() != error_code_end_of_stream) {
throw;
}
}
return Void();
}
// Does a random range read, doesn't trap/report errors
ACTOR Future<Void> randomReader(VersionedBTree* btree, int64_t* pRecordsRead) {
state VersionedBTree::BTreeCursor cur;
loop {
wait(yield());
if (!cur.intialized() || deterministicRandom()->random01() > .01) {
wait(btree->initBTreeCursor(&cur, btree->getLastCommittedVersion(), PagerEventReasons::RangeRead));
}
state KeyValue kv = randomKV(10, 0);
wait(cur.seekGTE(kv.key));
state int c = deterministicRandom()->randomInt(0, 100);
state bool direction = deterministicRandom()->coinflip();
while (cur.isValid() && c-- > 0) {
++*pRecordsRead;
wait(success(direction ? cur.moveNext() : cur.movePrev()));
wait(yield());
}
}
}
struct IntIntPair {
IntIntPair() {}
IntIntPair(int k, int v) : k(k), v(v) {}
IntIntPair(Arena& arena, const IntIntPair& toCopy) { *this = toCopy; }
typedef IntIntPair Partial;
void updateCache(Optional<Partial> cache, Arena& arena) const {}
struct Delta {
bool prefixSource;
bool deleted;
int dk;
int dv;
IntIntPair apply(const IntIntPair& base, Arena& arena) { return { base.k + dk, base.v + dv }; }
IntIntPair apply(const Partial& cache) { return cache; }
IntIntPair apply(Arena& arena, const IntIntPair& base, Optional<Partial>& cache) {
cache = IntIntPair(base.k + dk, base.v + dv);
return cache.get();
}
void setPrefixSource(bool val) { prefixSource = val; }
bool getPrefixSource() const { return prefixSource; }
void setDeleted(bool val) { deleted = val; }
bool getDeleted() const { return deleted; }
int size() const { return sizeof(Delta); }
std::string toString() const {
return format(
"DELTA{prefixSource=%d deleted=%d dk=%d(0x%x) dv=%d(0x%x)}", prefixSource, deleted, dk, dk, dv, dv);
}
};
// For IntIntPair, skipLen will be in units of fields, not bytes
int getCommonPrefixLen(const IntIntPair& other, int skip = 0) const {
if (k == other.k) {
if (v == other.v) {
return 2;
}
return 1;
}
return 0;
}
int compare(const IntIntPair& rhs, int skip = 0) const {
if (skip == 2) {
return 0;
}
int cmp = (skip > 0) ? 0 : (k - rhs.k);
if (cmp == 0) {
cmp = v - rhs.v;
}
return cmp;
}
bool operator==(const IntIntPair& rhs) const { return compare(rhs) == 0; }
bool operator!=(const IntIntPair& rhs) const { return compare(rhs) != 0; }
bool operator<(const IntIntPair& rhs) const { return compare(rhs) < 0; }
bool operator>(const IntIntPair& rhs) const { return compare(rhs) > 0; }
bool operator<=(const IntIntPair& rhs) const { return compare(rhs) <= 0; }
bool operator>=(const IntIntPair& rhs) const { return compare(rhs) >= 0; }
int deltaSize(const IntIntPair& base, int skipLen, bool worstcase) const { return sizeof(Delta); }
int writeDelta(Delta& d, const IntIntPair& base, int commonPrefix = -1) const {
d.prefixSource = false;
d.deleted = false;
d.dk = k - base.k;
d.dv = v - base.v;
return sizeof(Delta);
}
int k;
int v;
std::string toString() const { return format("{k=%d(0x%x) v=%d(0x%x)}", k, k, v, v); }
};
int deltaTest(RedwoodRecordRef rec, RedwoodRecordRef base) {
std::vector<uint8_t> buf(rec.key.size() + rec.value.orDefault(StringRef()).size() + 20);
RedwoodRecordRef::Delta& d = *(RedwoodRecordRef::Delta*)&buf.front();
Arena mem;
int expectedSize = rec.deltaSize(base, 0, false);
int deltaSize = rec.writeDelta(d, base);
RedwoodRecordRef decoded = d.apply(base, mem);
if (decoded != rec || expectedSize != deltaSize || d.size() != deltaSize) {
printf("\n");
printf("Base: %s\n", base.toString().c_str());
printf("Record: %s\n", rec.toString().c_str());
printf("Decoded: %s\n", decoded.toString().c_str());
printf("deltaSize(): %d\n", expectedSize);
printf("writeDelta(): %d\n", deltaSize);
printf("d.size(): %d\n", d.size());
printf("DeltaToString: %s\n", d.toString().c_str());
printf("RedwoodRecordRef::Delta test failure!\n");
ASSERT(false);
}
return deltaSize;
}
RedwoodRecordRef randomRedwoodRecordRef(const std::string& keyBuffer, const std::string& valueBuffer) {
RedwoodRecordRef rec;
rec.key = StringRef((uint8_t*)keyBuffer.data(), deterministicRandom()->randomInt(0, keyBuffer.size()));
if (deterministicRandom()->coinflip()) {
rec.value = StringRef((uint8_t*)valueBuffer.data(), deterministicRandom()->randomInt(0, valueBuffer.size()));
}
return rec;
}
void RedwoodMetrics::getFields(TraceEvent* e, std::string* s, bool skipZeroes) {
std::pair<const char*, unsigned int> metrics[] = { { "BTreePreload", metric.btreeLeafPreload },
{ "BTreePreloadExt", metric.btreeLeafPreloadExt },
{ "", 0 },
{ "OpSet", metric.opSet },
{ "OpSetKeyBytes", metric.opSetKeyBytes },
{ "OpSetValueBytes", metric.opSetValueBytes },
{ "OpClear", metric.opClear },
{ "OpClearKey", metric.opClearKey },
{ "", 0 },
{ "OpGet", metric.opGet },
{ "OpGetRange", metric.opGetRange },
{ "OpCommit", metric.opCommit },
{ "", 0 },
{ "PagerDiskWrite", metric.pagerDiskWrite },
{ "PagerDiskRead", metric.pagerDiskRead },
{ "PagerCacheHit", metric.pagerCacheHit },
{ "PagerCacheMiss", metric.pagerCacheMiss },
{ "", 0 },
{ "PagerProbeHit", metric.pagerProbeHit },
{ "PagerProbeMiss", metric.pagerProbeMiss },
{ "PagerEvictUnhit", metric.pagerEvictUnhit },
{ "PagerEvictFail", metric.pagerEvictFail },
{ "", 0 },
{ "PagerRemapFree", metric.pagerRemapFree },
{ "PagerRemapCopy", metric.pagerRemapCopy },
{ "PagerRemapSkip", metric.pagerRemapSkip },
{ "", 0 } };
double elapsed = now() - startTime;
if (e != nullptr) {
for (auto& m : metrics) {
char c = m.first[0];
if (c != 0 && (!skipZeroes || m.second != 0)) {
e->detail(m.first, m.second);
}
}
levels[0].metrics.events.toTraceEvent(e, 0);
}
if (s != nullptr) {
for (auto& m : metrics) {
if (*m.first == '\0') {
*s += "\n";
} else if (!skipZeroes || m.second != 0) {
*s += format("%-15s %-8u %8" PRId64 "/s ", m.first, m.second, int64_t(m.second / elapsed));
}
}
*s += levels[0].metrics.events.toString(0, elapsed);
}
auto const& evictor = DWALPager::PageCacheT::Evictor::getEvictor();
std::pair<const char*, int64_t> cacheMetrics[] = { { "PageCacheCount", evictor->getCountUsed() },
{ "PageCacheMoved", evictor->getCountMoved() },
{ "PageCacheSize", evictor->getSizeUsed() },
{ "DecodeCacheSize", evictor->reservedSize } };
if (e != nullptr) {
for (auto& m : cacheMetrics) {
e->detail(m.first, m.second);
}
}
if (s != nullptr) {
for (auto& m : cacheMetrics) {
*s += format("%-15s %-14" PRId64 " ", m.first, m.second);
}
*s += "\n";
}
for (int i = 1; i < btreeLevels + 1; ++i) {
auto& metric = levels[i].metrics;
std::pair<const char*, unsigned int> metrics[] = {
{ "PageBuild", metric.pageBuild },
{ "PageBuildExt", metric.pageBuildExt },
{ "PageModify", metric.pageModify },
{ "PageModifyExt", metric.pageModifyExt },
{ "", 0 },
{ "PageRead", metric.pageRead },
{ "PageReadExt", metric.pageReadExt },
{ "PageCommitStart", metric.pageCommitStart },
{ "", 0 },
{ "LazyClearInt", metric.lazyClearRequeue },
{ "LazyClearIntExt", metric.lazyClearRequeueExt },
{ "LazyClear", metric.lazyClearFree },
{ "LazyClearExt", metric.lazyClearFreeExt },
{ "", 0 },
{ "ForceUpdate", metric.forceUpdate },
{ "DetachChild", metric.detachChild },
{ "", 0 },
};
if (e != nullptr) {
for (auto& m : metrics) {
char c = m.first[0];
if (c != 0 && (!skipZeroes || m.second != 0)) {
e->detail(format("L%d%s", i, m.first + (c == '-' ? 1 : 0)), m.second);
}
}
metric.events.toTraceEvent(e, i);
}
if (s != nullptr) {
*s += format("\nLevel %d\n\t", i);
for (auto& m : metrics) {
const char* name = m.first;
bool rate = elapsed != 0;
if (*name == '-') {
++name;
rate = false;
}
if (*name == '\0') {
*s += "\n\t";
} else if (!skipZeroes || m.second != 0) {
*s += format("%-15s %8u %8u/s ", name, m.second, rate ? int(m.second / elapsed) : 0);
}
}
*s += metric.events.toString(i, elapsed);
}
}
}
void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) {
if (ioLock == nullptr)
return;
int maxPriority = ioLock->maxPriority();
if (e != nullptr) {
e->detail("ActiveReads", ioLock->totalRunners());
e->detail("AwaitReads", ioLock->totalWaiters());
for (int priority = 0; priority <= maxPriority; ++priority) {
e->detail(format("ActiveP%d", priority), ioLock->numRunners(priority));
e->detail(format("AwaitP%d", priority), ioLock->numWaiters(priority));
}
}
if (s != nullptr) {
std::string active = "Active";
std::string await = "Await";
*s += "\n";
*s += format("%-15s %-8u ", "ActiveReads", ioLock->totalRunners());
*s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters());
*s += "\n";
for (int priority = 0; priority <= maxPriority; ++priority) {
*s +=
format("%-15s %-8u ", (active + 'P' + std::to_string(priority)).c_str(), ioLock->numRunners(priority));
}
*s += "\n";
for (int priority = 0; priority <= maxPriority; ++priority) {
*s +=
format("%-15s %-8u ", (await + 'P' + std::to_string(priority)).c_str(), ioLock->numWaiters(priority));
}
}
}
TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") {
ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[0] == 3);
ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[1] == 4);
ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[2] == 6);
ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[3] == 8);
fmt::print("sizeof(RedwoodRecordRef) = {}\n", sizeof(RedwoodRecordRef));
// Test pageID stuff.
{
LogicalPageID ids[] = { 1, 5 };
BTreeNodeLinkRef id(ids, 2);
RedwoodRecordRef r;
r.setChildPage(id);
ASSERT(r.getChildPage() == id);
ASSERT(r.getChildPage().begin() == id.begin());
Standalone<RedwoodRecordRef> r2 = r;
ASSERT(r2.getChildPage() == id);
ASSERT(r2.getChildPage().begin() != id.begin());
}
deltaTest(RedwoodRecordRef(""_sr, ""_sr), RedwoodRecordRef(""_sr, ""_sr));
deltaTest(RedwoodRecordRef("abc"_sr, ""_sr), RedwoodRecordRef("abc"_sr, ""_sr));
deltaTest(RedwoodRecordRef("abc"_sr, ""_sr), RedwoodRecordRef("abcd"_sr, ""_sr));
deltaTest(RedwoodRecordRef("abcd"_sr, ""_sr), RedwoodRecordRef("abc"_sr, ""_sr));
deltaTest(RedwoodRecordRef(std::string(300, 'k'), std::string(1e6, 'v')),
RedwoodRecordRef(std::string(300, 'k'), ""_sr));
deltaTest(RedwoodRecordRef(""_sr, ""_sr), RedwoodRecordRef(""_sr, ""_sr));
deltaTest(RedwoodRecordRef(""_sr, ""_sr), RedwoodRecordRef(""_sr, ""_sr));
deltaTest(RedwoodRecordRef(""_sr, ""_sr), RedwoodRecordRef(""_sr, ""_sr));
deltaTest(RedwoodRecordRef(""_sr, ""_sr), RedwoodRecordRef(""_sr, ""_sr));
deltaTest(RedwoodRecordRef(""_sr, ""_sr), RedwoodRecordRef(""_sr, ""_sr));
Arena mem;
double start;
uint64_t total;
uint64_t count;
uint64_t i;
int64_t bytes;
std::string keyBuffer(30000, 'k');
std::string valueBuffer(70000, 'v');
start = timer();
count = 1000;
bytes = 0;
for (i = 0; i < count; ++i) {
RedwoodRecordRef a = randomRedwoodRecordRef(keyBuffer, valueBuffer);
RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer);
bytes += deltaTest(a, b);
}
double elapsed = timer() - start;
printf("DeltaTest() on random large records %f M/s %f MB/s\n", count / elapsed / 1e6, bytes / elapsed / 1e6);
keyBuffer.resize(30);
valueBuffer.resize(100);
start = timer();
count = 1e6;
bytes = 0;
for (i = 0; i < count; ++i) {
RedwoodRecordRef a = randomRedwoodRecordRef(keyBuffer, valueBuffer);
RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer);
bytes += deltaTest(a, b);
}
printf("DeltaTest() on random small records %f M/s %f MB/s\n", count / elapsed / 1e6, bytes / elapsed / 1e6);
RedwoodRecordRef rec1;
RedwoodRecordRef rec2;
rec1.key = "alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf1"_sr;
rec2.key = "alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf234"_sr;
start = timer();
total = 0;
count = 100e6;
for (i = 0; i < count; ++i) {
total += rec1.getCommonPrefixLen(rec2, 50);
}
printf("%" PRId64 " getCommonPrefixLen(skip=50) %f M/s\n", total, count / (timer() - start) / 1e6);
start = timer();
total = 0;
count = 100e6;
for (i = 0; i < count; ++i) {
total += rec1.getCommonPrefixLen(rec2, 0);
}
printf("%" PRId64 " getCommonPrefixLen(skip=0) %f M/s\n", total, count / (timer() - start) / 1e6);
char buf[1000];
RedwoodRecordRef::Delta& d = *(RedwoodRecordRef::Delta*)buf;
start = timer();
total = 0;
count = 100e6;
int commonPrefix = rec1.getCommonPrefixLen(rec2, 0);
for (i = 0; i < count; ++i) {
total += rec1.writeDelta(d, rec2, commonPrefix);
}
printf("%" PRId64 " writeDelta(commonPrefix=%d) %f M/s\n", total, commonPrefix, count / (timer() - start) / 1e6);
start = timer();
total = 0;
count = 10e6;
for (i = 0; i < count; ++i) {
total += rec1.writeDelta(d, rec2);
}
printf("%" PRId64 " writeDelta() %f M/s\n", total, count / (timer() - start) / 1e6);
start = timer();
total = 0;
count = 10e6;
for (i = 0; i < count; ++i) {
total += rec1.compare(rec2, 0);
}
printf("%" PRId64 " compare(skip=0) %f M/s\n", total, count / (timer() - start) / 1e6);
start = timer();
total = 0;
count = 10e6;
for (i = 0; i < count; ++i) {
total += rec1.compare(rec2, 50);
}
printf("%" PRId64 " compare(skip=50) %f M/s\n", total, count / (timer() - start) / 1e6);
return Void();
}
TEST_CASE("Lredwood/correctness/unit/deltaTree/RedwoodRecordRef") {
// Sanity check on delta tree node format
ASSERT(DeltaTree2<RedwoodRecordRef>::Node::headerSize(false) == 4);
ASSERT(DeltaTree2<RedwoodRecordRef>::Node::headerSize(true) == 8);
const int N = deterministicRandom()->randomInt(200, 1000);
RedwoodRecordRef prev;
RedwoodRecordRef next("\xff\xff\xff\xff"_sr);
Arena arena;
std::set<RedwoodRecordRef> uniqueItems;
// Add random items to uniqueItems until its size is N
while (uniqueItems.size() < N) {
std::string k = deterministicRandom()->randomAlphaNumeric(30);
std::string v = deterministicRandom()->randomAlphaNumeric(30);
RedwoodRecordRef rec;
rec.key = StringRef(arena, k);
if (deterministicRandom()->coinflip()) {
rec.value = StringRef(arena, v);
}
if (uniqueItems.count(rec) == 0) {
uniqueItems.insert(rec);
}
}
std::vector<RedwoodRecordRef> items(uniqueItems.begin(), uniqueItems.end());
int bufferSize = N * 100;
bool largeTree = bufferSize > DeltaTree<RedwoodRecordRef>::SmallSizeLimit;
DeltaTree<RedwoodRecordRef>* tree = (DeltaTree<RedwoodRecordRef>*)new uint8_t[bufferSize];
tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next);
printf("Count=%d Size=%d InitialHeight=%d largeTree=%d\n",
(int)items.size(),
(int)tree->size(),
(int)tree->initialHeight,
largeTree);
debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str());
DeltaTree<RedwoodRecordRef>::Mirror r(tree, &prev, &next);
// Test delete/insert behavior for each item, making no net changes
printf("Testing seek/delete/insert for existing keys with random values\n");
ASSERT(tree->numItems == items.size());
for (auto rec : items) {
// Insert existing should fail
ASSERT(!r.insert(rec));
ASSERT(tree->numItems == items.size());
// Erase existing should succeed
ASSERT(r.erase(rec));
ASSERT(tree->numItems == items.size() - 1);
// Erase deleted should fail
ASSERT(!r.erase(rec));
ASSERT(tree->numItems == items.size() - 1);
// Insert deleted should succeed
ASSERT(r.insert(rec));
ASSERT(tree->numItems == items.size());
// Insert existing should fail
ASSERT(!r.insert(rec));
ASSERT(tree->numItems == items.size());
}
DeltaTree<RedwoodRecordRef>::Cursor fwd = r.getCursor();
DeltaTree<RedwoodRecordRef>::Cursor rev = r.getCursor();
DeltaTree<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly>::Mirror rValuesOnly(tree, &prev, &next);
DeltaTree<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly>::Cursor fwdValueOnly = rValuesOnly.getCursor();
printf("Verifying tree contents using forward, reverse, and value-only iterators\n");
ASSERT(fwd.moveFirst());
ASSERT(fwdValueOnly.moveFirst());
ASSERT(rev.moveLast());
int i = 0;
while (1) {
if (fwd.get() != items[i]) {
printf("forward iterator i=%d\n %s found\n %s expected\n",
i,
fwd.get().toString().c_str(),
items[i].toString().c_str());
printf("Delta: %s\n", fwd.node->raw->delta(largeTree).toString().c_str());
ASSERT(false);
}
if (rev.get() != items[items.size() - 1 - i]) {
printf("reverse iterator i=%d\n %s found\n %s expected\n",
i,
rev.get().toString().c_str(),
items[items.size() - 1 - i].toString().c_str());
printf("Delta: %s\n", rev.node->raw->delta(largeTree).toString().c_str());
ASSERT(false);
}
if (fwdValueOnly.get().value != items[i].value) {
printf("forward values-only iterator i=%d\n %s found\n %s expected\n",
i,
fwdValueOnly.get().toString().c_str(),
items[i].toString().c_str());
printf("Delta: %s\n", fwdValueOnly.node->raw->delta(largeTree).toString().c_str());
ASSERT(false);
}
++i;
bool more = fwd.moveNext();
ASSERT(fwdValueOnly.moveNext() == more);
ASSERT(rev.movePrev() == more);
ASSERT(fwd.valid() == more);
ASSERT(fwdValueOnly.valid() == more);
ASSERT(rev.valid() == more);
if (!fwd.valid()) {
break;
}
}
ASSERT(i == items.size());
{
DeltaTree<RedwoodRecordRef>::Mirror mirror(tree, &prev, &next);
DeltaTree<RedwoodRecordRef>::Cursor c = mirror.getCursor();
printf("Doing 20M random seeks using the same cursor from the same mirror.\n");
double start = timer();
for (int i = 0; i < 20000000; ++i) {
const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())];
if (!c.seekLessThanOrEqual(query)) {
printf("Not found! query=%s\n", query.toString().c_str());
ASSERT(false);
}
if (c.get() != query) {
printf("Found incorrect node! query=%s found=%s\n",
query.toString().c_str(),
c.get().toString().c_str());
ASSERT(false);
}
}
double elapsed = timer() - start;
printf("Elapsed %f\n", elapsed);
}
{
printf("Doing 5M random seeks using 10k random cursors, each from a different mirror.\n");
double start = timer();
std::vector<DeltaTree<RedwoodRecordRef>::Mirror*> mirrors;
std::vector<DeltaTree<RedwoodRecordRef>::Cursor> cursors;
for (int i = 0; i < 10000; ++i) {
mirrors.push_back(new DeltaTree<RedwoodRecordRef>::Mirror(tree, &prev, &next));
cursors.push_back(mirrors.back()->getCursor());
}
for (int i = 0; i < 5000000; ++i) {
const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())];
DeltaTree<RedwoodRecordRef>::Cursor& c = cursors[deterministicRandom()->randomInt(0, cursors.size())];
if (!c.seekLessThanOrEqual(query)) {
printf("Not found! query=%s\n", query.toString().c_str());
ASSERT(false);
}
if (c.get() != query) {
printf("Found incorrect node! query=%s found=%s\n",
query.toString().c_str(),
c.get().toString().c_str());
ASSERT(false);
}
}
double elapsed = timer() - start;
printf("Elapsed %f\n", elapsed);
}
return Void();
}
TEST_CASE("Lredwood/correctness/unit/deltaTree/RedwoodRecordRef2") {
// Sanity check on delta tree node format
ASSERT(DeltaTree2<RedwoodRecordRef>::Node::headerSize(false) == 4);
ASSERT(DeltaTree2<RedwoodRecordRef>::Node::headerSize(true) == 8);
ASSERT(sizeof(DeltaTree2<RedwoodRecordRef>::DecodedNode) == 28);
const int N = deterministicRandom()->randomInt(200, 1000);
RedwoodRecordRef prev;
RedwoodRecordRef next("\xff\xff\xff\xff"_sr);
Arena arena;
std::set<RedwoodRecordRef> uniqueItems;
// Add random items to uniqueItems until its size is N
while (uniqueItems.size() < N) {
std::string k = deterministicRandom()->randomAlphaNumeric(30);
std::string v = deterministicRandom()->randomAlphaNumeric(30);
RedwoodRecordRef rec;
rec.key = StringRef(arena, k);
if (deterministicRandom()->coinflip()) {
rec.value = StringRef(arena, v);
}
if (uniqueItems.count(rec) == 0) {
uniqueItems.insert(rec);
}
}
std::vector<RedwoodRecordRef> items(uniqueItems.begin(), uniqueItems.end());
int bufferSize = N * 100;
bool largeTree = bufferSize > DeltaTree2<RedwoodRecordRef>::SmallSizeLimit;
DeltaTree2<RedwoodRecordRef>* tree = (DeltaTree2<RedwoodRecordRef>*)new uint8_t[bufferSize];
tree->build(bufferSize, &items[0], &items[items.size()], &prev, &next);
printf("Count=%d Size=%d InitialHeight=%d largeTree=%d\n",
(int)items.size(),
(int)tree->size(),
(int)tree->initialHeight,
largeTree);
debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str());
DeltaTree2<RedwoodRecordRef>::Cursor c(makeReference<DeltaTree2<RedwoodRecordRef>::DecodeCache>(prev, next), tree);
// Test delete/insert behavior for each item, making no net changes
printf("Testing seek/delete/insert for existing keys with random values\n");
ASSERT(tree->numItems == items.size());
for (auto rec : items) {
// Insert existing should fail
ASSERT(!c.insert(rec));
ASSERT(tree->numItems == items.size());
// Erase existing should succeed
ASSERT(c.erase(rec));
ASSERT(tree->numItems == items.size() - 1);
// Erase deleted should fail
ASSERT(!c.erase(rec));
ASSERT(tree->numItems == items.size() - 1);
// Insert deleted should succeed
ASSERT(c.insert(rec));
ASSERT(tree->numItems == items.size());
// Insert existing should fail
ASSERT(!c.insert(rec));
ASSERT(tree->numItems == items.size());
}
DeltaTree2<RedwoodRecordRef>::Cursor fwd = c;
DeltaTree2<RedwoodRecordRef>::Cursor rev = c;
DeltaTree2<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly>::Cursor fwdValueOnly(
makeReference<DeltaTree2<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly>::DecodeCache>(prev, next),
(DeltaTree2<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly>*)tree);
printf("Verifying tree contents using forward, reverse, and value-only iterators\n");
ASSERT(fwd.moveFirst());
ASSERT(fwdValueOnly.moveFirst());
ASSERT(rev.moveLast());
int i = 0;
while (1) {
if (fwd.get() != items[i]) {
printf("forward iterator i=%d\n %s found\n %s expected\n",
i,
fwd.get().toString().c_str(),
items[i].toString().c_str());
printf("Cursor: %s\n", fwd.toString().c_str());
ASSERT(false);
}
if (rev.get() != items[items.size() - 1 - i]) {
printf("reverse iterator i=%d\n %s found\n %s expected\n",
i,
rev.get().toString().c_str(),
items[items.size() - 1 - i].toString().c_str());
printf("Cursor: %s\n", rev.toString().c_str());
ASSERT(false);
}
if (fwdValueOnly.get().value != items[i].value) {
printf("forward values-only iterator i=%d\n %s found\n %s expected\n",
i,
fwdValueOnly.get().toString().c_str(),
items[i].toString().c_str());
printf("Cursor: %s\n", fwdValueOnly.toString().c_str());
ASSERT(false);
}
++i;
bool more = fwd.moveNext();
ASSERT(fwdValueOnly.moveNext() == more);
ASSERT(rev.movePrev() == more);
ASSERT(fwd.valid() == more);
ASSERT(fwdValueOnly.valid() == more);
ASSERT(rev.valid() == more);
if (!fwd.valid()) {
break;
}
}
ASSERT(i == items.size());
{
DeltaTree2<RedwoodRecordRef>::Cursor c(makeReference<DeltaTree2<RedwoodRecordRef>::DecodeCache>(prev, next),
tree);
printf("Doing 20M random seeks using the same cursor from the same mirror.\n");
double start = timer();
for (int i = 0; i < 20000000; ++i) {
const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())];
if (!c.seekLessThanOrEqual(query)) {
printf("Not found! query=%s\n", query.toString().c_str());
ASSERT(false);
}
if (c.get() != query) {
printf("Found incorrect node! query=%s found=%s\n",
query.toString().c_str(),
c.get().toString().c_str());
ASSERT(false);
}
}
double elapsed = timer() - start;
printf("Elapsed %f\n", elapsed);
}
// {
// printf("Doing 5M random seeks using 10k random cursors, each from a different mirror.\n");
// double start = timer();
// std::vector<DeltaTree2<RedwoodRecordRef>::Mirror*> mirrors;
// std::vector<DeltaTree2<RedwoodRecordRef>::Cursor> cursors;
// for (int i = 0; i < 10000; ++i) {
// mirrors.push_back(new DeltaTree2<RedwoodRecordRef>::Mirror(tree, &prev, &next));
// cursors.push_back(mirrors.back()->getCursor());
// }
// for (int i = 0; i < 5000000; ++i) {
// const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())];
// DeltaTree2<RedwoodRecordRef>::Cursor& c = cursors[deterministicRandom()->randomInt(0, cursors.size())];
// if (!c.seekLessThanOrEqual(query)) {
// printf("Not found! query=%s\n", query.toString().c_str());
// ASSERT(false);
// }
// if (c.get() != query) {
// printf("Found incorrect node! query=%s found=%s\n",
// query.toString().c_str(),
// c.get().toString().c_str());
// ASSERT(false);
// }
// }
// double elapsed = timer() - start;
// printf("Elapsed %f\n", elapsed);
// }
return Void();
}
TEST_CASE("Lredwood/correctness/unit/deltaTree/IntIntPair") {
const int N = 200;
IntIntPair lowerBound = { 0, 0 };
IntIntPair upperBound = { 1000, 1000 };
state std::function<IntIntPair()> randomPair = [&]() {
// Generate a pair >= lowerBound and < upperBound
int k = deterministicRandom()->randomInt(lowerBound.k, upperBound.k + 1);
int v = deterministicRandom()->randomInt(lowerBound.v, upperBound.v);
// Only generate even values so the tests below can approach and find each
// key with a directional seek of the adjacent absent value on either side.
v -= v % 2;
return IntIntPair(k, v);
};
// Build a set of N unique items, where no consecutive items are in the set, a requirement of the seek behavior
// tests.
std::set<IntIntPair> uniqueItems;
while (uniqueItems.size() < N) {
IntIntPair p = randomPair();
auto nextP = p; // also check if next highest/lowest key is not in set
nextP.v++;
auto prevP = p;
prevP.v--;
if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) {
uniqueItems.insert(p);
}
}
// Build tree of items
std::vector<IntIntPair> items(uniqueItems.begin(), uniqueItems.end());
int bufferSize = N * 2 * 30;
DeltaTree<IntIntPair>* tree = (DeltaTree<IntIntPair>*)new uint8_t[bufferSize];
int builtSize = tree->build(bufferSize, &items[0], &items[items.size()], &lowerBound, &upperBound);
ASSERT(builtSize <= bufferSize);
DeltaTree<IntIntPair>::Mirror r(tree, &lowerBound, &upperBound);
DeltaTree2<IntIntPair>* tree2 = (DeltaTree2<IntIntPair>*)new uint8_t[bufferSize];
int builtSize2 = tree2->build(bufferSize, &items[0], &items[0] + items.size(), &lowerBound, &upperBound);
ASSERT(builtSize2 <= bufferSize);
auto cache = makeReference<DeltaTree2<IntIntPair>::DecodeCache>(lowerBound, upperBound);
DeltaTree2<IntIntPair>::Cursor cur2(cache, tree2);
auto printItems = [&] {
for (int k = 0; k < items.size(); ++k) {
debug_printf("%d/%zu %s\n", k + 1, items.size(), items[k].toString().c_str());
}
};
auto printTrees = [&] {
printf("DeltaTree: Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n",
(int)tree->numItems,
(int)tree->size(),
(int)tree->initialHeight,
(int)tree->maxHeight);
debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str());
printf("DeltaTree2: Count=%d Size=%d InitialHeight=%d MaxHeight=%d\n",
(int)tree2->numItems,
(int)tree2->size(),
(int)tree2->initialHeight,
(int)tree2->maxHeight);
debug_printf("Data(%p): %s\n", tree2, StringRef((uint8_t*)tree2, tree2->size()).toHexString().c_str());
};
// Iterate through items and tree forward and backward, verifying tree contents.
auto scanAndVerify = [&]() {
printf("Verify DeltaTree contents.\n");
DeltaTree<IntIntPair>::Cursor fwd = r.getCursor();
DeltaTree<IntIntPair>::Cursor rev = r.getCursor();
ASSERT(fwd.moveFirst());
ASSERT(rev.moveLast());
for (int i = 0; i < items.size(); ++i) {
if (fwd.get() != items[i]) {
printItems();
printf("forward iterator i=%d\n %s found\n %s expected\n",
i,
fwd.get().toString().c_str(),
items[i].toString().c_str());
ASSERT(false);
}
if (rev.get() != items[items.size() - 1 - i]) {
printItems();
printf("reverse iterator i=%d\n %s found\n %s expected\n",
i,
rev.get().toString().c_str(),
items[items.size() - 1 - i].toString().c_str());
ASSERT(false);
}
// Advance iterator, check scanning cursors for correct validity state
int j = i + 1;
bool end = j == items.size();
ASSERT(fwd.moveNext() == !end);
ASSERT(rev.movePrev() == !end);
ASSERT(fwd.valid() == !end);
ASSERT(rev.valid() == !end);
if (end) {
break;
}
}
};
// Iterate through items and tree forward and backward, verifying tree contents.
auto scanAndVerify2 = [&]() {
printf("Verify DeltaTree2 contents.\n");
DeltaTree2<IntIntPair>::Cursor fwd(cache, tree2);
DeltaTree2<IntIntPair>::Cursor rev(cache, tree2);
ASSERT(fwd.moveFirst());
ASSERT(rev.moveLast());
for (int i = 0; i < items.size(); ++i) {
if (fwd.get() != items[i]) {
printItems();
printf("forward iterator i=%d\n %s found\n %s expected\n",
i,
fwd.get().toString().c_str(),
items[i].toString().c_str());
ASSERT(false);
}
if (rev.get() != items[items.size() - 1 - i]) {
printItems();
printf("reverse iterator i=%d\n %s found\n %s expected\n",
i,
rev.get().toString().c_str(),
items[items.size() - 1 - i].toString().c_str());
ASSERT(false);
}
// Advance iterator, check scanning cursors for correct validity state
int j = i + 1;
bool end = j == items.size();
ASSERT(fwd.moveNext() == !end);
ASSERT(rev.movePrev() == !end);
ASSERT(fwd.valid() == !end);
ASSERT(rev.valid() == !end);
if (end) {
break;
}
}
};
printItems();
printTrees();
// Verify tree contents
scanAndVerify();
scanAndVerify2();
// Grow uniqueItems until tree is full, adding half of new items to toDelete
std::vector<IntIntPair> toDelete;
int maxInsert = 9999999;
bool shouldBeFull = false;
while (maxInsert-- > 0) {
IntIntPair p = randomPair();
// Insert record if it, its predecessor, and its successor are not present.
// Test data is intentionally sparse to test finding each record with a directional
// seek from each adjacent possible but not present record.
if (uniqueItems.count(p) == 0 && uniqueItems.count(IntIntPair(p.k, p.v - 1)) == 0 &&
uniqueItems.count(IntIntPair(p.k, p.v + 1)) == 0) {
if (!cur2.insert(p)) {
shouldBeFull = true;
break;
};
ASSERT(r.insert(p));
uniqueItems.insert(p);
if (deterministicRandom()->coinflip()) {
toDelete.push_back(p);
}
// printf("Inserted %s size=%d\n", items.back().toString().c_str(), tree->size());
}
}
// If the tree refused to insert an item, the count should be at least 2*N
ASSERT(!shouldBeFull || tree->numItems > 2 * N);
ASSERT(tree->size() <= bufferSize);
// Update items vector
items = std::vector<IntIntPair>(uniqueItems.begin(), uniqueItems.end());
printItems();
printTrees();
// Verify tree contents
scanAndVerify();
scanAndVerify2();
// Create a new mirror, decoding the tree from scratch since insert() modified both the tree and the mirror
r = DeltaTree<IntIntPair>::Mirror(tree, &lowerBound, &upperBound);
cache->clear();
scanAndVerify();
scanAndVerify2();
// For each randomly selected new item to be deleted, delete it from the DeltaTree2 and from uniqueItems
printf("Deleting some items\n");
for (auto p : toDelete) {
uniqueItems.erase(p);
DeltaTree<IntIntPair>::Cursor c = r.getCursor();
ASSERT(c.seekLessThanOrEqual(p));
c.erase();
ASSERT(cur2.seekLessThanOrEqual(p));
cur2.erase();
}
// Update items vector
items = std::vector<IntIntPair>(uniqueItems.begin(), uniqueItems.end());
printItems();
printTrees();
// Verify tree contents after deletions
scanAndVerify();
scanAndVerify2();
printf("Verifying insert/erase behavior for existing items\n");
// Test delete/insert behavior for each item, making no net changes
for (auto p : items) {
// Insert existing should fail
ASSERT(!r.insert(p));
ASSERT(!cur2.insert(p));
// Erase existing should succeed
ASSERT(r.erase(p));
ASSERT(cur2.erase(p));
// Erase deleted should fail
ASSERT(!r.erase(p));
ASSERT(!cur2.erase(p));
// Insert deleted should succeed
ASSERT(r.insert(p));
ASSERT(cur2.insert(p));
// Insert existing should fail
ASSERT(!r.insert(p));
ASSERT(!cur2.insert(p));
}
printItems();
printTrees();
// Tree contents should still match items vector
scanAndVerify();
scanAndVerify2();
printf("Verifying seek behaviors\n");
DeltaTree<IntIntPair>::Cursor s = r.getCursor();
DeltaTree2<IntIntPair>::Cursor s2(cache, tree2);
// SeekLTE to each element
for (int i = 0; i < items.size(); ++i) {
IntIntPair p = items[i];
IntIntPair q = p;
ASSERT(s.seekLessThanOrEqual(q));
if (s.get() != p) {
printItems();
printf("seekLessThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
ASSERT(s2.seekLessThanOrEqual(q));
if (s2.get() != p) {
printItems();
printf("seekLessThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s2.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
}
// SeekGTE to each element
for (int i = 0; i < items.size(); ++i) {
IntIntPair p = items[i];
IntIntPair q = p;
ASSERT(s.seekGreaterThanOrEqual(q));
if (s.get() != p) {
printItems();
printf("seekGreaterThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
ASSERT(s2.seekGreaterThanOrEqual(q));
if (s2.get() != p) {
printItems();
printf("seekGreaterThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s2.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
}
// SeekLTE to the next possible int pair value after each element to make sure the base element is found
// Assumes no consecutive items are present in the set
for (int i = 0; i < items.size(); ++i) {
IntIntPair p = items[i];
IntIntPair q = p;
q.v++;
ASSERT(s.seekLessThanOrEqual(q));
if (s.get() != p) {
printItems();
printf("seekLessThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
ASSERT(s2.seekLessThanOrEqual(q));
if (s2.get() != p) {
printItems();
printf("seekLessThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s2.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
}
// SeekGTE to the previous possible int pair value after each element to make sure the base element is found
// Assumes no consecutive items are present in the set
for (int i = 0; i < items.size(); ++i) {
IntIntPair p = items[i];
IntIntPair q = p;
q.v--;
ASSERT(s.seekGreaterThanOrEqual(q));
if (s.get() != p) {
printItems();
printf("seekGreaterThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
ASSERT(s2.seekGreaterThanOrEqual(q));
if (s2.get() != p) {
printItems();
printf("seekGreaterThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s2.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
}
// SeekLTE to each element N times, using every element as a hint
for (int i = 0; i < items.size(); ++i) {
IntIntPair p = items[i];
IntIntPair q = p;
for (int j = 0; j < items.size(); ++j) {
ASSERT(s.seekLessThanOrEqual(items[j]));
ASSERT(s.seekLessThanOrEqual(q, 0, &s));
if (s.get() != p) {
printItems();
printf("i=%d j=%d\n", i, j);
printf("seekLessThanOrEqual(%s) found %s expected %s\n",
q.toString().c_str(),
s.get().toString().c_str(),
p.toString().c_str());
ASSERT(false);
}
}
}
// SeekLTE to each element's next possible value, using each element as a hint
// Assumes no consecutive items are present in the set
for (int i = 0; i < items.size(); ++i) {
IntIntPair p = items[i];
IntIntPair q = p;
q.v++;
for (int j = 0; j < items.size(); ++j) {
ASSERT(s.seekLessThanOrEqual(items[j]));
ASSERT(s.seekLessThanOrEqual(q, 0, &s));
if (s.get() != p) {
printItems();
printf("i=%d j=%d\n", i, j);
ASSERT(false);
}
}
}
auto skipSeekPerformance = [&](int jumpMax, bool old, bool useHint, int count) {
// Skip to a series of increasing items, jump by up to jumpMax units forward in the
// items, wrapping around to 0.
double start = timer();
s.moveFirst();
auto first = s;
int pos = 0;
for (int c = 0; c < count; ++c) {
int jump = deterministicRandom()->randomInt(0, jumpMax);
int newPos = pos + jump;
if (newPos >= items.size()) {
pos = 0;
newPos = jump;
s = first;
}
IntIntPair q = items[newPos];
++q.v;
if (old) {
if (useHint) {
s.seekLessThanOrEqualOld(q, 0, &s, newPos - pos);
} else {
s.seekLessThanOrEqualOld(q, 0, nullptr, 0);
}
} else {
if (useHint) {
s.seekLessThanOrEqual(q, 0, &s, newPos - pos);
} else {
s.seekLessThanOrEqual(q);
}
}
pos = newPos;
}
double elapsed = timer() - start;
fmt::print("Seek/skip test, count={0} jumpMax={1}, items={2}, oldSeek={3} useHint={4}: Elapsed {5} seconds "
"{6:.2f} M/s\n",
count,
jumpMax,
items.size(),
old,
useHint,
elapsed,
double(count) / elapsed / 1e6);
};
auto skipSeekPerformance2 = [&](int jumpMax, bool old, bool useHint, int count) {
// Skip to a series of increasing items, jump by up to jumpMax units forward in the
// items, wrapping around to 0.
double start = timer();
s2.moveFirst();
auto first = s2;
int pos = 0;
for (int c = 0; c < count; ++c) {
int jump = deterministicRandom()->randomInt(0, jumpMax);
int newPos = pos + jump;
if (newPos >= items.size()) {
pos = 0;
newPos = jump;
s2 = first;
}
IntIntPair q = items[newPos];
++q.v;
if (old) {
if (useHint) {
// s.seekLessThanOrEqualOld(q, 0, &s, newPos - pos);
} else {
// s.seekLessThanOrEqualOld(q, 0, nullptr, 0);
}
} else {
if (useHint) {
// s.seekLessThanOrEqual(q, 0, &s, newPos - pos);
} else {
s2.seekLessThanOrEqual(q);
}
}
pos = newPos;
}
double elapsed = timer() - start;
fmt::print("DeltaTree2 Seek/skip test, count={0} jumpMax={1}, items={2}, oldSeek={3} useHint={4}: Elapsed {5} "
"seconds "
"{6:.2f} M/s\n",
count,
jumpMax,
items.size(),
old,
useHint,
elapsed,
double(count) / elapsed / 1e6);
};
// Compare seeking to nearby elements with and without hints, using the old and new SeekLessThanOrEqual methods.
// TODO: Once seekLessThanOrEqual() with a hint is as fast as seekLessThanOrEqualOld, remove it.
skipSeekPerformance(8, false, false, 80e6);
skipSeekPerformance2(8, false, false, 80e6);
skipSeekPerformance(8, true, false, 80e6);
skipSeekPerformance(8, true, true, 80e6);
skipSeekPerformance(8, false, true, 80e6);
// Repeatedly seek for one of a set of pregenerated random pairs and time it.
std::vector<IntIntPair> randomPairs;
randomPairs.reserve(10 * N);
for (int i = 0; i < 10 * N; ++i) {
randomPairs.push_back(randomPair());
}
// Random seeks
double start = timer();
for (int i = 0; i < 20000000; ++i) {
IntIntPair p = randomPairs[i % randomPairs.size()];
// Verify the result is less than or equal, and if seek fails then p must be lower than lowest (first) item
if (!s.seekLessThanOrEqual(p)) {
if (p >= items.front()) {
printf("Seek failed! query=%s front=%s\n", p.toString().c_str(), items.front().toString().c_str());
ASSERT(false);
}
} else if (s.get() > p) {
printf("Found incorrect node! query=%s found=%s\n", p.toString().c_str(), s.get().toString().c_str());
ASSERT(false);
}
}
double elapsed = timer() - start;
printf("Random seek test: Elapsed %f\n", elapsed);
return Void();
}
struct SimpleCounter {
SimpleCounter() : x(0), t(timer()), start(t), xt(0) {}
void operator+=(int n) { x += n; }
void operator++() { x++; }
int64_t get() { return x; }
double rate() {
double t2 = timer();
int r = (x - xt) / (t2 - t);
xt = x;
t = t2;
return r;
}
double avgRate() { return x / (timer() - start); }
int64_t x;
double t;
double start;
int64_t xt;
std::string toString() { return format("%" PRId64 "/%.2f/%.2f", x, rate() / 1e6, avgRate() / 1e6); }
};
TEST_CASE(":/redwood/performance/mutationBuffer") {
// This test uses pregenerated short random keys
int count = 10e6;
printf("Generating %d strings...\n", count);
Arena arena;
std::vector<KeyRef> strings;
while (strings.size() < count) {
strings.push_back(randomString(arena, 5));
}
fmt::print("Inserting {} elements and then finding each string...\n", count);
double start = timer();
VersionedBTree::MutationBuffer m;
for (int i = 0; i < count; ++i) {
KeyRef key = strings[i];
auto a = m.insert(key);
auto b = m.lower_bound(key);
ASSERT(a == b);
m.erase(a, b);
}
double elapsed = timer() - start;
printf("count=%d elapsed=%f\n", count, elapsed);
return Void();
}
// This test is only useful with Arena debug statements which show when aligned buffers are allocated and freed.
TEST_CASE(":/redwood/pager/ArenaPage") {
Arena x;
printf("Making p\n");
Reference<ArenaPage> p = makeReference<ArenaPage>(4096, 4096);
printf("Made p=%p\n", p->data());
printf("Clearing p\n");
p.clear();
printf("Making p\n");
p = makeReference<ArenaPage>(4096, 4096);
printf("Made p=%p\n", p->data());
printf("Making x depend on p\n");
x.dependsOn(p->getArena());
printf("Clearing p\n");
p.clear();
printf("Clearing x\n");
x = Arena();
printf("Pointer should be freed\n");
return Void();
}
TEST_CASE("Lredwood/correctness/btree") {
g_redwoodMetricsActor = Void(); // Prevent trace event metrics from starting
g_redwoodMetrics.clear();
state std::string file = params.get("file").orDefault("unittest_pageFile.redwood-v1");
IPager2* pager;
state bool serialTest = params.getInt("serialTest").orDefault(deterministicRandom()->random01() < 0.25);
state bool shortTest = params.getInt("shortTest").orDefault(deterministicRandom()->random01() < 0.25);
state int encoding =
params.getInt("encodingType").orDefault(deterministicRandom()->randomInt(0, EncodingType::MAX_ENCODING_TYPE));
state unsigned int encryptionDomainMode =
params.getInt("domainMode")
.orDefault(deterministicRandom()->randomInt(0, RandomEncryptionKeyProvider::EncryptionDomainMode::MAX));
state int pageSize =
shortTest ? 250 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(250, 400));
state int extentSize =
params.getInt("extentSize")
.orDefault(deterministicRandom()->coinflip() ? SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_SIZE
: deterministicRandom()->randomInt(4096, 32768));
state bool pagerMemoryOnly =
params.getInt("pagerMemoryOnly").orDefault(shortTest && (deterministicRandom()->random01() < .001));
state int maxKeySize = params.getInt("maxKeySize").orDefault(deterministicRandom()->randomInt(1, pageSize * 2));
state int maxValueSize = params.getInt("maxValueSize").orDefault(randomSize(pageSize * 25));
state int maxCommitSize =
params.getInt("maxCommitSize")
.orDefault(shortTest
? 1000
: randomSize((int)std::min<int64_t>((maxKeySize + maxValueSize) * int64_t(20000), 10e6)));
state double setExistingKeyProbability =
params.getDouble("setExistingKeyProbability").orDefault(deterministicRandom()->random01() * .5);
state double clearProbability =
params.getDouble("clearProbability").orDefault(deterministicRandom()->random01() * .1);
state double clearExistingBoundaryProbability =
params.getDouble("clearExistingBoundaryProbability").orDefault(deterministicRandom()->random01() * .5);
state double clearSingleKeyProbability =
params.getDouble("clearSingleKeyProbability").orDefault(deterministicRandom()->random01() * .1);
state double clearKnownNodeBoundaryProbability =
params.getDouble("clearKnownNodeBoundaryProbability").orDefault(deterministicRandom()->random01() * .1);
state double clearPostSetProbability =
params.getDouble("clearPostSetProbability").orDefault(deterministicRandom()->random01() * .1);
state double coldStartProbability =
params.getDouble("coldStartProbability").orDefault(pagerMemoryOnly ? 0 : (deterministicRandom()->random01()));
state double advanceOldVersionProbability =
params.getDouble("advanceOldVersionProbability").orDefault(deterministicRandom()->random01());
state int64_t pageCacheBytes =
params.getInt("pageCacheBytes")
.orDefault(pagerMemoryOnly ? 2e9
: (pageSize * deterministicRandom()->randomInt(1, (BUGGIFY ? 10 : 10000) + 1)));
state Version versionIncrement =
params.getInt("versionIncrement").orDefault(deterministicRandom()->randomInt64(1, 1e8));
state int64_t remapCleanupWindowBytes =
params.getInt("remapCleanupWindowBytes")
.orDefault(BUGGIFY ? 0 : deterministicRandom()->randomInt64(1, 100) * 1024 * 1024);
state int concurrentExtentReads =
params.getInt("concurrentExtentReads").orDefault(SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS);
// These settings are an attempt to keep the test execution real reasonably short
state int64_t maxPageOps = params.getInt("maxPageOps").orDefault((shortTest || serialTest) ? 50e3 : 1e6);
state int maxVerificationMapEntries = params.getInt("maxVerificationMapEntries").orDefault(300e3);
state int maxColdStarts = params.getInt("maxColdStarts").orDefault(300);
// Max number of records in the BTree or the versioned written map to visit
state int64_t maxRecordsRead = params.getInt("maxRecordsRead").orDefault(300e6);
state EncodingType encodingType = static_cast<EncodingType>(encoding);
state Reference<IPageEncryptionKeyProvider> keyProvider;
if (encodingType == EncodingType::AESEncryptionV1) {
keyProvider = makeReference<RandomEncryptionKeyProvider>(
RandomEncryptionKeyProvider::EncryptionDomainMode(encryptionDomainMode));
} else if (encodingType == EncodingType::XOREncryption_TestOnly) {
keyProvider = makeReference<XOREncryptionKeyProvider_TestOnly>(file);
}
printf("\n");
printf("file: %s\n", file.c_str());
printf("maxPageOps: %" PRId64 "\n", maxPageOps);
printf("maxVerificationMapEntries: %d\n", maxVerificationMapEntries);
printf("maxRecordsRead: %" PRId64 "\n", maxRecordsRead);
printf("pagerMemoryOnly: %d\n", pagerMemoryOnly);
printf("serialTest: %d\n", serialTest);
printf("shortTest: %d\n", shortTest);
printf("encodingType: %d\n", encodingType);
printf("domainMode: %d\n", encryptionDomainMode);
printf("pageSize: %d\n", pageSize);
printf("extentSize: %d\n", extentSize);
printf("maxKeySize: %d\n", maxKeySize);
printf("maxValueSize: %d\n", maxValueSize);
printf("maxCommitSize: %d\n", maxCommitSize);
printf("setExistingKeyProbability: %f\n", setExistingKeyProbability);
printf("clearProbability: %f\n", clearProbability);
printf("clearExistingBoundaryProbability: %f\n", clearExistingBoundaryProbability);
printf("clearKnownNodeBoundaryProbability: %f\n", clearKnownNodeBoundaryProbability);
printf("clearSingleKeyProbability: %f\n", clearSingleKeyProbability);
printf("clearPostSetProbability: %f\n", clearPostSetProbability);
printf("coldStartProbability: %f\n", coldStartProbability);
printf("maxColdStarts: %d\n", maxColdStarts);
printf("advanceOldVersionProbability: %f\n", advanceOldVersionProbability);
printf("pageCacheBytes: %s\n", pageCacheBytes == 0 ? "default" : format("%" PRId64, pageCacheBytes).c_str());
printf("versionIncrement: %" PRId64 "\n", versionIncrement);
printf("remapCleanupWindowBytes: %" PRId64 "\n", remapCleanupWindowBytes);
printf("\n");
printf("Deleting existing test data...\n");
deleteFile(file);
printf("Initializing...\n");
pager = new DWALPager(pageSize,
extentSize,
file,
pageCacheBytes,
remapCleanupWindowBytes,
concurrentExtentReads,
pagerMemoryOnly,
keyProvider);
state VersionedBTree* btree = new VersionedBTree(pager, file, encodingType, keyProvider);
wait(btree->init());
state DecodeBoundaryVerifier* pBoundaries = DecodeBoundaryVerifier::getVerifier(file);
state std::map<std::pair<std::string, Version>, Optional<std::string>> written;
state int64_t totalRecordsRead = 0;
state std::set<Key> keys;
state int coldStarts = 0;
Version lastVer = btree->getLastCommittedVersion();
printf("Starting from version: %" PRId64 "\n", lastVer);
state Version version = lastVer + 1;
state SimpleCounter mutationBytes;
state SimpleCounter keyBytesInserted;
state SimpleCounter valueBytesInserted;
state SimpleCounter sets;
state SimpleCounter rangeClears;
state SimpleCounter keyBytesCleared;
state int mutationBytesThisCommit = 0;
state int mutationBytesTargetThisCommit = randomSize(maxCommitSize);
state PromiseStream<Version> committedVersions;
state Future<Void> verifyTask =
verify(btree, committedVersions.getFuture(), &written, &totalRecordsRead, serialTest);
state Future<Void> randomTask = serialTest ? Void() : (randomReader(btree, &totalRecordsRead) || btree->getError());
committedVersions.send(lastVer);
// Sometimes do zero-change commit at last version
if (deterministicRandom()->coinflip()) {
wait(btree->commit(lastVer));
}
state Future<Void> commit = Void();
state int64_t totalPageOps = 0;
// Check test op limits
state std::function<bool()> testFinished = [=]() {
return !(totalPageOps < maxPageOps && written.size() < maxVerificationMapEntries &&
totalRecordsRead < maxRecordsRead);
};
while (!testFinished()) {
// Sometimes increment the version
if (deterministicRandom()->random01() < 0.10) {
++version;
}
// Sometimes do a clear range
if (deterministicRandom()->random01() < clearProbability) {
Key start = randomKV(maxKeySize, 1).key;
Key end = (deterministicRandom()->random01() < .01) ? keyAfter(start) : randomKV(maxKeySize, 1).key;
// Sometimes replace start and/or end with a close actual (previously used) value
if (deterministicRandom()->random01() < clearExistingBoundaryProbability) {
auto i = keys.upper_bound(start);
if (i != keys.end())
start = *i;
}
if (deterministicRandom()->random01() < clearExistingBoundaryProbability) {
auto i = keys.upper_bound(end);
if (i != keys.end())
end = *i;
}
if (!pBoundaries->boundarySamples.empty() &&
deterministicRandom()->random01() < clearKnownNodeBoundaryProbability) {
start = pBoundaries->getSample();
// Can't allow the end boundary to be a start, so just convert to empty string.
if (start == VersionedBTree::dbEnd.key) {
start = Key();
}
}
if (!pBoundaries->boundarySamples.empty() &&
deterministicRandom()->random01() < clearKnownNodeBoundaryProbability) {
end = pBoundaries->getSample();
}
// Do a single key clear based on probability or end being randomly chosen to be the same as begin
// (unlikely)
if (deterministicRandom()->random01() < clearSingleKeyProbability || end == start) {
end = keyAfter(start);
} else if (end < start) {
std::swap(end, start);
}
// Apply clear range to verification map
++rangeClears;
KeyRangeRef range(start, end);
debug_printf(" Mutation: Clear '%s' to '%s' @%" PRId64 "\n",
start.toString().c_str(),
end.toString().c_str(),
version);
auto e = written.lower_bound(std::make_pair(start.toString(), 0));
if (e != written.end()) {
auto last = e;
auto eEnd = written.lower_bound(std::make_pair(end.toString(), 0));
while (e != eEnd) {
auto w = *e;
++e;
// If e key is different from last and last was present then insert clear for last's key at version
if (last != eEnd &&
((e == eEnd || e->first.first != last->first.first) && last->second.present())) {
debug_printf(
" Mutation: Clearing key '%s' @%" PRId64 "\n", last->first.first.c_str(), version);
keyBytesCleared += last->first.first.size();
mutationBytes += last->first.first.size();
mutationBytesThisCommit += last->first.first.size();
// If the last set was at version then just make it not present
if (last->first.second == version) {
last->second.reset();
} else {
written[std::make_pair(last->first.first, version)].reset();
}
}
last = e;
}
}
btree->clear(range);
// Sometimes set the range start after the clear
if (deterministicRandom()->random01() < clearPostSetProbability) {
KeyValue kv = randomKV(0, maxValueSize);
kv.key = range.begin;
btree->set(kv);
written[std::make_pair(kv.key.toString(), version)] = kv.value.toString();
}
} else {
// Set a key
KeyValue kv = randomKV(maxKeySize, maxValueSize);
// Sometimes change key to a close previously used key
if (deterministicRandom()->random01() < setExistingKeyProbability) {
auto i = keys.upper_bound(kv.key);
if (i != keys.end())
kv.key = StringRef(kv.arena(), *i);
}
debug_printf(" Mutation: Set '%s' -> '%s' @%" PRId64 "\n",
kv.key.toString().c_str(),
kv.value.toString().c_str(),
version);
++sets;
keyBytesInserted += kv.key.size();
valueBytesInserted += kv.value.size();
mutationBytes += (kv.key.size() + kv.value.size());
mutationBytesThisCommit += (kv.key.size() + kv.value.size());
btree->set(kv);
written[std::make_pair(kv.key.toString(), version)] = kv.value.toString();
keys.insert(kv.key);
}
// Commit after any limits for this commit or the total test are reached
if (mutationBytesThisCommit >= mutationBytesTargetThisCommit || testFinished()) {
// Wait for previous commit to finish
wait(commit);
printf("Commit complete. Next commit %d bytes, %" PRId64 " bytes committed so far.",
mutationBytesThisCommit,
mutationBytes.get() - mutationBytesThisCommit);
printf(" Stats: Insert %.2f MB/s ClearedKeys %.2f MB/s Total %.2f\n",
(keyBytesInserted.rate() + valueBytesInserted.rate()) / 1e6,
keyBytesCleared.rate() / 1e6,
mutationBytes.rate() / 1e6);
// Sometimes advance the oldest version to close the gap between the oldest and latest versions by a random
// amount.
if (deterministicRandom()->random01() < advanceOldVersionProbability) {
btree->setOldestReadableVersion(
btree->getLastCommittedVersion() -
deterministicRandom()->randomInt64(
0, btree->getLastCommittedVersion() - btree->getOldestReadableVersion() + 1));
}
commit = map(btree->commit(version), [&, v = version](Void) {
// Update pager ops before clearing metrics
totalPageOps += g_redwoodMetrics.pageOps();
fmt::print("Committed {0} PageOps {1}/{2} ({3:.2f}%) VerificationMapEntries {4}/{5} ({6:.2f}%) "
"RecordsRead {7}/{8} ({9:.2f}%)\n",
toString(v).c_str(),
totalPageOps,
maxPageOps,
totalPageOps * 100.0 / maxPageOps,
written.size(),
maxVerificationMapEntries,
written.size() * 100.0 / maxVerificationMapEntries,
totalRecordsRead,
maxRecordsRead,
totalRecordsRead * 100.0 / maxRecordsRead);
printf("Committed:\n%s\n", g_redwoodMetrics.toString(true).c_str());
// Notify the background verifier that version is committed and therefore readable
committedVersions.send(v);
return Void();
});
if (serialTest) {
// Wait for commit, wait for verification, then start new verification
wait(commit);
committedVersions.sendError(end_of_stream());
debug_printf("Waiting for verification to complete.\n");
wait(verifyTask);
committedVersions = PromiseStream<Version>();
verifyTask = verify(btree, committedVersions.getFuture(), &written, &totalRecordsRead, serialTest);
}
mutationBytesThisCommit = 0;
mutationBytesTargetThisCommit = randomSize(maxCommitSize);
// Recover from disk at random
if (!pagerMemoryOnly && coldStarts < maxColdStarts &&
deterministicRandom()->random01() < coldStartProbability) {
++coldStarts;
printf("Recovering from disk after next commit.\n");
// Wait for outstanding commit
debug_printf("Waiting for outstanding commit\n");
wait(commit);
// Stop and wait for the verifier task
committedVersions.sendError(end_of_stream());
debug_printf("Waiting for verification to complete.\n");
wait(verifyTask);
debug_printf("Closing btree\n");
Future<Void> closedFuture = btree->onClosed();
btree->close();
wait(closedFuture);
printf("Reopening btree from disk.\n");
IPager2* pager = new DWALPager(pageSize,
extentSize,
file,
pageCacheBytes,
remapCleanupWindowBytes,
concurrentExtentReads,
false,
keyProvider);
btree = new VersionedBTree(pager, file, encodingType, keyProvider);
wait(btree->init());
Version v = btree->getLastCommittedVersion();
printf("Recovered from disk. Latest recovered version %" PRId64 " highest written version %" PRId64
"\n",
v,
version);
ASSERT(v == version);
// Sometimes do zero-change commit at last version
if (deterministicRandom()->coinflip()) {
wait(btree->commit(version));
}
// Create new promise stream and start the verifier again
committedVersions = PromiseStream<Version>();
verifyTask = verify(btree, committedVersions.getFuture(), &written, &totalRecordsRead, serialTest);
if (!serialTest) {
randomTask = randomReader(btree, &totalRecordsRead) || btree->getError();
}
committedVersions.send(version);
}
version += versionIncrement;
}
}
debug_printf("Waiting for outstanding commit\n");
wait(commit);
committedVersions.sendError(end_of_stream());
randomTask.cancel();
debug_printf("Waiting for verification to complete.\n");
wait(verifyTask);
// Sometimes close and reopen before destructive sanity check
if (deterministicRandom()->coinflip()) {
Future<Void> closedFuture = btree->onClosed();
btree->close();
wait(closedFuture);
btree = new VersionedBTree(new DWALPager(pageSize,
extentSize,
file,
pageCacheBytes,
(BUGGIFY ? 0 : remapCleanupWindowBytes),
concurrentExtentReads,
pagerMemoryOnly,
keyProvider),
file,
encodingType,
keyProvider);
wait(btree->init());
}
wait(btree->clearAllAndCheckSanity());
Future<Void> closedFuture = btree->onClosed();
btree->close();
debug_printf("Closing.\n");
wait(closedFuture);
wait(delay(0));
ASSERT(DWALPager::PageCacheT::Evictor::getEvictor()->empty());
return Void();
}
ACTOR Future<Void> randomSeeks(VersionedBTree* btree,
Optional<Version> v,
bool reInitCursor,
int count,
char firstChar,
char lastChar,
int keyLen) {
state int c = 0;
state double readStart = timer();
state VersionedBTree::BTreeCursor cur;
state Version readVer = v.orDefault(btree->getLastCommittedVersion());
wait(btree->initBTreeCursor(&cur, readVer, PagerEventReasons::PointRead));
while (c < count) {
if (!v.present()) {
readVer = btree->getLastCommittedVersion();
}
if (reInitCursor) {
wait(btree->initBTreeCursor(&cur, readVer, PagerEventReasons::PointRead));
}
state Key k = randomString(keyLen, firstChar, lastChar);
wait(cur.seekGTE(k));
++c;
}
double elapsed = timer() - readStart;
printf("Random seek speed %d/s\n", int(count / elapsed));
return Void();
}
ACTOR Future<Void> randomScans(VersionedBTree* btree,
int count,
int width,
int prefetchBytes,
char firstChar,
char lastChar) {
state Version readVer = btree->getLastCommittedVersion();
state int c = 0;
state double readStart = timer();
state VersionedBTree::BTreeCursor cur;
wait(btree->initBTreeCursor(&cur, readVer, PagerEventReasons::RangeRead));
state int totalScanBytes = 0;
while (c++ < count) {
state Key k = randomString(20, firstChar, lastChar);
wait(cur.seekGTE(k));
state int w = width;
state bool directionFwd = deterministicRandom()->coinflip();
if (prefetchBytes > 0) {
cur.prefetch(directionFwd ? VersionedBTree::dbEnd.key : VersionedBTree::dbBegin.key,
directionFwd,
width,
prefetchBytes);
}
while (w > 0 && cur.isValid()) {
totalScanBytes += cur.get().expectedSize();
wait(success(directionFwd ? cur.moveNext() : cur.movePrev()));
--w;
}
}
double elapsed = timer() - readStart;
fmt::print(
"Completed {0} scans: width={1} totalbytesRead={2} prefetchBytes={3} scansRate={4} scans/s {5:.2f} MB/s\n",
count,
width,
totalScanBytes,
prefetchBytes,
int(count / elapsed),
double(totalScanBytes) / 1e6 / elapsed);
return Void();
}
template <int size>
struct ExtentQueueEntry {
uint8_t entry[size];
bool operator<(const ExtentQueueEntry& rhs) const { return entry < rhs.entry; }
std::string toString() const {
return format("{%s}", StringRef((const uint8_t*)entry, size).toHexString().c_str());
}
};
typedef FIFOQueue<ExtentQueueEntry<16>> ExtentQueueT;
TEST_CASE(":/redwood/performance/extentQueue") {
state ExtentQueueT m_extentQueue;
state ExtentQueueT::QueueState extentQueueState;
state DWALPager* pager;
// If a test file is passed in by environment then don't write new data to it.
state bool reload = getenv("TESTFILE") == nullptr;
state std::string fileName = reload ? "unittest.redwood-v1" : getenv("TESTFILE");
if (reload) {
printf("Deleting old test data\n");
deleteFile(fileName);
}
printf("Filename: %s\n", fileName.c_str());
state int pageSize = params.getInt("pageSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE);
state int extentSize = params.getInt("extentSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_SIZE);
state int64_t cacheSizeBytes = params.getInt("cacheSizeBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K);
// Choose a large remapCleanupWindowBytes to avoid popping the queue
state int64_t remapCleanupWindowBytes = params.getInt("remapCleanupWindowBytes").orDefault(1e16);
state int numEntries = params.getInt("numEntries").orDefault(10e6);
state int concurrentExtentReads =
params.getInt("concurrentExtentReads").orDefault(SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS);
state int targetCommitSize = deterministicRandom()->randomInt(2e6, 30e6);
state int currentCommitSize = 0;
state int64_t cumulativeCommitSize = 0;
printf("pageSize: %d\n", pageSize);
printf("extentSize: %d\n", extentSize);
printf("cacheSizeBytes: %" PRId64 "\n", cacheSizeBytes);
printf("remapCleanupWindowBytes: %" PRId64 "\n", remapCleanupWindowBytes);
// Do random pushes into the queue and commit periodically
if (reload) {
pager = new DWALPager(pageSize,
extentSize,
fileName,
cacheSizeBytes,
remapCleanupWindowBytes,
concurrentExtentReads,
false,
Reference<IPageEncryptionKeyProvider>());
wait(success(pager->init()));
LogicalPageID extID = pager->newLastExtentID();
m_extentQueue.create(pager, extID, "ExtentQueue", pager->newLastQueueID(), true);
pager->pushExtentUsedList(m_extentQueue.queueID, extID);
state int v;
state ExtentQueueEntry<16> e;
deterministicRandom()->randomBytes(e.entry, 16);
state int sinceYield = 0;
for (v = 1; v <= numEntries; ++v) {
// Sometimes do a commit
if (currentCommitSize >= targetCommitSize) {
fmt::print("currentCommitSize: {0}, cumulativeCommitSize: {1}, pageCacheCount: {2}\n",
currentCommitSize,
cumulativeCommitSize,
pager->getPageCacheCount());
wait(m_extentQueue.flush());
wait(pager->commit(pager->getLastCommittedVersion() + 1,
ObjectWriter::toValue(m_extentQueue.getState(), Unversioned())));
cumulativeCommitSize += currentCommitSize;
targetCommitSize = deterministicRandom()->randomInt(2e6, 30e6);
currentCommitSize = 0;
}
// push a random entry into the queue
m_extentQueue.pushBack(e);
currentCommitSize += 16;
// yield periodically to avoid overflowing the stack
if (++sinceYield >= 100) {
sinceYield = 0;
wait(yield());
}
}
cumulativeCommitSize += currentCommitSize;
fmt::print(
"Final cumulativeCommitSize: {0}, pageCacheCount: {1}\n", cumulativeCommitSize, pager->getPageCacheCount());
wait(m_extentQueue.flush());
printf("Commit ExtentQueue getState(): %s\n", m_extentQueue.getState().toString().c_str());
wait(pager->commit(pager->getLastCommittedVersion() + 1,
ObjectWriter::toValue(m_extentQueue.getState(), Unversioned())));
Future<Void> onClosed = pager->onClosed();
pager->close();
wait(onClosed);
}
printf("Reopening pager file from disk.\n");
pager = new DWALPager(pageSize,
extentSize,
fileName,
cacheSizeBytes,
remapCleanupWindowBytes,
concurrentExtentReads,
false,
Reference<IPageEncryptionKeyProvider>());
wait(success(pager->init()));
printf("Starting ExtentQueue FastPath Recovery from Disk.\n");
// reopen the pager from disk
extentQueueState = ObjectReader::fromStringRef<ExtentQueueT::QueueState>(pager->getCommitRecord(), Unversioned());
printf("Recovered ExtentQueue getState(): %s\n", extentQueueState.toString().c_str());
m_extentQueue.recover(pager, extentQueueState, "ExtentQueueRecovered");
state double intervalStart = timer();
state double start = intervalStart;
state Standalone<VectorRef<LogicalPageID>> extentIDs = wait(pager->getUsedExtents(m_extentQueue.queueID));
// fire read requests for all used extents
state int i;
for (i = 1; i < extentIDs.size() - 1; i++) {
LogicalPageID extID = extentIDs[i];
pager->readExtent(extID);
}
state PromiseStream<Standalone<VectorRef<ExtentQueueEntry<16>>>> resultStream;
state Future<Void> queueRecoverActor;
queueRecoverActor = m_extentQueue.peekAllExt(resultStream);
state int entriesRead = 0;
try {
loop choose {
when(Standalone<VectorRef<ExtentQueueEntry<16>>> entries = waitNext(resultStream.getFuture())) {
entriesRead += entries.size();
if (entriesRead == m_extentQueue.numEntries)
break;
}
when(wait(queueRecoverActor)) { queueRecoverActor = Never(); }
}
} catch (Error& e) {
if (e.code() != error_code_end_of_stream) {
throw;
}
}
state double elapsed = timer() - start;
printf("Completed fastpath extent queue recovery: elapsed=%f entriesRead=%d recoveryRate=%f MB/s\n",
elapsed,
entriesRead,
cumulativeCommitSize / elapsed / 1e6);
fmt::print("pageCacheCount: {0} extentCacheCount: {1}\n", pager->getPageCacheCount(), pager->getExtentCacheCount());
pager->extentCacheClear();
m_extentQueue.resetHeadReader();
printf("Starting ExtentQueue SlowPath Recovery from Disk.\n");
intervalStart = timer();
start = intervalStart;
// peekAll the queue using regular slow path
Standalone<VectorRef<ExtentQueueEntry<16>>> entries = wait(m_extentQueue.peekAll());
elapsed = timer() - start;
printf("Completed slowpath extent queue recovery: elapsed=%f entriesRead=%d recoveryRate=%f MB/s\n",
elapsed,
entries.size(),
cumulativeCommitSize / elapsed / 1e6);
return Void();
}
TEST_CASE(":/redwood/performance/set") {
state SignalableActorCollection actors;
state std::string file = params.get("file").orDefault("unittest.redwood-v1");
state int pageSize = params.getInt("pageSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE);
state int extentSize = params.getInt("extentSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_SIZE);
state int64_t pageCacheBytes = params.getInt("pageCacheBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K);
state int nodeCount = params.getInt("nodeCount").orDefault(1e9);
state int maxRecordsPerCommit = params.getInt("maxRecordsPerCommit").orDefault(20000);
state int maxKVBytesPerCommit = params.getInt("maxKVBytesPerCommit").orDefault(20e6);
state int64_t kvBytesTarget = params.getInt("kvBytesTarget").orDefault(4e9);
state int minKeyPrefixBytes = params.getInt("minKeyPrefixBytes").orDefault(25);
state int maxKeyPrefixBytes = params.getInt("maxKeyPrefixBytes").orDefault(25);
state int minValueSize = params.getInt("minValueSize").orDefault(100);
state int maxValueSize = params.getInt("maxValueSize").orDefault(500);
state int minConsecutiveRun = params.getInt("minConsecutiveRun").orDefault(1);
state int maxConsecutiveRun = params.getInt("maxConsecutiveRun").orDefault(100);
state char firstKeyChar = params.get("firstKeyChar").orDefault("a")[0];
state char lastKeyChar = params.get("lastKeyChar").orDefault("m")[0];
state int64_t remapCleanupWindowBytes = params.getInt("remapCleanupWindowBytes").orDefault(100LL * 1024 * 1024);
state int concurrentExtentReads =
params.getInt("concurrentExtentReads").orDefault(SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS);
state bool openExisting = params.getInt("openExisting").orDefault(0);
state bool insertRecords = !openExisting || params.getInt("insertRecords").orDefault(0);
state int concurrentSeeks = params.getInt("concurrentSeeks").orDefault(64);
state int concurrentScans = params.getInt("concurrentScans").orDefault(64);
state int seeks = params.getInt("seeks").orDefault(1000000);
state int scans = params.getInt("scans").orDefault(20000);
state int scanWidth = params.getInt("scanWidth").orDefault(50);
state int scanPrefetchBytes = params.getInt("scanPrefetchBytes").orDefault(0);
state bool pagerMemoryOnly = params.getInt("pagerMemoryOnly").orDefault(0);
state bool traceMetrics = params.getInt("traceMetrics").orDefault(0);
state bool destructiveSanityCheck = params.getInt("destructiveSanityCheck").orDefault(0);
printf("file: %s\n", file.c_str());
printf("openExisting: %d\n", openExisting);
printf("insertRecords: %d\n", insertRecords);
printf("destructiveSanityCheck: %d\n", destructiveSanityCheck);
printf("pagerMemoryOnly: %d\n", pagerMemoryOnly);
printf("pageSize: %d\n", pageSize);
printf("extentSize: %d\n", extentSize);
printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes);
printf("trailingIntegerIndexRange: %d\n", nodeCount);
printf("maxChangesPerCommit: %d\n", maxRecordsPerCommit);
printf("minKeyPrefixBytes: %d\n", minKeyPrefixBytes);
printf("maxKeyPrefixBytes: %d\n", maxKeyPrefixBytes);
printf("minConsecutiveRun: %d\n", minConsecutiveRun);
printf("maxConsecutiveRun: %d\n", maxConsecutiveRun);
printf("minValueSize: %d\n", minValueSize);
printf("maxValueSize: %d\n", maxValueSize);
printf("maxCommitSize: %d\n", maxKVBytesPerCommit);
printf("kvBytesTarget: %" PRId64 "\n", kvBytesTarget);
printf("KeyLexicon '%c' to '%c'\n", firstKeyChar, lastKeyChar);
printf("remapCleanupWindowBytes: %" PRId64 "\n", remapCleanupWindowBytes);
printf("concurrentScans: %d\n", concurrentScans);
printf("concurrentSeeks: %d\n", concurrentSeeks);
printf("seeks: %d\n", seeks);
printf("scans: %d\n", scans);
printf("scanWidth: %d\n", scanWidth);
printf("scanPrefetchBytes: %d\n", scanPrefetchBytes);
// If using stdout for metrics, prevent trace event metrics logger from starting
if (!traceMetrics) {
g_redwoodMetricsActor = Void();
g_redwoodMetrics.clear();
}
if (!openExisting) {
printf("Deleting old test data\n");
deleteFile(file);
}
DWALPager* pager = new DWALPager(pageSize,
extentSize,
file,
pageCacheBytes,
remapCleanupWindowBytes,
concurrentExtentReads,
pagerMemoryOnly,
Reference<IPageEncryptionKeyProvider>());
state VersionedBTree* btree =
new VersionedBTree(pager, file, EncodingType::XXHash64, Reference<IPageEncryptionKeyProvider>());
wait(btree->init());
printf("Initialized. StorageBytes=%s\n", btree->getStorageBytes().toString().c_str());
state int64_t kvBytesThisCommit = 0;
state int64_t kvBytesTotal = 0;
state int recordsThisCommit = 0;
state Future<Void> commit = Void();
state std::string value(maxValueSize, 'v');
printf("Starting.\n");
state double intervalStart = timer();
state double start = intervalStart;
state int sinceYield = 0;
state Version version = btree->getLastCommittedVersion();
if (insertRecords) {
while (kvBytesTotal < kvBytesTarget) {
state int changesThisVersion =
deterministicRandom()->randomInt(0, maxRecordsPerCommit - recordsThisCommit + 1);
while (changesThisVersion > 0 && kvBytesThisCommit < maxKVBytesPerCommit) {
KeyValue kv;
kv.key = randomString(kv.arena(),
deterministicRandom()->randomInt(minKeyPrefixBytes + sizeof(uint32_t),
maxKeyPrefixBytes + sizeof(uint32_t) + 1),
firstKeyChar,
lastKeyChar);
int32_t index = deterministicRandom()->randomInt(0, nodeCount);
int runLength = deterministicRandom()->randomInt(minConsecutiveRun, maxConsecutiveRun + 1);
while (runLength > 0 && changesThisVersion > 0) {
*(uint32_t*)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++);
kv.value = StringRef((uint8_t*)value.data(),
deterministicRandom()->randomInt(minValueSize, maxValueSize + 1));
btree->set(kv);
--runLength;
--changesThisVersion;
kvBytesThisCommit += kv.key.size() + kv.value.size();
++recordsThisCommit;
}
if (++sinceYield >= 100) {
sinceYield = 0;
wait(yield());
}
}
if (kvBytesThisCommit >= maxKVBytesPerCommit || recordsThisCommit >= maxRecordsPerCommit) {
btree->setOldestReadableVersion(btree->getLastCommittedVersion());
wait(commit);
TraceEvent e("RedwoodState");
btree->toTraceEvent(e);
e.log();
printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n",
kvBytesTotal / 1e6,
kvBytesTotal / (timer() - start) / 1e6);
// Avoid capturing via this to freeze counter values
int recs = recordsThisCommit;
int kvb = kvBytesThisCommit;
// Capturing invervalStart via this->intervalStart makes IDE's unhappy as they do not know about the
// actor state object
double* pIntervalStart = &intervalStart;
commit = map(btree->commit(++version), [=](Void result) {
if (!traceMetrics) {
printf("%s\n", g_redwoodMetrics.toString(true).c_str());
}
double elapsed = timer() - *pIntervalStart;
printf("Committed %d keyValueBytes in %d records in %f seconds, %.2f MB/s\n",
kvb,
recs,
elapsed,
kvb / elapsed / 1e6);
*pIntervalStart = timer();
return Void();
});
kvBytesTotal += kvBytesThisCommit;
kvBytesThisCommit = 0;
recordsThisCommit = 0;
}
}
wait(commit);
printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n",
kvBytesTotal / 1e6,
kvBytesTotal / (timer() - start) / 1e6);
printf("StorageBytes=%s\n", btree->getStorageBytes().toString().c_str());
}
state Future<Void> stats =
traceMetrics ? Void()
: recurring([&]() { printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); }, 1.0);
if (scans > 0) {
printf("Parallel scans, concurrency=%d, scans=%d, scanWidth=%d, scanPreftchBytes=%d ...\n",
concurrentScans,
scans,
scanWidth,
scanPrefetchBytes);
for (int x = 0; x < concurrentScans; ++x) {
actors.add(
randomScans(btree, scans / concurrentScans, scanWidth, scanPrefetchBytes, firstKeyChar, lastKeyChar));
}
wait(actors.signalAndReset());
}
if (seeks > 0) {
printf("Parallel seeks, concurrency=%d, seeks=%d ...\n", concurrentSeeks, seeks);
for (int x = 0; x < concurrentSeeks; ++x) {
actors.add(
randomSeeks(btree, Optional<Version>(), true, seeks / concurrentSeeks, firstKeyChar, lastKeyChar, 4));
}
wait(actors.signalAndReset());
}
stats.cancel();
if (destructiveSanityCheck) {
wait(btree->clearAllAndCheckSanity());
}
Future<Void> closedFuture = btree->onClosed();
btree->close();
wait(closedFuture);
wait(delay(0));
ASSERT(DWALPager::PageCacheT::Evictor::getEvictor()->empty());
return Void();
}
struct PrefixSegment {
int length;
int cardinality;
std::string toString() const { return format("{%d bytes, %d choices}", length, cardinality); }
};
// Utility class for generating kv pairs under a prefix pattern
// It currently uses std::string in an abstraction breaking way.
struct KVSource {
KVSource() {}
typedef VectorRef<uint8_t> PrefixRef;
typedef Standalone<PrefixRef> Prefix;
std::vector<PrefixSegment> desc;
std::vector<std::vector<std::string>> segments;
std::vector<Prefix> prefixes;
std::vector<Prefix*> prefixesSorted;
std::string valueData;
int prefixLen;
int lastIndex;
// TODO there is probably a better way to do this
Prefix extraRangePrefix;
KVSource(const std::vector<PrefixSegment>& desc, int numPrefixes = 0) : desc(desc) {
if (numPrefixes == 0) {
numPrefixes = 1;
for (auto& p : desc) {
numPrefixes *= p.cardinality;
}
}
prefixLen = 0;
for (auto& s : desc) {
prefixLen += s.length;
std::vector<std::string> parts;
while (parts.size() < s.cardinality) {
parts.push_back(deterministicRandom()->randomAlphaNumeric(s.length));
}
segments.push_back(std::move(parts));
}
while (prefixes.size() < numPrefixes) {
std::string p;
for (auto& s : segments) {
p.append(s[deterministicRandom()->randomInt(0, s.size())]);
}
prefixes.push_back(PrefixRef((uint8_t*)p.data(), p.size()));
}
for (auto& p : prefixes) {
prefixesSorted.push_back(&p);
}
std::sort(prefixesSorted.begin(), prefixesSorted.end(), [](const Prefix* a, const Prefix* b) {
return KeyRef((uint8_t*)a->begin(), a->size()) < KeyRef((uint8_t*)b->begin(), b->size());
});
valueData = deterministicRandom()->randomAlphaNumeric(100000);
lastIndex = 0;
}
// Expands the chosen prefix in the prefix list to hold suffix,
// fills suffix with random bytes, and returns a reference to the string
KeyRef getKeyRef(int suffixLen) { return makeKey(randomPrefix(), suffixLen); }
// Like getKeyRef but uses the same prefix as the last randomly chosen prefix
KeyRef getAnotherKeyRef(int suffixLen, bool sorted = false) {
Prefix& p = sorted ? *prefixesSorted[lastIndex] : prefixes[lastIndex];
return makeKey(p, suffixLen);
}
// Like getKeyRef but gets a KeyRangeRef. If samePrefix, it returns a range from the same prefix,
// otherwise it returns a random range from the entire keyspace
// Can technically return an empty range with low probability
KeyRangeRef getKeyRangeRef(bool samePrefix, int suffixLen, bool sorted = false) {
KeyRef a, b;
a = getKeyRef(suffixLen);
// Copy a so that b's Prefix Arena allocation doesn't overwrite a if using the same prefix
extraRangePrefix.reserve(extraRangePrefix.arena(), a.size());
a.copyTo((uint8_t*)extraRangePrefix.begin());
a = KeyRef(extraRangePrefix.begin(), a.size());
if (samePrefix) {
b = getAnotherKeyRef(suffixLen, sorted);
} else {
b = getKeyRef(suffixLen);
}
if (a < b) {
return KeyRangeRef(a, b);
} else {
return KeyRangeRef(b, a);
}
}
// TODO unused, remove?
// Like getKeyRef but gets a KeyRangeRef for two keys covering the given number of sorted adjacent prefixes
KeyRangeRef getRangeRef(int prefixesCovered, int suffixLen) {
prefixesCovered = std::min<int>(prefixesCovered, prefixes.size());
int i = deterministicRandom()->randomInt(0, prefixesSorted.size() - prefixesCovered);
Prefix* begin = prefixesSorted[i];
Prefix* end = prefixesSorted[i + prefixesCovered];
return KeyRangeRef(makeKey(*begin, suffixLen), makeKey(*end, suffixLen));
}
KeyRef getValue(int len) { return KeyRef(valueData).substr(0, len); }
// Move lastIndex to the next position, wrapping around to 0
void nextPrefix() {
++lastIndex;
if (lastIndex == prefixes.size()) {
lastIndex = 0;
}
}
Prefix& randomPrefix() {
lastIndex = deterministicRandom()->randomInt(0, prefixes.size());
return prefixes[lastIndex];
}
static KeyRef makeKey(Prefix& p, int suffixLen) {
p.reserve(p.arena(), p.size() + suffixLen);
uint8_t* wptr = p.end();
for (int i = 0; i < suffixLen; ++i) {
*wptr++ = (uint8_t)deterministicRandom()->randomAlphaNumeric();
}
return KeyRef(p.begin(), p.size() + suffixLen);
}
int numPrefixes() const { return prefixes.size(); };
std::string toString() const {
return format("{prefixLen=%d prefixes=%d format=%s}", prefixLen, numPrefixes(), ::toString(desc).c_str());
}
};
ACTOR Future<StorageBytes> getStableStorageBytes(IKeyValueStore* kvs) {
state StorageBytes sb = kvs->getStorageBytes();
// Wait for StorageBytes used metric to stabilize
loop {
wait(kvs->commit());
StorageBytes sb2 = kvs->getStorageBytes();
bool stable = sb2.used == sb.used;
sb = sb2;
if (stable) {
break;
}
}
return sb;
}
ACTOR Future<Void> prefixClusteredInsert(IKeyValueStore* kvs,
int suffixSize,
int valueSize,
KVSource source,
int recordCountTarget,
bool usePrefixesInOrder,
bool clearAfter) {
state int commitTarget = 5e6;
state int recordSize = source.prefixLen + suffixSize + valueSize;
state int64_t kvBytesTarget = (int64_t)recordCountTarget * recordSize;
state int recordsPerPrefix = recordCountTarget / source.numPrefixes();
fmt::print("\nstoreType: {}\n", static_cast<int>(kvs->getType()));
fmt::print("commitTarget: {}\n", commitTarget);
fmt::print("prefixSource: {}\n", source.toString());
fmt::print("usePrefixesInOrder: {}\n", usePrefixesInOrder);
fmt::print("suffixSize: {}\n", suffixSize);
fmt::print("valueSize: {}\n", valueSize);
fmt::print("recordSize: {}\n", recordSize);
fmt::print("recordsPerPrefix: {}\n", recordsPerPrefix);
fmt::print("recordCountTarget: {}\n", recordCountTarget);
fmt::print("kvBytesTarget: {}\n", kvBytesTarget);
state int64_t kvBytes = 0;
state int64_t kvBytesTotal = 0;
state int records = 0;
state Future<Void> commit = Void();
state std::string value = deterministicRandom()->randomAlphaNumeric(1e6);
wait(kvs->init());
state double intervalStart = timer();
state double start = intervalStart;
state std::function<void()> stats = [&]() {
double elapsed = timer() - start;
printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r",
elapsed,
kvBytesTotal / 1e6,
records,
kvBytesTotal / elapsed / 1e6,
records / elapsed);
fflush(stdout);
};
while (kvBytesTotal < kvBytesTarget) {
wait(yield());
state int i;
for (i = 0; i < recordsPerPrefix; ++i) {
KeyValueRef kv(source.getAnotherKeyRef(suffixSize, usePrefixesInOrder), source.getValue(valueSize));
kvs->set(kv);
kvBytes += kv.expectedSize();
++records;
if (kvBytes >= commitTarget) {
wait(commit);
stats();
commit = kvs->commit();
kvBytesTotal += kvBytes;
if (kvBytesTotal >= kvBytesTarget) {
break;
}
kvBytes = 0;
}
}
// Use every prefix, one at a time
source.nextPrefix();
}
wait(commit);
// TODO is it desired that not all records are committed? This could commit again to ensure any records set() since
// the last commit are persisted. For the purposes of how this is used currently, I don't think it matters though
stats();
printf("\n");
intervalStart = timer();
StorageBytes sb = wait(getStableStorageBytes(kvs));
printf("storageBytes: %s (stable after %.2f seconds)\n", toString(sb).c_str(), timer() - intervalStart);
if (clearAfter) {
printf("Clearing all keys\n");
intervalStart = timer();
kvs->clear(KeyRangeRef(""_sr, "\xff"_sr));
state StorageBytes sbClear = wait(getStableStorageBytes(kvs));
printf("Cleared all keys in %.2f seconds, final storageByte: %s\n",
timer() - intervalStart,
toString(sbClear).c_str());
}
return Void();
}
ACTOR Future<Void> sequentialInsert(IKeyValueStore* kvs, int prefixLen, int valueSize, int recordCountTarget) {
state int commitTarget = 5e6;
state KVSource source({ { prefixLen, 1 } });
state int recordSize = source.prefixLen + sizeof(uint64_t) + valueSize;
state int64_t kvBytesTarget = (int64_t)recordCountTarget * recordSize;
fmt::print("\nstoreType: {}\n", static_cast<int>(kvs->getType()));
fmt::print("commitTarget: {}\n", commitTarget);
fmt::print("valueSize: {}\n", valueSize);
fmt::print("recordSize: {}\n", recordSize);
fmt::print("recordCountTarget: {}\n", recordCountTarget);
fmt::print("kvBytesTarget: {}\n", kvBytesTarget);
state int64_t kvBytes = 0;
state int64_t kvBytesTotal = 0;
state int records = 0;
state Future<Void> commit = Void();
state std::string value = deterministicRandom()->randomAlphaNumeric(1e6);
wait(kvs->init());
state double intervalStart = timer();
state double start = intervalStart;
state std::function<void()> stats = [&]() {
double elapsed = timer() - start;
printf("Cumulative stats: %.2f seconds %.2f MB keyValue bytes %d records %.2f MB/s %.2f rec/s\r",
elapsed,
kvBytesTotal / 1e6,
records,
kvBytesTotal / elapsed / 1e6,
records / elapsed);
fflush(stdout);
};
state uint64_t c = 0;
state Key key = source.getKeyRef(sizeof(uint64_t));
while (kvBytesTotal < kvBytesTarget) {
wait(yield());
*(uint64_t*)(key.end() - sizeof(uint64_t)) = bigEndian64(c);
KeyValueRef kv(key, source.getValue(valueSize));
kvs->set(kv);
kvBytes += kv.expectedSize();
++records;
if (kvBytes >= commitTarget) {
wait(commit);
stats();
commit = kvs->commit();
kvBytesTotal += kvBytes;
if (kvBytesTotal >= kvBytesTarget) {
break;
}
kvBytes = 0;
}
++c;
}
wait(commit);
stats();
printf("\n");
return Void();
}
Future<Void> closeKVS(IKeyValueStore* kvs) {
Future<Void> closed = kvs->onClosed();
kvs->close();
return closed;
}
ACTOR Future<Void> doPrefixInsertComparison(int suffixSize,
int valueSize,
int recordCountTarget,
bool usePrefixesInOrder,
KVSource source) {
deleteFile("test.redwood-v1");
wait(delay(5));
state IKeyValueStore* redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood-v1", UID(), 0);
wait(prefixClusteredInsert(redwood, suffixSize, valueSize, source, recordCountTarget, usePrefixesInOrder, true));
wait(closeKVS(redwood));
printf("\n");
deleteFile("test.sqlite");
deleteFile("test.sqlite-wal");
wait(delay(5));
state IKeyValueStore* sqlite = openKVStore(KeyValueStoreType::SSD_BTREE_V2, "test.sqlite", UID(), 0);
wait(prefixClusteredInsert(sqlite, suffixSize, valueSize, source, recordCountTarget, usePrefixesInOrder, true));
wait(closeKVS(sqlite));
printf("\n");
return Void();
}
TEST_CASE(":/redwood/performance/prefixSizeComparison") {
state int suffixSize = params.getInt("suffixSize").orDefault(12);
state int valueSize = params.getInt("valueSize").orDefault(100);
state int recordCountTarget = params.getInt("recordCountTarget").orDefault(100e6);
state bool usePrefixesInOrder = params.getInt("usePrefixesInOrder").orDefault(0);
wait(doPrefixInsertComparison(
suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({ { 10, 100000 } })));
wait(doPrefixInsertComparison(
suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({ { 16, 100000 } })));
wait(doPrefixInsertComparison(
suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({ { 32, 100000 } })));
wait(doPrefixInsertComparison(suffixSize,
valueSize,
recordCountTarget,
usePrefixesInOrder,
KVSource({ { 4, 5 }, { 12, 1000 }, { 8, 5 }, { 8, 4 } })));
return Void();
}
TEST_CASE(":/redwood/performance/sequentialInsert") {
state int prefixLen = params.getInt("prefixLen").orDefault(30);
state int valueSize = params.getInt("valueSize").orDefault(100);
state int recordCountTarget = params.getInt("recordCountTarget").orDefault(100e6);
deleteFile("test.redwood-v1");
wait(delay(5));
state IKeyValueStore* redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood-v1", UID(), 0);
wait(sequentialInsert(redwood, prefixLen, valueSize, recordCountTarget));
wait(closeKVS(redwood));
printf("\n");
return Void();
}
// singlePrefix forces the range read to have the start and end key with the same prefix
ACTOR Future<Void> randomRangeScans(IKeyValueStore* kvs,
int suffixSize,
KVSource source,
int valueSize,
int recordCountTarget,
bool singlePrefix,
int rowLimit,
int byteLimit,
Optional<ReadOptions> options = Optional<ReadOptions>()) {
fmt::print("\nstoreType: {}\n", static_cast<int>(kvs->getType()));
fmt::print("prefixSource: {}\n", source.toString());
fmt::print("suffixSize: {}\n", suffixSize);
fmt::print("recordCountTarget: {}\n", recordCountTarget);
fmt::print("singlePrefix: {}\n", singlePrefix);
fmt::print("rowLimit: {}\n", rowLimit);
state int64_t recordSize = source.prefixLen + suffixSize + valueSize;
state int64_t bytesRead = 0;
state int64_t recordsRead = 0;
state int queries = 0;
state int64_t nextPrintRecords = 1e5;
state double start = timer();
state std::function<void()> stats = [&]() {
double elapsed = timer() - start;
fmt::print("Cumulative stats: {0:.2f} seconds {1} queries {2:.2f} MB {3} records {4:.2f} qps {5:.2f} MB/s "
"{6:.2f} rec/s\r\n",
elapsed,
queries,
bytesRead / 1e6,
recordsRead,
queries / elapsed,
bytesRead / elapsed / 1e6,
recordsRead / elapsed);
fflush(stdout);
};
while (recordsRead < recordCountTarget) {
KeyRangeRef range = source.getKeyRangeRef(singlePrefix, suffixSize);
int rowLim = (deterministicRandom()->randomInt(0, 2) != 0) ? rowLimit : -rowLimit;
RangeResult result = wait(kvs->readRange(range, rowLim, byteLimit, options));
recordsRead += result.size();
bytesRead += result.size() * recordSize;
++queries;
// log stats with exponential backoff
if (recordsRead >= nextPrintRecords) {
stats();
nextPrintRecords *= 2;
}
}
stats();
printf("\n");
return Void();
}
TEST_CASE(":/redwood/performance/randomRangeScans") {
state int prefixLen = 30;
state int suffixSize = 12;
state int valueSize = 100;
state int maxByteLimit = std::numeric_limits<int>::max();
// TODO change to 100e8 after figuring out no-disk redwood mode
state int writeRecordCountTarget = 1e6;
state int queryRecordTarget = 1e7;
state int writePrefixesInOrder = false;
state KVSource source({ { prefixLen, 1000 } });
deleteFile("test.redwood-v1");
wait(delay(5));
state IKeyValueStore* redwood = openKVStore(KeyValueStoreType::SSD_REDWOOD_V1, "test.redwood-v1", UID(), 0);
wait(prefixClusteredInsert(
redwood, suffixSize, valueSize, source, writeRecordCountTarget, writePrefixesInOrder, false));
// divide targets for tiny queries by 10 because they are much slower
wait(randomRangeScans(redwood, suffixSize, source, valueSize, queryRecordTarget / 10, true, 10, maxByteLimit));
wait(randomRangeScans(redwood, suffixSize, source, valueSize, queryRecordTarget, true, 1000, maxByteLimit));
wait(randomRangeScans(redwood, suffixSize, source, valueSize, queryRecordTarget / 10, false, 100, maxByteLimit));
wait(randomRangeScans(redwood, suffixSize, source, valueSize, queryRecordTarget, false, 10000, maxByteLimit));
wait(randomRangeScans(redwood, suffixSize, source, valueSize, queryRecordTarget, false, 1000000, maxByteLimit));
wait(closeKVS(redwood));
printf("\n");
return Void();
}
TEST_CASE(":/redwood/performance/histograms") {
// Time needed to log 33 histograms.
std::vector<Reference<Histogram>> histograms;
for (int i = 0; i < 33; i++) {
std::string levelString = "L" + std::to_string(i);
histograms.push_back(Histogram::getHistogram("histogramTest"_sr, "levelString"_sr, Histogram::Unit::bytes));
}
for (int i = 0; i < 33; i++) {
for (int j = 0; j < 32; j++) {
histograms[i]->sample(std::pow(2, j));
}
}
auto t_start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 33; i++) {
histograms[i]->writeToLog(30.0);
}
auto t_end = std::chrono::high_resolution_clock::now();
double elapsed_time_ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();
std::cout << "Time needed to log 33 histograms (millisecond): " << elapsed_time_ms << std::endl;
return Void();
}
ACTOR Future<Void> waitLockIncrement(PriorityMultiLock* pml, int priority, int* pout) {
state PriorityMultiLock::Lock lock = wait(pml->lock(priority));
wait(delay(deterministicRandom()->random01() * .1));
++*pout;
return Void();
}
TEST_CASE("/redwood/PriorityMultiLock") {
state std::vector<int> priorities = { 10, 20, 40 };
state int concurrency = 25;
state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);
state std::vector<int> counts;
counts.resize(priorities.size(), 0);
// Clog the lock buy taking concurrency locks at each level
state std::vector<Future<PriorityMultiLock::Lock>> lockFutures;
for (int i = 0; i < priorities.size(); ++i) {
for (int j = 0; j < concurrency; ++j) {
lockFutures.push_back(pml->lock(i));
}
}
// Wait for n = concurrency locks to be acquired
wait(quorum(lockFutures, concurrency));
state std::vector<Future<Void>> futures;
for (int i = 0; i < 10e3; ++i) {
int p = i % priorities.size();
futures.push_back(waitLockIncrement(pml, p, &counts[p]));
}
state Future<Void> f = waitForAll(futures);
// Release the locks
lockFutures.clear();
// Print stats and wait for all futures to be ready
loop {
choose {
when(wait(delay(1))) {
printf("counts: ");
for (auto c : counts) {
printf("%d ", c);
}
printf(" pml: %s\n", pml->toString().c_str());
}
when(wait(f)) { break; }
}
}
delete pml;
return Void();
}