diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index 24340a7ca2..5e51a1c6ca 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -619,23 +619,23 @@ FDBFuture* fdb_transaction_get_range_impl(FDBTransaction* tr, .extractPtr()); } -FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr, - uint8_t const* begin_key_name, - int begin_key_name_length, - fdb_bool_t begin_or_equal, - int begin_offset, - uint8_t const* end_key_name, - int end_key_name_length, - fdb_bool_t end_or_equal, - int end_offset, - uint8_t const* mapper_name, - int mapper_name_length, - int limit, - int target_bytes, - FDBStreamingMode mode, - int iteration, - fdb_bool_t snapshot, - fdb_bool_t reverse) { +extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_mapped_range(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + fdb_bool_t begin_or_equal, + int begin_offset, + uint8_t const* end_key_name, + int end_key_name_length, + fdb_bool_t end_or_equal, + int end_offset, + uint8_t const* mapper_name, + int mapper_name_length, + int limit, + int target_bytes, + FDBStreamingMode mode, + int iteration, + fdb_bool_t snapshot, + fdb_bool_t reverse) { FDBFuture* r = validate_and_update_parameters(limit, target_bytes, mode, iteration, reverse); if (r != nullptr) return r; @@ -651,25 +651,24 @@ FDBFuture* fdb_transaction_get_mapped_range_impl(FDBTransaction* tr, .extractPtr()); } -// TODO: Support FDB_API_ADDED in generate_asm.py and then this can be replaced with fdb_api_ptr_unimpl. -FDBFuture* fdb_transaction_get_mapped_range_v699(FDBTransaction* tr, - uint8_t const* begin_key_name, - int begin_key_name_length, - fdb_bool_t begin_or_equal, - int begin_offset, - uint8_t const* end_key_name, - int end_key_name_length, - fdb_bool_t end_or_equal, - int end_offset, - uint8_t const* mapper_name, - int mapper_name_length, - int limit, - int target_bytes, - FDBStreamingMode mode, - int iteration, - fdb_bool_t snapshot, - fdb_bool_t reverse) { - fprintf(stderr, "UNIMPLEMENTED FDB API FUNCTION\n"); +FDBFuture* fdb_transaction_get_range_and_flat_map_v709(FDBTransaction* tr, + uint8_t const* begin_key_name, + int begin_key_name_length, + fdb_bool_t begin_or_equal, + int begin_offset, + uint8_t const* end_key_name, + int end_key_name_length, + fdb_bool_t end_or_equal, + int end_offset, + uint8_t const* mapper_name, + int mapper_name_length, + int limit, + int target_bytes, + FDBStreamingMode mode, + int iteration, + fdb_bool_t snapshot, + fdb_bool_t reverse) { + fprintf(stderr, "GetRangeAndFlatMap is removed from 7.0. Please upgrade to 7.1 and use GetMappedRange\n"); abort(); } @@ -900,13 +899,13 @@ extern "C" DLLEXPORT fdb_error_t fdb_select_api_version_impl(int runtime_version // Versioned API changes -- descending order by version (new changes at top) // FDB_API_CHANGED( function, ver ) means there is a new implementation as of ver, and a function function_(ver-1) - // is the old implementation FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and + // is the old implementation. FDB_API_REMOVED( function, ver ) means the function was removed as of ver, and // function_(ver-1) is the old implementation // // WARNING: use caution when implementing removed functions by calling public API functions. This can lead to // undesired behavior when using the multi-version API. Instead, it is better to have both the removed and public // functions call an internal implementation function. See fdb_create_database_impl for an example. 
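To make the warning above concrete, here is a minimal sketch of the recommended shape, using hypothetical names (fdb_do_thing* does not exist in fdb_c; fdb_create_database_impl is the real in-tree example):

// Internal implementation; never exported, so multi-version routing cannot
// accidentally re-enter a public symbol.
static fdb_error_t fdb_do_thing_impl(int arg) {
    return 0;
}

// The current public API function forwards to the impl.
extern "C" DLLEXPORT fdb_error_t fdb_do_thing(int arg) {
    return fdb_do_thing_impl(arg);
}

// Old entry point kept for API versions <= 6.2 (selected via FDB_API_CHANGED);
// it also forwards to the impl rather than calling fdb_do_thing().
fdb_error_t fdb_do_thing_v620(int arg) {
    return fdb_do_thing_impl(arg);
}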
-	FDB_API_CHANGED(fdb_transaction_get_mapped_range, 700);
+	FDB_API_REMOVED(fdb_transaction_get_range_and_flat_map, 710);
 	FDB_API_REMOVED(fdb_future_get_version, 620);
 	FDB_API_REMOVED(fdb_create_cluster, 610);
 	FDB_API_REMOVED(fdb_cluster_create_database, 610);
diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst
index ce3be8b68f..823c0308b5 100644
--- a/documentation/sphinx/source/release-notes/release-notes-700.rst
+++ b/documentation/sphinx/source/release-notes/release-notes-700.rst
@@ -28,7 +28,6 @@ Features
 * Improved the efficiency with which storage servers replicate data between themselves. `(PR #5017) `_
 * Added support to ``exclude command`` to exclude based on locality match. `(PR #5113) `_
 * Add the ``trace_partial_file_suffix`` network option. This option will give unfinished trace files a special suffix to indicate they're not complete yet. When the trace file is complete, it is renamed to remove the suffix. `(PR #5328) `_
-* Added "get range and flat map" feature with new APIs (see Bindings section). Storage servers are able to generate the keys in the queries based on another query. With this, upper layer can push some computations down to FDB, to improve latency and bandwidth when read. `(PR #5609) `_
 
 Performance
 -----------
@@ -85,8 +84,6 @@ Bindings
 * C: Added a function, ``fdb_database_create_snapshot``, to create a snapshot of the database. `(PR #4241) `_
 * C: Added ``fdb_database_get_main_thread_busyness`` function to report how busy a client's main thread is. `(PR #4504) `_
 * Java: Added ``Database.getMainThreadBusyness`` function to report how busy a client's main thread is. `(PR #4564) `_
-* C: Added ``fdb_transaction_get_range_and_flat_map`` function to support running queries based on another query in one request. `(PR #5609) `_
-* Java: Added ``Transaction.getRangeAndFlatMap`` function to support running queries based on another query in one request. `(PR #5609) `_
 
 Other Changes
 -------------
diff --git a/documentation/sphinx/source/release-notes/release-notes-710.rst b/documentation/sphinx/source/release-notes/release-notes-710.rst
index 9eb59ab541..78412c15bb 100644
--- a/documentation/sphinx/source/release-notes/release-notes-710.rst
+++ b/documentation/sphinx/source/release-notes/release-notes-710.rst
@@ -10,6 +10,7 @@ Release Notes
 Features
 --------
 * Added ``USE_GRV_CACHE`` transaction option to allow read versions to be locally cached on the client side for latency optimizations. `(PR #5725) `_ `(PR #6664) `_
+* Added the "get range and flat map" feature with new APIs (see Bindings section). Storage servers can generate the keys of one query from the results of another, letting upper layers push some computation down to FDB to improve read latency and bandwidth. `(PR #5609) `_, `(PR #6181) `_, etc.
 
 Performance
 -----------
@@ -25,6 +26,8 @@ Status
 Bindings
 --------
+* C: Added ``fdb_transaction_get_range_and_flat_map`` function to support running queries based on another query in one request. `(PR #5609) `_
+* Java: Added ``Transaction.getRangeAndFlatMap`` function to support running queries based on another query in one request.
`(PR #5609) `_ Other Changes ------------- diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index ef0603aeaf..1a00bbe071 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -614,7 +614,7 @@ void DLApi::init() { headerVersion >= 0); loadClientFunction(&api->transactionGetRange, lib, fdbCPath, "fdb_transaction_get_range", headerVersion >= 0); loadClientFunction( - &api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 700); + &api->transactionGetMappedRange, lib, fdbCPath, "fdb_transaction_get_mapped_range", headerVersion >= 710); loadClientFunction( &api->transactionGetVersionstamp, lib, fdbCPath, "fdb_transaction_get_versionstamp", headerVersion >= 410); loadClientFunction(&api->transactionSet, lib, fdbCPath, "fdb_transaction_set", headerVersion >= 0); @@ -672,7 +672,7 @@ void DLApi::init() { loadClientFunction( &api->futureGetKeyValueArray, lib, fdbCPath, "fdb_future_get_keyvalue_array", headerVersion >= 0); loadClientFunction( - &api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 700); + &api->futureGetMappedKeyValueArray, lib, fdbCPath, "fdb_future_get_mappedkeyvalue_array", headerVersion >= 710); loadClientFunction(&api->futureGetSharedState, lib, fdbCPath, "fdb_future_get_shared_state", headerVersion >= 710); loadClientFunction(&api->futureSetCallback, lib, fdbCPath, "fdb_future_set_callback", headerVersion >= 0); loadClientFunction(&api->futureCancel, lib, fdbCPath, "fdb_future_cancel", headerVersion >= 0); diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index b31825f24c..935beb232f 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -835,6 +835,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_METRICS_INTERVAL, 5.0 ); init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } + init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); } // Server request latency measurement init( LATENCY_SAMPLE_SIZE, 100000 ); diff --git a/fdbclient/ServerKnobs.h b/fdbclient/ServerKnobs.h index 082a737a07..aec8494802 100644 --- a/fdbclient/ServerKnobs.h +++ b/fdbclient/ServerKnobs.h @@ -798,6 +798,7 @@ public: double REDWOOD_METRICS_INTERVAL; double REDWOOD_HISTOGRAM_INTERVAL; bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache. 
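The ServerKnobs.cpp hunk above registers REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT with a production default of 2 and a BUGGIFY-randomized value in simulation; the header declaration follows below. As a hypothetical consumption sketch (shouldReuseDecodeCache is illustrative and not part of this patch, though SERVER_KNOBS is the real accessor):

// Reuse a page's DecodeCache only for nodes high enough in the BTree that the
// cache is likely to be hit again; caches for low nodes are cheap to rebuild.
bool shouldReuseDecodeCache(int nodeHeight) {
    return nodeHeight >= SERVER_KNOBS->REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT;
}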
+ int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches // Server request latency measurement int LATENCY_SAMPLE_SIZE; diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index e1bb0edfd0..a7c73f0d78 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1177,15 +1177,16 @@ public: struct Cursor { Cursor() : cache(nullptr), nodeIndex(-1) {} - Cursor(DecodeCache* cache, DeltaTree2* tree) : tree(tree), cache(cache), nodeIndex(-1) {} + Cursor(Reference cache, DeltaTree2* tree) : tree(tree), cache(cache), nodeIndex(-1) {} - Cursor(DecodeCache* cache, DeltaTree2* tree, int nodeIndex) : tree(tree), cache(cache), nodeIndex(nodeIndex) {} + Cursor(Reference cache, DeltaTree2* tree, int nodeIndex) + : tree(tree), cache(cache), nodeIndex(nodeIndex) {} // Copy constructor does not copy item because normally a copied cursor will be immediately moved. Cursor(const Cursor& c) : tree(c.tree), cache(c.cache), nodeIndex(c.nodeIndex) {} ~Cursor() { - if (cache != nullptr) { + if (cache.isValid()) { cache->updateUsedMemory(); } } @@ -1212,7 +1213,7 @@ public: } DeltaTree2* tree; - DecodeCache* cache; + Reference cache; int nodeIndex; mutable Optional item; @@ -1274,6 +1275,7 @@ public: return item.get(); } + // Switch the cursor to point to a new DeltaTree void switchTree(DeltaTree2* newTree) { tree = newTree; // Reset item because it may point into tree memory @@ -1709,7 +1711,13 @@ public: } else { nodeBytesUsed = 0; } + + ASSERT(size() <= spaceAvailable); nodeBytesFree = spaceAvailable - size(); + + // Zero unused available space + memset((uint8_t*)this + size(), 0, nodeBytesFree); + return size(); } @@ -1782,8 +1790,15 @@ private: node.setLeftChildOffset(largeNodes, leftChildOffset); node.setRightChildOffset(largeNodes, rightChildOffset); - deltatree_printf("%p: Serialized %s as %s\n", this, item.toString().c_str(), node.toString(this).c_str()); + int written = wptr - (uint8_t*)&node; + deltatree_printf("Built subtree tree=%p subtreeRoot=%p written=%d end=%p serialized subtreeRoot %s as %s \n", + this, + &node, + written, + (uint8_t*)&node + written, + item.toString().c_str(), + node.toString(this).c_str()); - return wptr - (uint8_t*)&node; + return written; } }; diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 0625650086..fdfc3c364d 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -20,22 +20,24 @@ #ifndef FDBSERVER_IPAGER_H #define FDBSERVER_IPAGER_H +#include "flow/Error.h" +#include "flow/FastAlloc.h" +#include "flow/ProtocolVersion.h" +#include +#include #pragma once #include "fdbserver/IKeyValueStore.h" #include "flow/flow.h" #include "fdbclient/FDBTypes.h" +#define XXH_INLINE_ALL #include "flow/xxhash.h" -#ifndef VALGRIND -#define VALGRIND_MAKE_MEM_UNDEFINED(x, y) -#define VALGRIND_MAKE_MEM_DEFINED(x, y) -#endif - typedef uint32_t LogicalPageID; typedef uint32_t PhysicalPageID; #define invalidLogicalPageID std::numeric_limits::max() +#define invalidPhysicalPageID std::numeric_limits::max() typedef uint32_t QueueID; #define invalidQueueID std::numeric_limits::max() @@ -76,90 +78,509 @@ static const std::vector> L0PossibleEv { PagerEvents::PageWrite, PagerEventReasons::MetaData }, }; -// Represents a block of memory in a 4096-byte aligned location held by an Arena. 
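Both the DeltaTree2 build path above and the ArenaPage constructor below now zero their slack space. A standalone sketch of the pattern (finishNode is illustrative, not a function in this patch):

#include <cassert>
#include <cstdint>
#include <cstring>

// After serializing `used` bytes into a fixed-size buffer, clear the remaining
// slack so every byte is deterministic; this matters once whole pages are
// checksummed (and possibly encrypted) before being written to disk.
int finishNode(uint8_t* buffer, int spaceAvailable, int used) {
    assert(used <= spaceAvailable);
    std::memset(buffer + used, 0, spaceAvailable - used);
    return used;
}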
+enum EncodingType : uint8_t {
+	XXHash64 = 0,
+	// For testing purposes
+	XOREncryption = 1
+};
+
+enum PageType : uint8_t {
+	HeaderPage = 0,
+	BackupHeaderPage = 1,
+	BTreeNode = 2,
+	BTreeSuperNode = 3,
+	QueuePageStandalone = 4,
+	QueuePageInExtent = 5
+};
+
+// Encryption key ID
+typedef uint64_t KeyID;
+
+// EncryptionKeyRef is somewhat multi-variant: it contains members representing the union
+// of all fields relevant to any implemented encryption scheme. They are generally of
+// the form
+//   Page Fields - fields which come from or are stored in the Page
+//   Secret Fields - fields which are only known by the Key Provider
+// but it is up to each encoding and provider which fields are which and which ones are used.
+struct EncryptionKeyRef {
+
+	EncryptionKeyRef(){};
+	EncryptionKeyRef(Arena& arena, const EncryptionKeyRef& toCopy) : secret(arena, toCopy.secret), id(toCopy.id) {}
+	int expectedSize() const { return secret.size(); }
+
+	StringRef secret;
+	Optional<KeyID> id;
+};
+typedef Standalone<EncryptionKeyRef> EncryptionKey;
+
+// Interface used by pager to get encryption keys by ID when reading pages from disk
+// and by the BTree to get encryption keys to use for new pages
+class IEncryptionKeyProvider {
+public:
+	virtual ~IEncryptionKeyProvider() {}
+
+	// Get an EncryptionKey with Secret Fields populated based on the given Page Fields.
+	// It is up to the implementation which fields those are.
+	// The output Page Fields must match the input Page Fields.
+	virtual Future<EncryptionKey> getSecrets(const EncryptionKeyRef& key) = 0;
+
+	// Get encryption key that should be used for a given user Key-Value range
+	virtual Future<EncryptionKey> getByRange(const KeyRef& begin, const KeyRef& end) = 0;
+};
+
+// This is a hacky way to attach an additional object of an arbitrary type at runtime to another object.
+// It stores an arbitrary void pointer and a void pointer function to call when the ArbitraryObject
+// is destroyed.
+// It has helper operator= methods for storing heap-allocated T's or Reference<T>'s into it via
+//   x = thing;
+// Examples:
+//   ArbitraryObject x;
+//   x = new Widget();                           // x owns the new object
+//   x = Reference<SomeClass>(new SomeClass());  // x holds a reference now too
+//   x = makeReference<SomeReferenceCountedType>();
+struct ArbitraryObject {
+	ArbitraryObject() : ptr(nullptr), onDestruct(nullptr) {}
+	ArbitraryObject(const ArbitraryObject&) = delete;
+
+	~ArbitraryObject() { destructOnly(); }
+
+	bool valid() const { return ptr != nullptr; }
+
+	template <typename T>
+	void operator=(T* p) {
+		destructOnly();
+		ptr = p;
+		onDestruct = [](void* ptr) { delete (T*)ptr; };
+	}
+
+	template <typename T>
+	void operator=(Reference<T>& r) {
+		destructOnly();
+		ptr = r.getPtr();
+		r.getPtr()->addref();
+		onDestruct = [](void* ptr) { ((T*)ptr)->delref(); };
+	}
+
+	template <typename T>
+	void operator=(Reference<T>&& r) {
+		destructOnly();
+		ptr = r.extractPtr();
+		onDestruct = [](void* ptr) { ((T*)ptr)->delref(); };
+	}
+
+	template <typename T>
+	T* getPtr() {
+		return (T*)ptr;
+	}
+
+	template <typename T>
+	Reference<T> getReference() {
+		return Reference<T>::addRef((T*)ptr);
+	}
+
+	void reset() {
+		destructOnly();
+		ptr = nullptr;
+		onDestruct = nullptr;
+	}
+
+	// ptr can be set to any arbitrary thing. If it is not null at destruct time then
+	// onDestruct(ptr) will be called if onDestruct is not null.
+	void* ptr = nullptr;
+	void (*onDestruct)(void*) = nullptr;
+
+private:
+	// Call onDestruct(ptr) if needed but don't reset any state
+	void destructOnly() {
+		if (ptr != nullptr && onDestruct != nullptr) {
+			onDestruct(ptr);
+		}
+	}
+};
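A quick usage sketch of ArbitraryObject, assuming flow's Reference/ReferenceCounted from flow/FastRef.h; Widget is a hypothetical type, not part of this patch:

// Widget is illustrative only.
struct Widget : ReferenceCounted<Widget> {
    int value = 0;
};

void arbitraryObjectExample() {
    ArbitraryObject x;
    x = new Widget(); // x owns the heap allocation and deletes it on replacement or destruct
    x = Reference<Widget>(new Widget()); // the first Widget is deleted; a reference is now held
    x.getPtr<Widget>()->value = 42;
    x.reset(); // drops the reference immediately
}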
+
+// ArenaPage represents a data page meant to be stored on disk, located in a block of
+// 4k-aligned memory held by an Arena
+//
+// Page Format:
+//    PageHeader - describes main header version, encoding type, and offsets of subheaders and payload.
+//    MainHeader - structure based on header version. It is responsible for protecting all bytes
+//                 of PageHeader, MainHeader, and EncodingHeader with some sort of checksum.
+//    EncodingHeader - structure based on encoding type. It is responsible for protecting and
+//                     possibly encrypting all payload bytes.
+//    Payload - User accessible bytes, protected and possibly encrypted based on the encoding
+//
+// preWrite() must be called before writing a page to disk to update checksums and encrypt as needed
+// After reading a page from disk,
+//   postReadHeader() must be called to verify the version, main, and encoding headers
+//   postReadPayload() must be called, after potentially setting the encryption secret, to verify and possibly
+//   decrypt the payload
 class ArenaPage : public ReferenceCounted<ArenaPage>, public FastAllocated<ArenaPage> {
 public:
-	// The page's logical size includes an opaque checksum, use size() to get usable size
-	ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), userData(nullptr) {
+	// This is the header version that new page init() calls will use.
+	// It is not necessarily the latest header version, as read/modify support for
+	// a new header version may be added prior to using that version as the default
+	// for new pages as part of downgrade support.
+	static constexpr uint8_t HEADER_WRITE_VERSION = 1;
+
+	ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), pPayload(nullptr) {
 		if (bufferSize > 0) {
 			buffer = (uint8_t*)arena.allocate4kAlignedBuffer(bufferSize);
-			// Mark any unused page portion defined
-			VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize);
+			// Zero unused region
+			memset(buffer + logicalSize, 0, bufferSize - logicalSize);
 		} else {
 			buffer = nullptr;
 		}
 	};
-	~ArenaPage() {
-		if (userData != nullptr && userDataDestructor != nullptr) {
-			userDataDestructor(userData);
+	~ArenaPage() {}
+
+	// Before using these, either init() or postReadHeader() and postReadPayload() must be called
+	const uint8_t* data() const { return pPayload; }
+	uint8_t* mutateData() const { return (uint8_t*)pPayload; }
+	int dataSize() const { return payloadSize; }
+
+	StringRef dataAsStringRef() const { return StringRef((uint8_t*)pPayload, payloadSize); }
+
+	const uint8_t* rawData() const { return buffer; }
+	uint8_t* rawData() { return buffer; }
+	int rawSize() const { return bufferSize; }
+
+#pragma pack(push, 1)
+
+	// The next few structs describe the byte-packed physical structure. The fields of Page
+	// cannot change, but new header versions and encoding types can be added and existing
+	// header versions and encoding type headers could change size as offset information
+	// is stored to enable efficient jumping to the encoding header or payload.
+	// Page members are only initialized in init()
+	struct PageHeader {
+		uint8_t headerVersion;
+		EncodingType encodingType;
+
+		// Encoding header comes after main header
+		uint8_t encodingHeaderOffset;
+
+		// Payload comes after encoding header
+		uint8_t payloadOffset;
+
+		// Get main header pointer, casting to its type
+		template <typename T>
+		T* getMainHeader() const {
+			return (T*)(this + 1);
+		}
+
+		// Get encoding header pointer, casting to its type
+		template <typename T>
+		T* getEncodingHeader() const {
+			return (T*)((uint8_t*)this + encodingHeaderOffset);
+		}
+
+		// Get payload pointer
+		uint8_t* getPayload() const { return (uint8_t*)this + payloadOffset; }
+	};
+
+	// Redwood header version 1
+	// Protects all headers with a 64-bit XXHash checksum
+	// Most other fields are forensic in nature and are not required to be set for correct
+	// behavior, but they can facilitate forensic investigation of data on disk. Some of them
+	// could be used for sanity checks at runtime.
+	struct RedwoodHeaderV1 {
+		PageType pageType;
+		// The meaning of pageSubType is based on pageType
+		//   For Queue pages, pageSubType is the QueueID
+		//   For BTree nodes, pageSubType is Height (also stored in BTreeNode)
+		uint8_t pageSubType;
+		// Format identifier, normally specific to the page Type and SubType
+		uint8_t pageFormat;
+		XXH64_hash_t checksum;
+
+		// Physical page ID of first block on disk of the ArenaPage
+		PhysicalPageID firstPhysicalPageID;
+		// The first logical page ID the ArenaPage was referenced by when last written
+		LogicalPageID lastKnownLogicalPageID;
+		// The first logical page ID of the parent of this ArenaPage when last written
+		LogicalPageID lastKnownParentLogicalPageID;
+
+		// Time and write version as of the last update to this page.
+		// Note that for relocated pages, writeVersion should not be updated.
+		double writeTime;
+		Version writeVersion;
+
+		// Update checksum
+		void updateChecksum(uint8_t* headerBytes, int len) {
+			// Checksum is within the checksum input so clear it first
+			checksum = 0;
+			checksum = XXH3_64bits(headerBytes, len);
+		}
+
+		// Verify checksum
+		void verifyChecksum(uint8_t* headerBytes, int len) {
+			// Checksum is within the checksum input so save it and restore it afterwards
+			XXH64_hash_t saved = checksum;
+			checksum = 0;
+			XXH64_hash_t calculated = XXH3_64bits(headerBytes, len);
+			checksum = saved;
+
+			if (saved != calculated) {
+				throw page_header_checksum_failed();
+			}
+		}
+	};
+
+	// An encoding that validates the payload with an XXHash checksum
+	struct XXHashEncodingHeader {
+		XXH64_hash_t checksum;
+		void encode(uint8_t* payload, int len, PhysicalPageID seed) {
+			checksum = XXH3_64bits_withSeed(payload, len, seed);
+		}
+		void decode(uint8_t* payload, int len, PhysicalPageID seed) {
+			if (checksum != XXH3_64bits_withSeed(payload, len, seed)) {
+				throw page_decoding_failed();
+			}
+		}
+	};
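The seeded checksum above is worth a standalone illustration (verifyPayload is illustrative, not part of this patch): seeding the hash with the physical page ID means a page whose bytes are intact but which was read back from the wrong disk location still fails verification.

#define XXH_INLINE_ALL
#include "flow/xxhash.h"
#include <cstdint>

// Returns true only if both the payload bytes and the location they were
// read from match what was written.
bool verifyPayload(const uint8_t* payload, int len, uint32_t physicalPageID, XXH64_hash_t stored) {
    return stored == XXH3_64bits_withSeed(payload, len, physicalPageID);
}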
+
+	// A dummy "encrypting" encoding which uses XOR with a 1 byte secret key on
+	// the payload to obfuscate it and protects the payload with an XXHash checksum.
+	struct XOREncryptionEncodingHeader {
+		// Checksum is on unencrypted payload
+		XXH64_hash_t checksum;
+		uint8_t keyID;
+
+		void encode(uint8_t secret, uint8_t* payload, int len, PhysicalPageID seed) {
+			checksum = XXH3_64bits_withSeed(payload, len, seed);
+			for (int i = 0; i < len; ++i) {
+				payload[i] ^= secret;
+			}
+		}
+		void decode(uint8_t secret, uint8_t* payload, int len, PhysicalPageID seed) {
+			for (int i = 0; i < len; ++i) {
+				payload[i] ^= secret;
+			}
+			if (checksum != XXH3_64bits_withSeed(payload, len, seed)) {
+				throw page_decoding_failed();
+			}
+		}
+	};
+#pragma pack(pop)
+
+	// Get the size of the encoding header based on type
+	// Note that this is only to be used in operations involving new pages to calculate the payload offset. For
+	// existing pages, the payload offset is stored in the page.
+	static int encodingHeaderSize(EncodingType t) {
+		if (t == EncodingType::XXHash64) {
+			return sizeof(XXHashEncodingHeader);
+		} else if (t == EncodingType::XOREncryption) {
+			return sizeof(XOREncryptionEncodingHeader);
+		} else {
+			throw page_encoding_not_supported();
 		}
 	}
 
-	uint8_t const* begin() const { return (uint8_t*)buffer; }
+	// Get the usable size for a new page of pageSize using HEADER_WRITE_VERSION with encoding type t
+	static int getUsableSize(int pageSize, EncodingType t) {
+		return pageSize - sizeof(PageHeader) - sizeof(RedwoodHeaderV1) - encodingHeaderSize(t);
+	}
 
-	uint8_t* mutate() { return (uint8_t*)buffer; }
+	// Initialize the header for a new page so that the payload can be written to
+	// Pre:   Buffer is allocated and logical size is set
+	// Post:  Page header is initialized and space is reserved for subheaders for
+	//        HEADER_WRITE_VERSION main header and the given encoding type.
+	//        Payload can be written to with mutateData() and dataSize()
+	void init(EncodingType t, PageType pageType, uint8_t pageSubType, uint8_t pageFormat = 0) {
+		// Carefully cast away constness to modify page header
+		PageHeader* p = const_cast<PageHeader*>(page);
+		p->headerVersion = HEADER_WRITE_VERSION;
+		p->encodingHeaderOffset = sizeof(PageHeader) + sizeof(RedwoodHeaderV1);
+		p->encodingType = t;
+		p->payloadOffset = page->encodingHeaderOffset + encodingHeaderSize(t);
 
-	typedef XXH64_hash_t Checksum;
+		pPayload = page->getPayload();
+		payloadSize = logicalSize - (pPayload - buffer);
 
-	// Usable size, without checksum
-	int size() const { return logicalSize - sizeof(Checksum); }
+		RedwoodHeaderV1* h = page->getMainHeader<RedwoodHeaderV1>();
+		h->pageType = pageType;
+		h->pageSubType = pageSubType;
+		h->pageFormat = pageFormat;
 
-	Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(begin(), size()), arena); }
+		// Write dummy values for these in new pages. They should be updated when possible before calling preWrite()
+		// when modifying existing pages
+		h->lastKnownLogicalPageID = invalidLogicalPageID;
+		h->lastKnownParentLogicalPageID = invalidLogicalPageID;
+		h->writeVersion = invalidVersion;
+	}
 
-	// Get an ArenaPage which is a copy of this page, in its own Arena
-	Reference<ArenaPage> cloneContents() const {
+	// Get the logical page buffer as a StringRef
+	Standalone<StringRef> asStringRef() const { return Standalone<StringRef>(StringRef(buffer, logicalSize)); }
+
+	// Get a new ArenaPage that contains a copy of this page's data.
+ // extra is not copied to the returned page + Reference clone() const { ArenaPage* p = new ArenaPage(logicalSize, bufferSize); memcpy(p->buffer, buffer, logicalSize); + + // Non-verifying header parse just to initialize members + p->postReadHeader(invalidPhysicalPageID, false); + p->encryptionKey = encryptionKey; + return Reference(p); } // Get an ArenaPage which depends on this page's Arena and references some of its memory - Reference subPage(int offset, int len) const { + Reference getSubPage(int offset, int len) const { + ASSERT(offset + len <= logicalSize); ArenaPage* p = new ArenaPage(len, 0); p->buffer = buffer + offset; p->arena.dependsOn(arena); + + // Non-verifying header parse just to initialize component pointers + p->postReadHeader(invalidPhysicalPageID, false); + p->encryptionKey = encryptionKey; + return Reference(p); } - // Given a vector of pages with the same ->size(), create a new ArenaPage with a ->size() that is - // equivalent to all of the input pages and has all of their contents copied into it. - static Reference concatPages(const std::vector>& pages) { - int usableSize = pages.front()->size(); - int totalUsableSize = pages.size() * usableSize; - int totalBufferSize = pages.front()->bufferSize * pages.size(); - ArenaPage* superpage = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize); - - uint8_t* wptr = superpage->mutate(); - for (auto& p : pages) { - ASSERT(p->size() == usableSize); - memcpy(wptr, p->begin(), usableSize); - wptr += usableSize; + // The next two functions set mostly forensic info that may help in an investigation to identify data on disk. The + // exception is pageID which must be set to the physical page ID on disk where the page is written or post-read + // verification will fail. + void setWriteInfo(PhysicalPageID pageID, Version writeVersion) { + if (page->headerVersion == 1) { + RedwoodHeaderV1* h = page->getMainHeader(); + h->firstPhysicalPageID = pageID; + h->writeVersion = writeVersion; + h->writeTime = now(); } - - return Reference(superpage); } - Checksum& getChecksum() { return *(Checksum*)(buffer + size()); } + // These should be updated before writing a BTree page. Note that the logical ID that refers to a page can change + // after the page is written, if its parent is updated to point directly to its physical page ID. Therefore, the + // last known logical page ID should always be updated before writing an updated version of a BTree page. + void setLogicalPageInfo(LogicalPageID lastKnownLogicalPageID, LogicalPageID lastKnownParentLogicalPageID) { + if (page->headerVersion == 1) { + RedwoodHeaderV1* h = page->getMainHeader(); + h->lastKnownLogicalPageID = lastKnownLogicalPageID; + h->lastKnownParentLogicalPageID = lastKnownParentLogicalPageID; + } + } - Checksum calculateChecksum(LogicalPageID pageID) { return XXH3_64bits_withSeed(buffer, size(), pageID); } + // Must be called before writing to disk to update headers and encrypt page + // Pre: Encoding-specific header fields are set if needed + // Secret is set if needed + // Post: Main and Encoding subheaders are updated + // Payload is possibly encrypted + void preWrite(PhysicalPageID pageID) const { + // Explicitly check payload definedness to make the source of valgrind errors more clear. + // Without this check, calculating a checksum on a payload with undefined bytes does not + // cause a valgrind error but the resulting checksum is undefined which causes errors later. 
+ ASSERT(VALGRIND_CHECK_MEM_IS_DEFINED(pPayload, payloadSize) == 0); - void updateChecksum(LogicalPageID pageID) { getChecksum() = calculateChecksum(pageID); } + if (page->encodingType == EncodingType::XXHash64) { + page->getEncodingHeader()->encode(pPayload, payloadSize, pageID); + } else if (page->encodingType == EncodingType::XOREncryption) { + ASSERT(encryptionKey.secret.size() == 1); + XOREncryptionEncodingHeader* xh = page->getEncodingHeader(); + xh->keyID = encryptionKey.id.orDefault(0); + xh->encode(encryptionKey.secret[0], pPayload, payloadSize, pageID); + } else { + throw page_encoding_not_supported(); + } - bool verifyChecksum(LogicalPageID pageID) { return getChecksum() == calculateChecksum(pageID); } + if (page->headerVersion == 1) { + page->getMainHeader()->updateChecksum(buffer, pPayload - buffer); + } else { + throw page_header_version_not_supported(); + } + } + + // Must be called after reading from disk to verify all non-payload bytes + // Pre: Bytes from storage medium copied into raw buffer space + // Post: Page headers outside of payload are verified (unless verify is false) + // encryptionKey is updated with information from encoding header if needed + // Payload is accessible via data(), dataSize(), etc. + // + // Exceptions are thrown for unknown header types or pages which fail verification + void postReadHeader(PhysicalPageID pageID, bool verify = true) { + pPayload = page->getPayload(); + payloadSize = logicalSize - (pPayload - buffer); + + // Populate encryption key with relevant fields from page + if (page->encodingType == EncodingType::XOREncryption) { + encryptionKey.id = page->getEncodingHeader()->keyID; + } + + if (page->headerVersion == 1) { + if (verify) { + RedwoodHeaderV1* h = page->getMainHeader(); + h->verifyChecksum(buffer, pPayload - buffer); + if (pageID != h->firstPhysicalPageID) { + throw page_header_wrong_page_id(); + } + } + } else { + throw page_header_version_not_supported(); + } + } + + // Pre: postReadHeader has been called, encoding-specific parameters (such as the encryption secret) have been set + // Post: Payload has been verified and decrypted if necessary + void postReadPayload(PhysicalPageID pageID) { + if (page->encodingType == EncodingType::XXHash64) { + page->getEncodingHeader()->decode(pPayload, payloadSize, pageID); + } else if (page->encodingType == EncodingType::XOREncryption) { + ASSERT(encryptionKey.secret.size() == 1); + page->getEncodingHeader()->decode( + encryptionKey.secret[0], pPayload, payloadSize, pageID); + } else { + throw page_encoding_not_supported(); + } + } const Arena& getArena() const { return arena; } + static bool isEncodingTypeEncrypted(EncodingType t) { return t == EncodingType::XOREncryption; } + + // Returns true if the page's encoding type employs encryption + bool isEncrypted() const { return isEncodingTypeEncrypted(getEncodingType()); } + private: Arena arena; + + // The logical size of the page, which can be smaller than bufferSize, which is only of + // practical purpose in simulation to use arbitrarily small page sizes to test edge cases + // with shorter execution time int logicalSize; + + // The 4k-aligned physical size of allocated memory for the page which also represents the + // block size to be written to disk int bufferSize; - uint8_t* buffer; + + // buffer is a pointer to the page's memory + // For convenience, it is unioned with a Page pointer which defines the page structure + union { + uint8_t* buffer; + const PageHeader* page; + }; + + // Pointer and length of page space available to 
the user + // These are accessed very often so they are stored directly + uint8_t* pPayload; + int payloadSize; public: - mutable void* userData; - mutable void (*userDataDestructor)(void*); + EncodingType getEncodingType() const { return page->encodingType; } + + PhysicalPageID getPhysicalPageID() const { + if (page->headerVersion == 1) { + return page->getMainHeader()->firstPhysicalPageID; + } else { + throw page_header_version_not_supported(); + } + } + + // Used by encodings that do encryption + EncryptionKey encryptionKey; + + mutable ArbitraryObject extra; }; class IPagerSnapshot { @@ -184,18 +605,21 @@ public: virtual void addref() = 0; virtual void delref() = 0; + + ArbitraryObject extra; }; // This API is probably too customized to the behavior of DWALPager and probably needs some changes to be more generic. class IPager2 : public IClosable { public: + virtual std::string getName() const = 0; + // Returns an ArenaPage that can be passed to writePage. The data in the returned ArenaPage might not be zeroed. - virtual Reference newPageBuffer(size_t size = 1) = 0; + virtual Reference newPageBuffer(size_t blocks = 1) = 0; // Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead). // For a given pager instance, separate calls to this function must return the same value. // Only valid to call after recovery is complete. - virtual int getUsablePageSize() const = 0; virtual int getPhysicalPageSize() const = 0; virtual int getLogicalPageSize() const = 0; virtual int getPagesPerExtent() const = 0; @@ -251,7 +675,7 @@ public: bool noHit) = 0; virtual Future> readMultiPage(PagerEventReasons reason, unsigned int level, - Standalone> pageIDs, + VectorRef pageIDs, int priority, bool cacheable, bool noHit) = 0; @@ -271,16 +695,13 @@ public: // The snapshot shall be usable until setOldVersion() is called with a version > v. virtual Reference getReadSnapshot(Version v) = 0; - // Atomically make durable all pending page writes, page frees, and update the metadata string, - // setting the committed version to v - // v must be >= the highest versioned page write. - virtual Future commit(Version v) = 0; + // Atomically make durable all pending page writes, page frees, and update the user commit + // record at version v + // v must be higher than the highest committed version + virtual Future commit(Version v, Value commitRecord) = 0; - // Get the latest meta key set or committed - virtual Key getMetaKey() const = 0; - - // Set the metakey which will be stored in the next commit - virtual void setMetaKey(KeyRef metaKey) = 0; + // Get the latest committed user commit record + virtual Value getCommitRecord() const = 0; virtual StorageBytes getStorageBytes() const = 0; @@ -318,4 +739,52 @@ protected: ~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface }; +// The null key provider is useful to simplify page decoding. +// It throws an error for any key info requested. 
+class NullKeyProvider : public IEncryptionKeyProvider { +public: + virtual ~NullKeyProvider() {} + Future getSecrets(const EncryptionKeyRef& key) override { throw encryption_key_not_found(); } + Future getByRange(const KeyRef& begin, const KeyRef& end) override { + throw encryption_key_not_found(); + } +}; + +// Key provider for dummy XOR encryption scheme +class XOREncryptionKeyProvider : public IEncryptionKeyProvider { +public: + XOREncryptionKeyProvider(std::string filename) { + ASSERT(g_network->isSimulated()); + + // Choose a deterministic random filename (without path) byte for secret generation + // Remove any leading directory names + size_t lastSlash = filename.find_last_of("\\/"); + if (lastSlash != filename.npos) { + filename.erase(0, lastSlash); + } + xorWith = filename.empty() ? 0x5e + : (uint8_t)filename[XXH3_64bits(filename.data(), filename.size()) % filename.size()]; + } + + virtual ~XOREncryptionKeyProvider() {} + + virtual Future getSecrets(const EncryptionKeyRef& key) override { + if (!key.id.present()) { + throw encryption_key_not_found(); + } + EncryptionKey s = key; + uint8_t secret = ~(uint8_t)key.id.get() ^ xorWith; + s.secret = StringRef(s.arena(), &secret, 1); + return s; + } + + virtual Future getByRange(const KeyRef& begin, const KeyRef& end) override { + EncryptionKeyRef k; + k.id = end.empty() ? 0 : *(end.end() - 1); + return getSecrets(k); + } + + uint8_t xorWith; +}; + #endif diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 0f34a5190d..9b4b177c2c 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -21,8 +21,11 @@ #include "contrib/fmt-8.1.1/include/fmt/format.h" #include "fdbclient/FDBTypes.h" #include "fdbserver/Knobs.h" +#include "flow/Error.h" #include "flow/IRandom.h" #include "flow/Knobs.h" +#include "flow/ObjectSerializer.h" +#include "flow/Trace.h" #include "flow/flow.h" #include "flow/Histogram.h" #include @@ -51,31 +54,20 @@ #define REDWOOD_DEBUG 0 -// Only print redwood debug statements for a certain address. Useful in simulation with many redwood processes to reduce -// log size. -#define REDWOOD_DEBUG_ADDR 0 -// example addr: "[abcd::4:0:1:4]:1" -#define REDWOOD_DEBUG_ADDR_VAL ""; +// Only print debug info for a specific address +static NetworkAddress g_debugAddress = NetworkAddress::parse("0.0.0.0:0"); +// Only print debug info after a specific time +static double g_debugStart = 0; +// Debug output stream +static FILE* g_debugStream = stdout; -#define debug_printf_stream stdout #define debug_printf_always(...) \ - { \ + if (now() >= g_debugStart && \ + (!g_network->getLocalAddress().isValid() || g_network->getLocalAddress() == g_debugAddress)) { \ std::string prefix = format("%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \ std::string msg = format(__VA_ARGS__); \ - fputs(addPrefix(prefix, msg).c_str(), debug_printf_stream); \ - fflush(debug_printf_stream); \ - } - -#define debug_printf_addr(...) 
\ - { \ - std::string addr = REDWOOD_DEBUG_ADDR_VAL; \ - if (!memcmp(addr.c_str(), g_network->getLocalAddress().toString().c_str(), addr.size())) { \ - std::string prefix = \ - format("%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \ - std::string msg = format(__VA_ARGS__); \ - fputs(addPrefix(prefix, msg).c_str(), debug_printf_stream); \ - fflush(debug_printf_stream); \ - } \ + fputs(addPrefix(prefix, msg).c_str(), g_debugStream); \ + fflush(g_debugStream); \ } #define debug_print(str) debug_printf("%s\n", str.c_str()) @@ -85,8 +77,6 @@ #if defined(NO_INTELLISENSE) #if REDWOOD_DEBUG #define debug_printf debug_printf_always -#elif REDWOOD_DEBUG_ADDR -#define debug_printf debug_printf_addr #else #define debug_printf debug_printf_noop #endif @@ -424,9 +414,9 @@ std::string toString(const std::pair& o) { return format("{%s, %s}", toString(o.first).c_str(), toString(o.second).c_str()); } -static constexpr int ioMinPriority = 0; -static constexpr int ioLeafPriority = 1; -static constexpr int ioMaxPriority = 3; +constexpr static int ioMinPriority = 0; +constexpr static int ioLeafPriority = 1; +constexpr static int ioMaxPriority = 3; // A FIFO queue of T stored as a linked list of pages. // Main operations are pop(), pushBack(), pushFront(), and flush(). @@ -450,6 +440,17 @@ static constexpr int ioMaxPriority = 3; // the previous tail page to update its next link, which risks corrupting it and losing // data that was not yet popped if that write is never fsync'd. // +// While FIFOQueue stores data inside a pager, it also is used _by_ the pager to hold internal metadata +// such as free lists and logical page remapping activity. +// Therefore, FIFOQueue +// - does not use versioned page reads +// - does not use versioned page updates via atomicUpdatePage() +// - does not update pages in-place +// - writes to a Page ID only once before freeing it +// +// For clarity, FIFOQueue Page IDs are always typed as PhysicalPageID, as they are always +// physical page IDs since atomic page updates are not used. +// // Requirements on T // - must be trivially copyable // OR have a specialization for FIFOQueueCodec @@ -491,21 +492,26 @@ class FIFOQueue { public: #pragma pack(push, 1) struct QueueState { - bool operator==(const QueueState& rhs) const { return memcmp(this, &rhs, sizeof(QueueState)) == 0; } - QueueID queueID = invalidQueueID; - LogicalPageID headPageID = invalidLogicalPageID; - LogicalPageID tailPageID = invalidLogicalPageID; - uint16_t headOffset; - // Note that there is no tail index because the tail page is always never-before-written and its index will - // start at 0 - int64_t numPages; - int64_t numEntries; - bool usesExtents = false; // Is this an extent based queue? 
- LogicalPageID prevExtentEndPageID = invalidLogicalPageID; - bool tailPageNewExtent = false; - KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(QueueState)); } + constexpr static FileIdentifier file_identifier = 16482812; - void fromKeyRef(KeyRef k) { memcpy(this, k.begin(), k.size()); } + QueueID queueID; + + // First page in the linked-list queue structure + PhysicalPageID headPageID; + // Item offset representing the next item in the queue in the first page + uint16_t headOffset; + // Last page in the linked-list queue structure, where new entries will be written + // Note there is no tailPageOffset, it is always 0 because the tail page has never been written + PhysicalPageID tailPageID; + + uint32_t numPages; + // numEntries could technically exceed the max page ID so it is 64 bits + uint64_t numEntries; + + // State for queues that use pages in contiguous blocks of disk space called extents + bool usesExtents; + PhysicalPageID prevExtentEndPageID; + bool tailPageNewExtent; // Tail page points to the start of a new extent std::string toString() const { return format("{queueID: %u head: %s:%d tail: %s numPages: %" PRId64 " numEntries: %" PRId64 @@ -518,20 +524,36 @@ public: numEntries, usesExtents); } + + template + void serialize(Ar& ar) { + serializer(ar, + queueID, + headPageID, + headOffset, + tailPageID, + numPages, + numEntries, + usesExtents, + prevExtentEndPageID, + tailPageNewExtent); + } }; - struct RawPage { + struct QueuePage { // The next page of the queue after this one - LogicalPageID nextPageID; + PhysicalPageID nextPageID; // The start offset of the next page uint16_t nextOffset; - // The end offset of the current page + // The end offset of the current page's data entries uint16_t endOffset; // Current page within the extent - LogicalPageID extentCurPageID; - // The nd page within the extent - LogicalPageID extentEndPageID; + PhysicalPageID extentCurPageID; + // The end page within the extent + PhysicalPageID extentEndPageID; + uint16_t itemSpace; // Get pointer to data after page header + const uint8_t* begin() const { return (const uint8_t*)(this + 1); } uint8_t* begin() { return (uint8_t*)(this + 1); } }; #pragma pack(pop) @@ -545,22 +567,22 @@ public: FIFOQueue* queue; // The current page and pageID being read or written to - LogicalPageID pageID; + PhysicalPageID pageID; Reference page; // The first page ID to be written to the pager, if this cursor has written anything - LogicalPageID firstPageIDWritten; + PhysicalPageID firstPageIDWritten; - // Offset after RawPage header in page to next read from or write to + // Offset after QueuePage::begin() to next read from or write to int offset; // A read cursor will not read this page (or beyond) - LogicalPageID endPageID; + PhysicalPageID endPageID; // Page future and corresponding page ID for the expected next page to be used. It may not // match the current page's next page link because queues can prepended with new front pages. Future> nextPageReader; - LogicalPageID nextPageID; + PhysicalPageID nextPageID; // Future that represents all outstanding write operations previously issued // This exists because writing the queue returns void, not a future @@ -573,15 +595,15 @@ public: // Initialize a cursor. 
void init(FIFOQueue* q = nullptr, Mode m = NONE, - LogicalPageID initialPageID = invalidLogicalPageID, + PhysicalPageID initialPageID = invalidPhysicalPageID, bool initExtentInfo = true, bool tailPageNewExtent = false, - LogicalPageID endPage = invalidLogicalPageID, + PhysicalPageID endPage = invalidPhysicalPageID, int readOffset = 0, - LogicalPageID prevExtentEndPageID = invalidLogicalPageID) { + PhysicalPageID prevExtentEndPageID = invalidPhysicalPageID) { queue = q; mode = m; - firstPageIDWritten = invalidLogicalPageID; + firstPageIDWritten = invalidPhysicalPageID; offset = readOffset; endPageID = endPage; page.clear(); @@ -598,17 +620,17 @@ public: else startNextPageLoad(pageID); } else { - nextPageID = invalidLogicalPageID; + nextPageID = invalidPhysicalPageID; } } else { - pageID = invalidLogicalPageID; + pageID = invalidPhysicalPageID; ASSERT(mode == WRITE || - (initialPageID == invalidLogicalPageID && readOffset == 0 && endPage == invalidLogicalPageID)); + (initialPageID == invalidPhysicalPageID && readOffset == 0 && endPage == invalidPhysicalPageID)); } debug_printf("FIFOQueue::Cursor(%s) initialized\n", toString().c_str()); - if (mode == WRITE && initialPageID != invalidLogicalPageID) { + if (mode == WRITE && initialPageID != invalidPhysicalPageID) { debug_printf("FIFOQueue::Cursor(%s) init. Adding new page %u\n", toString().c_str(), initialPageID); addNewPage(initialPageID, 0, true, initExtentInfo, tailPageNewExtent, prevExtentEndPageID); } @@ -642,7 +664,7 @@ public: this, ::toString(pageID).c_str(), offset, - page ? raw()->endOffset : -1); + page ? header()->endOffset : -1); } if (mode == POP || mode == READONLY) { return format("{ReadCursor %s:%p pos=%s:%d rawEndOffset=%d endPage=%s nextPage=%s}", @@ -650,7 +672,7 @@ public: this, ::toString(pageID).c_str(), offset, - page ? raw()->endOffset : -1, + (page && header()) ? header()->endOffset : -1, ::toString(endPageID).c_str(), ::toString(nextPageID).c_str()); } @@ -675,16 +697,16 @@ public: // Returns true if any items have been written to the last page bool pendingTailWrites() const { return mode == WRITE && offset != 0; } - RawPage* raw() const { return ((RawPage*)(page->begin())); } + QueuePage* header() const { return ((QueuePage*)(page->mutateData())); } - void setNext(LogicalPageID pageID, int offset) { + void setNext(PhysicalPageID pageID, int offset) { ASSERT(mode == WRITE); - RawPage* p = raw(); + QueuePage* p = header(); p->nextPageID = pageID; p->nextOffset = offset; } - void startNextPageLoad(LogicalPageID id) { + void startNextPageLoad(PhysicalPageID id) { nextPageID = id; debug_printf( "FIFOQueue::Cursor(%s) loadPage start id=%s\n", toString().c_str(), ::toString(nextPageID).c_str()); @@ -700,7 +722,7 @@ public: debug_printf("FIFOQueue::Cursor(%s) loadExtent\n", toString().c_str()); return map(queue->pager->readExtent(pageID), [=](Reference p) { page = p; - debug_printf("FIFOQueue::Cursor(%s) loadExtent done. Page: %p\n", toString().c_str(), page->begin()); + debug_printf("FIFOQueue::Cursor(%s) loadExtent done. 
Page: %p\n", toString().c_str(), page->rawData()); return Void(); }); } @@ -708,11 +730,13 @@ public: void writePage() { ASSERT(mode == WRITE); debug_printf("FIFOQueue::Cursor(%s) writePage\n", toString().c_str()); - VALGRIND_MAKE_MEM_DEFINED(raw()->begin(), offset); - VALGRIND_MAKE_MEM_DEFINED(raw()->begin() + offset, queue->dataBytesPerPage - raw()->endOffset); + + // Zero unused space within itemspace + memset(header()->begin() + header()->endOffset, 0, header()->itemSpace - header()->endOffset); + queue->pager->updatePage( - PagerEventReasons::MetaData, nonBtreeLevel, VectorRef(&pageID, 1), page); - if (firstPageIDWritten == invalidLogicalPageID) { + PagerEventReasons::MetaData, nonBtreeLevel, VectorRef(&pageID, 1), page); + if (firstPageIDWritten == invalidPhysicalPageID) { firstPageIDWritten = pageID; } } @@ -724,14 +748,14 @@ public: // as a new tail page. // if initializeExtentInfo is true in addition to initializeNewPage, update the extentEndPageID info // in the mew page being added using newExtentPage and prevExtentEndPageID parameters - void addNewPage(LogicalPageID newPageID, + void addNewPage(PhysicalPageID newPageID, int newOffset, bool initializeNewPage, bool initializeExtentInfo = false, bool newExtentPage = false, - LogicalPageID prevExtentEndPageID = invalidLogicalPageID) { + PhysicalPageID prevExtentEndPageID = invalidPhysicalPageID) { ASSERT(mode == WRITE); - ASSERT(newPageID != invalidLogicalPageID); + ASSERT(newPageID != invalidPhysicalPageID); debug_printf("FIFOQueue::Cursor(%s) Adding page %s initPage=%d initExtentInfo=%d newExtentPage=%d\n", toString().c_str(), ::toString(newPageID).c_str(), @@ -747,8 +771,7 @@ public: ::toString(newPageID).c_str(), newOffset); writePage(); - auto p = raw(); - prevExtentEndPageID = p->extentEndPageID; + prevExtentEndPageID = header()->extentEndPageID; if (pageID == prevExtentEndPageID) newExtentPage = true; debug_printf( @@ -762,28 +785,33 @@ public: pageID = newPageID; offset = newOffset; - if (BUGGIFY) { - // Randomly change the byte limit for queue pages. The min here must be large enough for at least one - // queue item of any type. This change will suddenly make some pages being written to seem overfilled - // but this won't break anything, the next write will just be detected as not fitting and the page will - // end. - queue->dataBytesPerPage = - deterministicRandom()->randomInt(50, queue->pager->getUsablePageSize() - sizeof(RawPage)); - } - if (initializeNewPage) { debug_printf("FIFOQueue::Cursor(%s) Initializing new page. usesExtents: %d, initializeExtentInfo: %d\n", toString().c_str(), queue->usesExtents, initializeExtentInfo); page = queue->pager->newPageBuffer(); + page->init(EncodingType::XXHash64, + queue->usesExtents ? 
PageType::QueuePageInExtent : PageType::QueuePageStandalone, + (uint8_t)queue->queueID); setNext(0, 0); - auto p = raw(); + auto p = header(); ASSERT(newOffset == 0); p->endOffset = 0; + p->itemSpace = page->dataSize() - sizeof(QueuePage); + if (g_network->isSimulated() && deterministicRandom()->coinflip()) { + // Randomly reduce available item space to cause more queue pages to be needed + int reducedSpace = deterministicRandom()->randomInt(50, p->itemSpace); + + // Zero the eliminated space + memset(header()->begin() + reducedSpace, 0, p->itemSpace - reducedSpace); + + p->itemSpace = reducedSpace; + } + // For extent based queue, update the index of current page within the extent if (queue->usesExtents) { - debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d pageCount %d\n", + debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d pageCount % " PRId64 "\n", toString().c_str(), ::toString(newPageID).c_str(), initializeNewPage, @@ -806,6 +834,9 @@ public: ::toString(p->extentEndPageID).c_str()); } } + } else { + p->extentCurPageID = invalidPhysicalPageID; + p->extentEndPageID = invalidPhysicalPageID; } } else { debug_printf("FIFOQueue::Cursor(%s) Clearing new page\n", toString().c_str()); @@ -821,9 +852,9 @@ public: state bool mustWait = self->isBusy(); state int bytesNeeded = Codec::bytesNeeded(item); state bool needNewPage = - self->pageID == invalidLogicalPageID || self->offset + bytesNeeded > self->queue->dataBytesPerPage; + self->pageID == invalidPhysicalPageID || self->offset + bytesNeeded > self->header()->itemSpace; - if (BUGGIFY) { + if (g_network->isSimulated()) { // Sometimes (1% probability) decide a new page is needed as long as at least 1 item has been // written (indicated by non-zero offset) to the current page. if ((self->offset > 0) && deterministicRandom()->random01() < 0.01) { @@ -846,9 +877,9 @@ public: // would have changed the cursor state // Otherwise, taking the mutex would be immediate so no other writer could have run if (mustWait) { - needNewPage = self->pageID == invalidLogicalPageID || - self->offset + bytesNeeded > self->queue->dataBytesPerPage; - if (BUGGIFY) { + needNewPage = + self->pageID == invalidPhysicalPageID || self->offset + bytesNeeded > self->header()->itemSpace; + if (g_network->isSimulated()) { // Sometimes (1% probability) decide a new page is needed as long as at least 1 item has been // written (indicated by non-zero offset) to the current page. 
if ((self->offset > 0) && deterministicRandom()->random01() < 0.01) { @@ -862,16 +893,13 @@ public: if (needNewPage) { debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", self->toString().c_str(), - ::toString(item).c_str(), - ::toString(self->pageID).c_str(), - bytesNeeded, - self->queue->dataBytesPerPage); - state LogicalPageID newPageID; + ::toString(item).c_str()); + state PhysicalPageID newPageID; // If this is an extent based queue, check if there is an available page in current extent if (self->queue->usesExtents) { bool allocateNewExtent = false; - if (self->pageID != invalidLogicalPageID) { - auto praw = self->raw(); + if (self->pageID != invalidPhysicalPageID) { + auto praw = self->header(); if (praw->extentCurPageID < praw->extentEndPageID) { newPageID = praw->extentCurPageID + 1; } else { @@ -880,11 +908,11 @@ public: } else allocateNewExtent = true; if (allocateNewExtent) { - LogicalPageID newPID = wait(self->queue->pager->newExtentPageID(self->queue->queueID)); + PhysicalPageID newPID = wait(self->queue->pager->newExtentPageID(self->queue->queueID)); newPageID = newPID; } } else { - LogicalPageID newPID = wait(self->queue->pager->newPageID()); + PhysicalPageID newPID = wait(self->queue->pager->newPageID()); newPageID = newPID; } self->addNewPage(newPageID, 0, true, true); @@ -894,7 +922,7 @@ public: debug_printf( "FIFOQueue::Cursor(%s) write(%s) writing\n", self->toString().c_str(), ::toString(item).c_str()); - auto p = self->raw(); + auto p = self->header(); Codec::writeToBytes(p->begin() + self->offset, item); self->offset += bytesNeeded; p->endOffset = self->offset; @@ -975,7 +1003,7 @@ public: // If locked is true, this call owns the mutex, which would have been locked by readNext() before a recursive // call. See waitThenReadNext() for more detail. 
Future> readNext(const Optional& inclusiveMaximum = {}, FlowMutex::Lock* lock = nullptr) { - if ((mode != POP && mode != READONLY) || pageID == invalidLogicalPageID || pageID == endPageID) { + if ((mode != POP && mode != READONLY) || pageID == invalidPhysicalPageID || pageID == endPageID) { debug_printf("FIFOQueue::Cursor(%s) readNext returning nothing\n", toString().c_str()); return Optional(); } @@ -1004,16 +1032,16 @@ public: page = nextPageReader.get(); // Start loading the next page if it's not the end page - auto p = raw(); + auto p = header(); if (p->nextPageID != endPageID) { startNextPageLoad(p->nextPageID); } else { // Prevent a future next page read from reusing the same result as page would have to be updated // before the queue would read it again - nextPageID = invalidLogicalPageID; + nextPageID = invalidPhysicalPageID; } } - auto p = raw(); + auto p = header(); debug_printf("FIFOQueue::Cursor(%s) readNext reading at current position\n", toString().c_str()); ASSERT(offset < p->endOffset); int bytesRead; @@ -1039,9 +1067,9 @@ public: // tail page if (offset == p->endOffset) { debug_printf("FIFOQueue::Cursor(%s) Page exhausted\n", toString().c_str()); - LogicalPageID oldPageID = pageID; - LogicalPageID extentCurPageID = p->extentCurPageID; - LogicalPageID extentEndPageID = p->extentEndPageID; + PhysicalPageID oldPageID = pageID; + PhysicalPageID extentCurPageID = p->extentCurPageID; + PhysicalPageID extentEndPageID = p->extentEndPageID; pageID = p->nextPageID; offset = p->nextOffset; @@ -1089,7 +1117,7 @@ public: void operator=(const FIFOQueue& rhs) = delete; // Create a new queue at newPageID - void create(IPager2* p, LogicalPageID newPageID, std::string queueName, QueueID id, bool extent) { + void create(IPager2* p, PhysicalPageID newPageID, std::string queueName, QueueID id, bool extent) { debug_printf("FIFOQueue(%s) create from page %s. 
usesExtents %d\n", queueName.c_str(), toString(newPageID).c_str(), @@ -1100,13 +1128,14 @@ public: queueID = id; numPages = 1; numEntries = 0; - dataBytesPerPage = pager->getUsablePageSize() - sizeof(RawPage); usesExtents = extent; + tailPageNewExtent = false; + prevExtentEndPageID = invalidPhysicalPageID; pagesPerExtent = pager->getPagesPerExtent(); headReader.init(this, Cursor::POP, newPageID, false, false, newPageID, 0); tailWriter.init(this, Cursor::WRITE, newPageID, true, true); headWriter.init(this, Cursor::WRITE); - newTailPage = invalidLogicalPageID; + newTailPage = invalidPhysicalPageID; debug_printf("FIFOQueue(%s) created\n", queueName.c_str()); } @@ -1119,8 +1148,9 @@ public: queueID = qs.queueID; numPages = qs.numPages; numEntries = qs.numEntries; - dataBytesPerPage = pager->getUsablePageSize() - sizeof(RawPage); usesExtents = qs.usesExtents; + tailPageNewExtent = qs.tailPageNewExtent; + prevExtentEndPageID = qs.prevExtentEndPageID; pagesPerExtent = pager->getPagesPerExtent(); headReader.init(this, Cursor::POP, qs.headPageID, loadExtents, false, qs.tailPageID, qs.headOffset); tailWriter.init(this, @@ -1128,11 +1158,11 @@ public: qs.tailPageID, true, qs.tailPageNewExtent, - invalidLogicalPageID, + invalidPhysicalPageID, 0, qs.prevExtentEndPageID); headWriter.init(this, Cursor::WRITE); - newTailPage = invalidLogicalPageID; + newTailPage = invalidPhysicalPageID; debug_printf("FIFOQueue(%s) recovered\n", queueName.c_str()); } @@ -1151,7 +1181,7 @@ public: c.initReadOnly(self->headReader, true); debug_printf("FIFOQueue::Cursor(%s) peekAllExt begin\n", c.toString().c_str()); - if (c.pageID == invalidLogicalPageID || c.pageID == c.endPageID) { + if (c.pageID == invalidPhysicalPageID || c.pageID == c.endPageID) { debug_printf("FIFOQueue::Cursor(%s) peekAllExt returning nothing\n", c.toString().c_str()); res.sendError(end_of_stream()); return Void(); @@ -1173,37 +1203,36 @@ public: results.reserve(results.arena(), self->pagesPerExtent * self->pager->getPhysicalPageSize() / sizeof(T)); // Loop over all the pages in this extent - int pageIdx = 0; + state int pageIdx = 0; loop { // Position the page pointer to current page in the extent - Reference page = - c.page->subPage(pageIdx++ * self->pager->getPhysicalPageSize(), self->pager->getLogicalPageSize()); - debug_printf("FIFOQueue::Cursor(%s) peekALLExt %s. Offset %d, CalculateChecksum %d ChecksumInPage %d\n", + state Reference page = c.page->getSubPage(pageIdx++ * self->pager->getPhysicalPageSize(), + self->pager->getLogicalPageSize()); + debug_printf("FIFOQueue::Cursor(%s) peekALLExt %s. Offset %d\n", c.toString().c_str(), toString(c.pageID).c_str(), - c.pageID * self->pager->getPhysicalPageSize(), - page->calculateChecksum(c.pageID), - page->getChecksum()); - if (!page->verifyChecksum(c.pageID)) { - debug_printf("FIFOQueue::Cursor(%s) peekALLExt checksum failed for %s. 
Offset %d, " - "CalculateChecksum %d ChecksumInPage %d\n", - c.toString().c_str(), - toString(c.pageID).c_str(), - c.pageID * self->pager->getPhysicalPageSize(), - page->calculateChecksum(c.pageID), - page->getChecksum()); - Error e = checksum_failed(); + c.pageID * self->pager->getPhysicalPageSize()); + + try { + page->postReadHeader(c.pageID); + // These pages are not encrypted + page->postReadPayload(c.pageID); + } catch (Error& e) { TraceEvent(SevError, "RedwoodChecksumFailed") .error(e) .detail("PageID", c.pageID) .detail("PageSize", self->pager->getPhysicalPageSize()) - .detail("Offset", c.pageID * self->pager->getPhysicalPageSize()) - .detail("CalculatedChecksum", page->calculateChecksum(c.pageID)) - .detail("ChecksumInPage", page->getChecksum()); - throw e; + .detail("Offset", c.pageID * self->pager->getPhysicalPageSize()); + + debug_printf("FIFOQueue::Cursor(%s) peekALLExt getSubPage error=%s for %s. Offset %d ", + c.toString().c_str(), + e.what(), + toString(c.pageID).c_str(), + c.pageID * self->pager->getPhysicalPageSize()); + throw; } - RawPage* p = (RawPage*)(page->begin()); + const QueuePage* p = (const QueuePage*)(page->data()); int bytesRead; // Now loop over all entries inside the current page @@ -1232,7 +1261,7 @@ public: } // End of Page // Check if we have reached the end of the queue - if (c.pageID == invalidLogicalPageID || c.pageID == c.endPageID) { + if (c.pageID == invalidPhysicalPageID || c.pageID == c.endPageID) { debug_printf("FIFOQueue::Cursor(%s) peekAllExt Queue exhausted\n", c.toString().c_str()); res.send(results); @@ -1373,16 +1402,16 @@ public: // // If the newTailPage future is ready but it's an invalid page and the tail page we are currently pointed to // has had items added to it, then get a new tail page ID. - if (self->newTailPage.isReady() && self->newTailPage.get() == invalidLogicalPageID) { + if (self->newTailPage.isReady() && self->newTailPage.get() == invalidPhysicalPageID) { if (self->tailWriter.pendingTailWrites()) { debug_printf("FIFOQueue(%s) preFlush starting to get new page ID\n", self->name.c_str()); if (self->usesExtents) { - if (self->tailWriter.pageID == invalidLogicalPageID) { + if (self->tailWriter.pageID == invalidPhysicalPageID) { self->newTailPage = self->pager->newExtentPageID(self->queueID); self->tailPageNewExtent = true; - self->prevExtentEndPageID = invalidLogicalPageID; + self->prevExtentEndPageID = invalidPhysicalPageID; } else { - auto p = self->tailWriter.raw(); + auto p = self->tailWriter.header(); debug_printf( "FIFOQueue(%s) newTailPage tailWriterPage %u extentCurPageID %u, extentEndPageID %u\n", self->name.c_str(), @@ -1396,7 +1425,7 @@ public: } else { self->newTailPage = self->pager->newExtentPageID(self->queueID); self->tailPageNewExtent = true; - self->prevExtentEndPageID = invalidLogicalPageID; + self->prevExtentEndPageID = invalidPhysicalPageID; } } debug_printf("FIFOQueue(%s) newTailPage tailPageNewExtent:%d prevExtentEndPageID: %u " @@ -1410,7 +1439,7 @@ public: workPending = true; } else { if (self->usesExtents) { - auto p = self->tailWriter.raw(); + auto p = self->tailWriter.header(); self->prevExtentEndPageID = p->extentEndPageID; self->tailPageNewExtent = false; debug_printf("FIFOQueue(%s) newTailPage tailPageNewExtent: %d prevExtentEndPageID: %u " @@ -1436,7 +1465,7 @@ public: bool initTailWriter = true; // If a new tail page was allocated, link the last page of the tail writer to it. 
- if (newTailPage.get() != invalidLogicalPageID) { + if (newTailPage.get() != invalidPhysicalPageID) { tailWriter.addNewPage(newTailPage.get(), 0, false, false); // The flush sequence allocated a page and added it to the queue so increment numPages ++numPages; @@ -1444,7 +1473,7 @@ public: // newPage() should be ready immediately since a pageID is being explicitly passed. ASSERT(!tailWriter.isBusy()); - newTailPage = invalidLogicalPageID; + newTailPage = invalidPhysicalPageID; initTailWriter = true; } @@ -1467,7 +1496,7 @@ public: tailWriter.pageID, initTailWriter /*false*/, tailPageNewExtent, - invalidLogicalPageID, + invalidPhysicalPageID, 0, prevExtentEndPageID); headWriter.init(this, Cursor::WRITE); @@ -1494,17 +1523,16 @@ public: int64_t numPages; int64_t numEntries; - int dataBytesPerPage; int pagesPerExtent; - bool usesExtents = false; - bool tailPageNewExtent = false; - LogicalPageID prevExtentEndPageID; + bool usesExtents; + bool tailPageNewExtent; + PhysicalPageID prevExtentEndPageID; Cursor headReader; Cursor tailWriter; Cursor headWriter; - Future newTailPage; + Future newTailPage; // For debugging std::string name; @@ -1521,7 +1549,7 @@ int nextPowerOf2(uint32_t x) { } struct RedwoodMetrics { - static constexpr unsigned int btreeLevels = 5; + constexpr static unsigned int btreeLevels = 5; static int maxRecordCount; struct EventReasonsArray { @@ -1871,9 +1899,9 @@ public: !evictionOrder.empty()) { Entry& toEvict = evictionOrder.front(); - debug_printf("Evictor count=%" PRId64 " sizeUsed=%" PRId64 " sizeLimit=%" PRId64 " sizePenalty=%" PRId64 + debug_printf("Evictor count=%d sizeUsed=%" PRId64 " sizeLimit=%" PRId64 " sizePenalty=%" PRId64 " needed=%d Trying to evict %s evictable %d\n", - evictionOrder.size(), + (int)evictionOrder.size(), sizeUsed, sizeLimit, reservedSize, @@ -2043,7 +2071,7 @@ Future forwardError(Future f, Promise target) { target.sendError(e); } - throw e; + throw; } } @@ -2086,6 +2114,9 @@ public: int64_t* getPageCachePenaltySource() override { return &pageCache.evictor().reservedSize; } + constexpr static PhysicalPageID primaryHeaderPageID = 0; + constexpr static PhysicalPageID backupHeaderPageID = 1; + #pragma pack(push, 1) struct DelayedFreePage { Version version; @@ -2152,19 +2183,25 @@ public: // If the file already exists, pageSize might be different than desiredPageSize // Use pageCacheSizeBytes == 0 to use default from flow knobs // If memoryOnly is true, the pager will exist only in memory and once the cache is full writes will fail. 
+ // Note that ownership is not taken for keyProvider and it must outlive the pager DWALPager(int desiredPageSize, int desiredExtentSize, std::string filename, int64_t pageCacheSizeBytes, int64_t remapCleanupWindowBytes, int concurrentExtentReads, - bool memoryOnly = false, + bool memoryOnly, + std::shared_ptr keyProvider, Promise errorPromise = {}) - : ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, FLOW_KNOBS->MAX_OUTSTANDING / 2), - pageCacheBytes(pageCacheSizeBytes), pHeader(nullptr), desiredPageSize(desiredPageSize), - desiredExtentSize(desiredExtentSize), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), + : keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, FLOW_KNOBS->MAX_OUTSTANDING / 2), + pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), + filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { + if (keyProvider == nullptr) { + keyProvider = std::make_shared(); + } + // This sets the page cache size for all PageCacheT instances using the same evictor pageCache.evictor().sizeLimit = pageCacheBytes; @@ -2176,6 +2213,8 @@ public: recoverFuture = forwardError(recover(this), errorPromise); } + std::string getName() const override { return filename; } + void setPageSize(int size) { // Conservative maximum for number of records that can fit in this page size g_redwoodMetrics.updateMaxRecordCount(315.0 * size / 4096); @@ -2185,9 +2224,7 @@ public: // logicalPageSize bytes int blocks = 1 + ((logicalPageSize - 1) / smallestPhysicalBlock); physicalPageSize = blocks * smallestPhysicalBlock; - if (pHeader != nullptr) { - pHeader->pageSize = logicalPageSize; - } + header.pageSize = logicalPageSize; } void setExtentSize(int size) { @@ -2199,14 +2236,22 @@ public: pagesPerExtent = 1 + ((size - 1) / physicalPageSize); } physicalExtentSize = pagesPerExtent * physicalPageSize; - - if (pHeader != nullptr) { - pHeader->extentSize = size; - } + header.extentSize = size; } - void updateCommittedHeader() { - memcpy(lastCommittedHeaderPage->mutate(), headerPage->begin(), smallestPhysicalBlock); + void updateHeaderPage() { + Value serializedHeader = ObjectWriter::toValue(header, Unversioned()); + ASSERT(serializedHeader.size() <= headerPage->dataSize()); + serializedHeader.copyTo(headerPage->mutateData()); + + // Set remaining header bytes to \xff + memset( + headerPage->mutateData() + serializedHeader.size(), 0xff, headerPage->dataSize() - serializedHeader.size()); + } + + void updateLastCommittedHeader() { + lastCommittedHeaderPage = headerPage->clone(); + lastCommittedHeader = header; } ACTOR static Future recover(DWALPager* self) { @@ -2223,10 +2268,9 @@ public: } wait(store(self->pageFile, IAsyncFileSystem::filesystem()->open(self->filename, flags, 0644))); } + // Header page is always treated as having a page size of smallestPhysicalBlock self->setPageSize(smallestPhysicalBlock); - self->lastCommittedHeaderPage = self->newPageBuffer(); - self->pLastCommittedHeader = (Header*)self->lastCommittedHeaderPage->begin(); state int64_t fileSize = 0; if (exists) { @@ -2244,60 +2288,83 @@ public: if (exists && fileSize >= (self->smallestPhysicalBlock * 2)) { debug_printf("DWALPager(%s) recovering using existing file\n", self->filename.c_str()); - state bool recoveredHeader = false; + state bool recoveredBackupHeader = false; - // Read physical page 0 directly - 
wait(store(self->headerPage, self->readHeaderPage(self, 0))); + // Try to read primary header + try { + wait(store(self->headerPage, self->readHeaderPage(primaryHeaderPageID))); + } catch (Error& e) { + debug_printf("DWALPager(%s) Primary header read failed with %s\n", self->filename.c_str(), e.what()); - // If the checksum fails for the header page, try to recover committed header backup from page 1 - if (!self->headerPage->verifyChecksum(0)) { - TraceEvent(SevWarn, "RedwoodRecoveringHeader").detail("Filename", self->filename); + // Errors that can be caused by a corrupted and unsync'd write to the header page can be ignored and the + // committed, sync'd backup header will be tried. Notably, page_header_wrong_page_id is not included in + // this list because it means the header checksum passed but this page data is not meant to be at this + // location, which is a very serious error so recovery should not continue. + bool tryBackupHeader = + (e.code() == error_code_page_header_version_not_supported || + e.code() == error_code_page_encoding_not_supported || + e.code() == error_code_page_decoding_failed || e.code() == error_code_page_header_checksum_failed); - wait(store(self->headerPage, self->readHeaderPage(self, 1))); + TraceEvent(SevWarn, "RedwoodRecoveryErrorPrimaryHeaderFailed") + .errorUnsuppressed(e) + .detail("Filename", self->filename) + .detail("PageID", primaryHeaderPageID) + .detail("TryingBackupHeader", tryBackupHeader); - if (!self->headerPage->verifyChecksum(1)) { - if (g_network->isSimulated()) { - // TODO: Detect if process is being restarted and only throw injected if so? - throw io_error().asInjectedFault(); - } - - Error e = checksum_failed(); - TraceEvent(SevError, "RedwoodRecoveryFailed").error(e).detail("Filename", self->filename); - throw e; + // Don't throw if trying backup header below + if (!tryBackupHeader) { + throw; } - recoveredHeader = true; } - self->pHeader = (Header*)self->headerPage->begin(); + // If primary header wasn't valid, try backup header + if (!self->headerPage.isValid()) { + // Try to read backup header + try { + wait(store(self->headerPage, self->readHeaderPage(backupHeaderPageID))); + recoveredBackupHeader = true; + } catch (Error& e) { + debug_printf("DWALPager(%s) Backup header read failed with %s\n", self->filename.c_str(), e.what()); + TraceEvent(SevWarn, "RedwoodRecoveryErrorBackupHeaderFailed") + .error(e) + .detail("Filename", self->filename) + .detail("PageID", backupHeaderPageID); + throw; + } + } - if (self->pHeader->formatVersion != Header::FORMAT_VERSION) { + // Get header from the header page data + self->header = + ObjectReader::fromStringRef(self->headerPage->dataAsStringRef(), Unversioned()); + + if (self->header.formatVersion != PagerCommitHeader::FORMAT_VERSION) { Error e = unsupported_format_version(); - TraceEvent(SevWarn, "RedwoodRecoveryFailedWrongVersion") + TraceEvent(SevWarnAlways, "RedwoodRecoveryFailedWrongVersion") .error(e) .detail("Filename", self->filename) - .detail("Version", self->pHeader->formatVersion) - .detail("ExpectedVersion", Header::FORMAT_VERSION); + .detail("Version", self->header.formatVersion) + .detail("ExpectedVersion", PagerCommitHeader::FORMAT_VERSION); throw e; } - self->setPageSize(self->pHeader->pageSize); + self->setPageSize(self->header.pageSize); self->filePageCount = fileSize / self->physicalPageSize; self->filePageCountPending = self->filePageCount; if (self->logicalPageSize != self->desiredPageSize) { - TraceEvent(SevWarn, "RedwoodPageSizeNotDesired") - .detail("Filename", 
self->filename) + TraceEvent(SevWarnAlways, "RedwoodPageSizeMismatch") + .detail("InstanceName", self->getName()) .detail("ExistingPageSize", self->logicalPageSize) .detail("DesiredPageSize", self->desiredPageSize); } - self->setExtentSize(self->pHeader->extentSize); + self->setExtentSize(self->header.extentSize); - self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); - self->extentFreeList.recover(self, self->pHeader->extentFreeList, "ExtentFreeListRecovered"); - self->delayedFreeList.recover(self, self->pHeader->delayedFreeList, "DelayedFreeListRecovered"); - self->extentUsedList.recover(self, self->pHeader->extentUsedList, "ExtentUsedListRecovered"); - self->remapQueue.recover(self, self->pHeader->remapQueue, "RemapQueueRecovered"); + self->freeList.recover(self, self->header.freeList, "FreeListRecovered"); + self->extentFreeList.recover(self, self->header.extentFreeList, "ExtentFreeListRecovered"); + self->delayedFreeList.recover(self, self->header.delayedFreeList, "DelayedFreeListRecovered"); + self->extentUsedList.recover(self, self->header.extentUsedList, "ExtentUsedListRecovered"); + self->remapQueue.recover(self, self->header.remapQueue, "RemapQueueRecovered"); debug_printf("DWALPager(%s) Queue recovery complete.\n", self->filename.c_str()); @@ -2325,7 +2392,7 @@ public: try { loop choose { when(Standalone<VectorRef<RemappedPage>> remaps = waitNext(remapStream.getFuture())) { - debug_printf("DWALPager(%s) recovery. remaps size: %d, queueEntries: %d\n", + debug_printf("DWALPager(%s) recovery. remaps size: %d, queueEntries: %" PRId64 "\n", self->filename.c_str(), remaps.size(), self->remapQueue.numEntries); @@ -2350,9 +2417,9 @@ public: // If the header was recovered from the backup at Page 1 then write and sync it to Page 0 before continuing. // If this fails, the backup header is still intact for the next recovery attempt.
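[Illustrative sketch, not part of this patch.] The recovery path above is a primary/backup header scheme: the primary header at page 0 may be torn by an interrupted write, but the backup at page 1 was synced during the previous commit, so it always holds a valid prior commit. A compressed sketch of the fallback, with a hypothetical readAndValidate standing in for readHeaderPage() plus the post-read checks (the real code falls back only for error types that an unsynced write can cause):

    #include <functional>
    #include <stdexcept>

    struct Header {}; // committed pager state

    Header recoverHeader(const std::function<Header(int)>& readAndValidate) {
        try {
            return readAndValidate(/*pageID=*/0); // primary header
        } catch (const std::runtime_error&) {
            // Backup header: the last known committed state
            return readAndValidate(/*pageID=*/1);
        }
    }

The hunk below then promotes a recovered backup header back to page 0 before recovery continues.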
- if (recoveredHeader) { + if (recoveredBackupHeader) { // Write the header to page 0 - wait(self->writeHeaderPage(0, self->headerPage)); + wait(self->writeHeaderPage(primaryHeaderPageID, self->headerPage)); // Wait for all outstanding writes to complete wait(waitForAll(self->operations)); @@ -2364,8 +2431,8 @@ public: // Update the last committed header with the one that was recovered (which is the last known committed // header) - self->updateCommittedHeader(); - self->addSnapshot(self->pHeader->committedVersion, self->pHeader->getMetaKey()); + self->updateLastCommittedHeader(); + self->addSnapshot(self->header.committedVersion, self->header.userCommitRecord); // Reset the remapQueue head reader for normal reads self->remapQueue.resetHeadReader(); @@ -2380,7 +2447,7 @@ public: debug_printf("DWALPager(%s) creating new pager\n", self->filename.c_str()); self->headerPage = self->newPageBuffer(); - self->pHeader = (Header*)self->headerPage->begin(); + self->headerPage->init(EncodingType::XXHash64, PageType::HeaderPage, 0); // Now that the header page has been allocated, set page size to desired self->setPageSize(self->desiredPageSize); @@ -2392,19 +2459,17 @@ public: self->setExtentSize(self->desiredExtentSize); // Write new header using desiredPageSize - self->pHeader->formatVersion = Header::FORMAT_VERSION; - self->pHeader->committedVersion = initialVersion; - self->pHeader->oldestVersion = initialVersion; - // No meta key until a user sets one and commits - self->pHeader->setMetaKey(Key()); + self->header.formatVersion = PagerCommitHeader::FORMAT_VERSION; + self->header.committedVersion = initialVersion; + self->header.oldestVersion = initialVersion; // There are 2 reserved pages: // Page 0 - header // Page 1 - header backup - self->pHeader->pageCount = 2; + self->header.pageCount = 2; // Create queues - self->pHeader->queueCount = 0; + self->header.queueCount = 0; self->freeList.create(self, self->newLastPageID(), "FreeList", self->newLastQueueID(), false); self->delayedFreeList.create(self, self->newLastPageID(), "DelayedFreeList", self->newLastQueueID(), false); self->extentFreeList.create(self, self->newLastPageID(), "ExtentFreeList", self->newLastQueueID(), false); @@ -2416,20 +2481,18 @@ public: // The first commit() below will flush the queues and update the queue states in the header, // but since the queues will not be used between now and then their states will not change. // In order to populate lastCommittedHeader, update the header now with the queue states. 
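[Illustrative sketch, not part of this patch.] updateHeaderPage() earlier in this diff serializes the header struct into the fixed-size header page and fills the remainder with 0xff so stale bytes from an older, longer header can never be misread. The real code serializes with FDB's ObjectWriter; the padding pattern itself, sketched for a trivially copyable header, looks like:

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <type_traits>
    #include <vector>

    template <typename HeaderT>
    void writeHeaderToPage(const HeaderT& h, std::vector<uint8_t>& page) {
        static_assert(std::is_trivially_copyable<HeaderT>::value, "memcpy sketch only");
        assert(sizeof(HeaderT) <= page.size());
        std::memcpy(page.data(), &h, sizeof(HeaderT));
        // Pad the unused tail of the header page with 0xff
        std::memset(page.data() + sizeof(HeaderT), 0xff, page.size() - sizeof(HeaderT));
    }

The hunk below replaces the old manual memset with a call to this serialization step (updateHeaderPage).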
- self->pHeader->freeList = self->freeList.getState(); - self->pHeader->delayedFreeList = self->delayedFreeList.getState(); - self->pHeader->extentFreeList = self->extentFreeList.getState(); - self->pHeader->extentUsedList = self->extentUsedList.getState(); - self->pHeader->remapQueue = self->remapQueue.getState(); + self->header.freeList = self->freeList.getState(); + self->header.delayedFreeList = self->delayedFreeList.getState(); + self->header.extentFreeList = self->extentFreeList.getState(); + self->header.extentUsedList = self->extentUsedList.getState(); + self->header.remapQueue = self->remapQueue.getState(); - // Set remaining header bytes to \xff - memset(self->headerPage->mutate() + self->pHeader->size(), - 0xff, - self->headerPage->size() - self->pHeader->size()); + // Serialize header to header page + self->updateHeaderPage(); // There is no previously committed header, but the current header state is sufficient to use as the backup // header for the next commit, which if recovered would result in a valid empty pager at version 0. - self->updateCommittedHeader(); + self->updateLastCommittedHeader(); self->addSnapshot(initialVersion, KeyRef()); self->remapCleanupFuture = Void(); @@ -2444,15 +2507,15 @@ public: self->toTraceEvent(e); e.log(); - self->recoveryVersion = self->pHeader->committedVersion; + self->recoveryVersion = self->header.committedVersion; debug_printf("DWALPager(%s) recovered. recoveryVersion=%" PRId64 " oldestVersion=%" PRId64 - " logicalPageSize=%d physicalPageSize=%d\n", + " logicalPageSize=%d physicalPageSize=%d headerPageCount=%" PRId64 " filePageCount=%" PRId64 "\n", self->filename.c_str(), self->recoveryVersion, - self->pHeader->oldestVersion, + self->header.oldestVersion, self->logicalPageSize, self->physicalPageSize, - self->pHeader->pageCount, + self->header.pageCount, self->filePageCount); return Void(); @@ -2460,9 +2523,9 @@ public: void toTraceEvent(TraceEvent& e) const override { e.detail("FileName", filename.c_str()); - e.detail("LogicalFileSize", pHeader->pageCount * physicalPageSize); + e.detail("LogicalFileSize", header.pageCount * physicalPageSize); e.detail("PhysicalFileSize", filePageCountPending * physicalPageSize); - e.detail("CommittedVersion", pHeader->committedVersion); + e.detail("CommittedVersion", header.committedVersion); e.detail("LogicalPageSize", logicalPageSize); e.detail("PhysicalPageSize", physicalPageSize); @@ -2507,21 +2570,12 @@ public: } // Allocate a new queueID - QueueID newLastQueueID() override { - QueueID id = pHeader->queueCount; - ++pHeader->queueCount; - return id; + QueueID newLastQueueID() override { return header.queueCount++; } + + Reference newPageBuffer(size_t blocks = 1) override { + return makeReference(logicalPageSize * blocks, physicalPageSize * blocks); } - Reference newPageBuffer(size_t size = 1) override { - return Reference(new ArenaPage(logicalPageSize * size, physicalPageSize * size)); - } - - // Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead). - // For a given pager instance, separate calls to this function must return the same value. - // TODO: This is abstraction breaking. This should probably be stored as a member, calculated once on construction - // by creating an ArenaPage and getting its usable size. 
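[Illustrative sketch, not part of this patch.] setPageSize() earlier in the diff rounds the logical page size up to a whole number of smallestPhysicalBlock (4096-byte) units to get the physical page size. The arithmetic as a standalone, checkable sketch:

    constexpr int smallestPhysicalBlock = 4096;

    constexpr int physicalSizeFor(int logicalPageSize) {
        int blocks = 1 + ((logicalPageSize - 1) / smallestPhysicalBlock);
        return blocks * smallestPhysicalBlock;
    }

    static_assert(physicalSizeFor(4096) == 4096, "exact fit");
    static_assert(physicalSizeFor(8000) == 8192, "rounded up to two blocks");
    int main() { return 0; }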
- int getUsablePageSize() const override { return logicalPageSize - sizeof(ArenaPage::Checksum); } int getPhysicalPageSize() const override { return physicalPageSize; } int getLogicalPageSize() const override { return logicalPageSize; } int getPagesPerExtent() const override { return pagesPerExtent; } @@ -2558,14 +2612,14 @@ public: // Grow the pager file by one page and return it LogicalPageID newLastPageID() { - LogicalPageID id = pHeader->pageCount; + LogicalPageID id = header.pageCount; growPager(1); return id; } Future<LogicalPageID> newPageID() override { return newPageID_impl(this); } - void growPager(int64_t pages) { pHeader->pageCount += pages; } + void growPager(int64_t pages) { header.pageCount += pages; } // Get a new, previously available extent and its first page ID. The page will be considered in-use after the next commit regardless of whether or not it was written to, until it is returned to the pager via freePage() @@ -2595,20 +2649,21 @@ public: // We reserve all the pageIDs within the extent during this step // That translates to extentID being same as the return first pageID LogicalPageID newLastExtentID() { - LogicalPageID id = pHeader->pageCount; + LogicalPageID id = header.pageCount; growPager(pagesPerExtent); return id; } Future<LogicalPageID> newExtentPageID(QueueID queueID) override { return newExtentPageID_impl(this, queueID); } - ACTOR static Future<Void> writePhysicalPage_impl(DWALPager* self, - Void* data, - PagerEventReasons reason, - unsigned int level, - PhysicalPageID pageID, - int blockSize, - bool header) { + ACTOR static Future<Void> writePhysicalBlock(DWALPager* self, + Reference<ArenaPage> page, + int blockNum, + int blockSize, + PhysicalPageID pageID, + PagerEventReasons reason, + unsigned int level, + bool header) { state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ? ioMaxPriority : ioMinPriority)); ++g_redwoodMetrics.metric.pagerDiskWrite; @@ -2630,7 +2685,9 @@ public: } // Note: Not using forwardError here so a write error won't be discovered until commit time. - wait(self->pageFile->write(data, blockSize, (int64_t)pageID * blockSize)); + debug_printf("DWALPager(%s) op=writeBlock %s\n", self->filename.c_str(), toString(pageID).c_str()); + wait(self->pageFile->write(page->rawData() + (blockNum * blockSize), blockSize, (int64_t)pageID * blockSize)); + debug_printf("DWALPager(%s) op=writeBlockDone %s\n", self->filename.c_str(), toString(pageID).c_str()); return Void(); } @@ -2661,30 +2718,36 @@ public: bool header = false) { debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), - (header ? "writePhysicalHeader" : "writePhysical"), + (header ? "writePhysicalHeader" : "writePhysicalPage"), toString(pageIDs).c_str(), - page->begin()); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - page->updateChecksum(pageIDs.front()); - debug_printf("DWALPager(%s) writePhysicalPage %s CalculatedChecksum=%d ChecksumInPage=%d\n", - filename.c_str(), - toString(pageIDs).c_str(), - page->calculateChecksum(pageIDs.front()), - page->getChecksum()); + page->rawData()); + + // Set metadata before prewrite so it's in the pre-encrypted page in cache if the page is encrypted + // The actual next commit version is unknown, so the write version of a page is always the + // last committed version + 1 + page->setWriteInfo(pageIDs.front(), this->getLastCommittedVersion() + 1); + + // Copy the page if preWrite will encrypt/modify the payload + bool copy = page->isEncrypted(); + if (copy) { + page = page->clone(); + } + + page->preWrite(pageIDs.front()); + int blockSize = header ?
smallestPhysicalBlock : physicalPageSize; Future f; if (pageIDs.size() == 1) { - f = writePhysicalPage_impl(this, (Void*)page->mutate(), reason, level, pageIDs.front(), blockSize, header); - operations.push_back(f); - return f; + f = writePhysicalBlock(this, page, 0, blockSize, pageIDs.front(), reason, level, header); + } else { + std::vector> writers; + for (int i = 0; i < pageIDs.size(); ++i) { + Future p = writePhysicalBlock(this, page, i, blockSize, pageIDs[i], reason, level, header); + writers.push_back(p); + } + f = waitForAll(writers); } - std::vector> writers; - for (int i = 0; i < pageIDs.size(); ++i) { - Future p = writePhysicalPage_impl( - this, (Void*)page->mutate() + i * blockSize, reason, level, pageIDs[i], blockSize, header); - writers.push_back(p); - } - f = waitForAll(writers); + operations.push_back(f); return f; } @@ -2720,19 +2783,18 @@ public: if (!cacheEntry.initialized()) { cacheEntry.writeFuture = detach(writePhysicalPage(reason, level, pageIDs, data)); } else if (cacheEntry.reading()) { - // Wait for the read to finish, then start the write. - cacheEntry.writeFuture = map(success(cacheEntry.readFuture), [=](Void) { - writePhysicalPage(reason, level, pageIDs, data); - return Void(); - }); + // This is very unlikely, maybe impossible in the current pager use cases + // Wait for the outstanding read to finish, then start the write + cacheEntry.writeFuture = mapAsync(Void)>, Void>( + success(cacheEntry.readFuture), [=](Void) { return writePhysicalPage(reason, level, pageIDs, data); }); } // If the page is being written, wait for this write before issuing the new write to ensure the // writes happen in the correct order else if (cacheEntry.writing()) { - cacheEntry.writeFuture = map(cacheEntry.writeFuture, [=](Void) { - writePhysicalPage(reason, level, pageIDs, data); - return Void(); - }); + // This is very unlikely, maybe impossible in the current pager use cases + // Wait for the previous write to finish, then start new write + cacheEntry.writeFuture = mapAsync(Void)>, Void>( + cacheEntry.writeFuture, [=](Void) { return writePhysicalPage(reason, level, pageIDs, data); }); } else { cacheEntry.writeFuture = detach(writePhysicalPage(reason, level, pageIDs, data)); } @@ -2779,7 +2841,7 @@ public: filename.c_str(), toString(pageID).c_str(), v, - pLastCommittedHeader->oldestVersion); + lastCommittedHeader.oldestVersion); freeList.pushBack(pageID); } else { // Otherwise add it to the delayed free list @@ -2787,7 +2849,7 @@ public: filename.c_str(), toString(pageID).c_str(), v, - pLastCommittedHeader->oldestVersion); + lastCommittedHeader.oldestVersion); delayedFreeList.pushBack({ v, pageID }); } @@ -2818,7 +2880,7 @@ public: toString(pageID).c_str(), toString(newID).c_str(), v, - pLastCommittedHeader->oldestVersion); + lastCommittedHeader.oldestVersion); iLast->second = invalidLogicalPageID; remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID }); } else { @@ -2827,7 +2889,7 @@ public: toString(pageID).c_str(), toString(newID).c_str(), v, - pLastCommittedHeader->oldestVersion); + lastCommittedHeader.oldestVersion); // Mark id as converted to its last remapped location as of v i->second[v] = 0; remapQueue.pushBack(RemappedPage{ v, pageID, 0 }); @@ -2844,7 +2906,7 @@ public: filename.c_str(), toString(pageID).c_str(), v, - pLastCommittedHeader->oldestVersion); + lastCommittedHeader.oldestVersion); remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID }); // A freed page is unlikely to be read again soon so prioritize its cache eviction @@ -2872,14 
+2934,15 @@ public: } void freeExtent(LogicalPageID pageID) override { freeExtent_impl(this, pageID); } - ACTOR static Future readPhysicalPage_impl(DWALPager* self, - uint8_t* data, - int blockSize, - int64_t offset, - int priority) { + ACTOR static Future readPhysicalBlock(DWALPager* self, + uint8_t* data, + int blockSize, + int64_t offset, + int priority) { state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority))); - int reader = wait(self->pageFile->read(data, blockSize, offset)); - return reader; + ++g_redwoodMetrics.metric.pagerDiskRead; + int bytes = wait(self->pageFile->read(data, blockSize, offset)); + return bytes; } // Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock @@ -2891,50 +2954,59 @@ public: bool header) { ASSERT(!self->memoryOnly); - // if (g_network->getCurrentTask() > TaskPriority::DiskRead) { - // wait(delay(0, TaskPriority::DiskRead)); - // } - state Reference page = - header ? Reference(new ArenaPage(smallestPhysicalBlock, smallestPhysicalBlock)) - : self->newPageBuffer(); - debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p\n", + header ? makeReference(smallestPhysicalBlock, smallestPhysicalBlock) : self->newPageBuffer(); + debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p header=%d\n", self->filename.c_str(), toString(pageID).c_str(), - page->begin()); + page->rawData(), + header); - state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority))); - ++g_redwoodMetrics.metric.pagerDiskRead; - - // TODO: Could a dispatched read try to write to page after it has been destroyed if this actor is cancelled? - int blockSize = header ? smallestPhysicalBlock : self->physicalPageSize; - int readBytes = wait(self->pageFile->read(page->mutate(), blockSize, (int64_t)pageID * blockSize)); - debug_printf("DWALPager(%s) op=readPhysicalComplete %s ptr=%p bytes=%d\n", + int readBytes = wait(readPhysicalBlock(self, + page->rawData(), + page->rawSize(), + (int64_t)pageID * page->rawSize(), + std::min(priority, ioMaxPriority))); + debug_printf("DWALPager(%s) op=readPhysicalDiskReadComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), - page->begin(), + page->rawData(), readBytes); - // Header reads are checked explicitly during recovery - if (!header) { - if (!page->verifyChecksum(pageID)) { - debug_printf( - "DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageID).c_str()); - Error e = checksum_failed(); - if (g_network->isSimulated() && g_simulator.checkInjectedCorruption()) - e = e.asInjectedFault(); - TraceEvent(SevError, "RedwoodChecksumFailed") - .error(e) - .detail("Filename", self->filename.c_str()) - .detail("PageID", pageID) - .detail("PageSize", self->physicalPageSize) - .detail("Offset", pageID * self->physicalPageSize) - .detail("CalculatedChecksum", page->calculateChecksum(pageID)) - .detail("ChecksumInPage", page->getChecksum()); - ASSERT(false); - throw e; + try { + page->postReadHeader(pageID); + if (page->isEncrypted()) { + EncryptionKey k = wait(self->keyProvider->getSecrets(page->encryptionKey)); + page->encryptionKey = k; } + page->postReadPayload(pageID); + debug_printf("DWALPager(%s) op=readPhysicalVerified %s ptr=%p\n", + self->filename.c_str(), + toString(pageID).c_str(), + page->rawData()); + } catch (Error& e) { + Error err = e; + if (g_network->isSimulated() && g_simulator.checkInjectedCorruption()) { + err = err.asInjectedFault(); + } + + // For header pages, 
event is a warning because the primary header could be read after an unsync'd write, + // but no other page can. + TraceEvent(header ? SevWarnAlways : SevError, "RedwoodPageError") + .error(err) + .detail("Filename", self->filename.c_str()) + .detail("PageID", pageID) + .detail("PageSize", self->physicalPageSize) + .detail("Offset", pageID * self->physicalPageSize); + + debug_printf("DWALPager(%s) postread failed for %s with %s\n", + self->filename.c_str(), + toString(pageID).c_str(), + err.what()); + + throw err; } + return page; } @@ -2948,49 +3020,61 @@ public: // } state Reference page = self->newPageBuffer(pageIDs.size()); - debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p\n", + debug_printf("DWALPager(%s) op=readPhysicalMultiStart %s ptr=%p\n", self->filename.c_str(), toString(pageIDs).c_str(), - page->begin()); + page->rawData()); - ++g_redwoodMetrics.metric.pagerDiskRead; // TODO: Could a dispatched read try to write to page after it has been destroyed if this actor is cancelled? state int blockSize = self->physicalPageSize; - state uint8_t* data = page->mutate(); std::vector> reads; for (int i = 0; i < pageIDs.size(); ++i) { - reads.push_back(readPhysicalPage_impl(self, data, blockSize, ((int64_t)pageIDs[i]) * blockSize, priority)); - data += blockSize; + reads.push_back(readPhysicalBlock( + self, page->rawData() + (i * blockSize), blockSize, ((int64_t)pageIDs[i]) * blockSize, priority)); } // wait for all the parallel read futures wait(waitForAll(reads)); - debug_printf("DWALPager(%s) op=readPhysicalComplete %s ptr=%p bytes=%d\n", + debug_printf("DWALPager(%s) op=readPhysicalMultiDiskReadsComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), toString(pageIDs).c_str(), - page->begin(), + page->rawData(), pageIDs.size() * blockSize); - // Header reads are checked explicitly during recovery - if (!page->verifyChecksum(pageIDs.front())) { - debug_printf("DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageIDs).c_str()); - Error e = checksum_failed(); - TraceEvent(SevError, "RedwoodChecksumFailed") + + try { + page->postReadHeader(pageIDs.front()); + if (page->isEncrypted()) { + EncryptionKey k = wait(self->keyProvider->getSecrets(page->encryptionKey)); + page->encryptionKey = k; + } + page->postReadPayload(pageIDs.front()); + debug_printf("DWALPager(%s) op=readPhysicalVerified %s ptr=%p bytes=%d\n", + self->filename.c_str(), + toString(pageIDs).c_str(), + page->rawData(), + pageIDs.size() * blockSize); + } catch (Error& e) { + // For header pages, error is a warning because recovery may still be possible + TraceEvent(SevError, "RedwoodPageError") .error(e) .detail("Filename", self->filename.c_str()) - .detail("PageID", pageIDs) - .detail("PageSize", self->physicalPageSize) - .detail("Offset", pageIDs.front() * self->physicalPageSize) - .detail("CalculatedChecksum", page->calculateChecksum(pageIDs.front())) - .detail("ChecksumInPage", page->getChecksum()); - ASSERT(false); - throw e; + .detail("PageIDs", pageIDs) + .detail("PageSize", self->physicalPageSize); + + debug_printf("DWALPager(%s) postread failed for %s with %s\n", + self->filename.c_str(), + toString(pageIDs).c_str(), + e.what()); + + throw; } + return page; } - static Future> readHeaderPage(DWALPager* self, PhysicalPageID pageID) { - debug_printf("DWALPager(%s) readHeaderPage %s\n", self->filename.c_str(), toString(pageID).c_str()); - return readPhysicalPage(self, pageID, ioMaxPriority, true); + Future> readHeaderPage(PhysicalPageID pageID) { + debug_printf("DWALPager(%s) 
readHeaderPage %s\n", filename.c_str(), toString(pageID).c_str()); + return readPhysicalPage(this, pageID, ioMaxPriority, true); } // Reads the most recent version of pageID, either previously committed or written using updatePage() @@ -3003,7 +3087,11 @@ public: bool noHit) override { // Use cached page if present, without triggering a cache hit. // Otherwise, read the page and return it but don't add it to the cache - debug_printf("DWALPager(%s) op=read %s noHit=%d\n", filename.c_str(), toString(pageID).c_str(), noHit); + debug_printf("DWALPager(%s) op=read %s reason=%s noHit=%d\n", + filename.c_str(), + toString(pageID).c_str(), + PagerEventReasonsStrings[(int)reason], + noHit); auto& eventReasons = g_redwoodMetrics.level(level).metrics.events; eventReasons.addEventReason(PagerEvents::CacheLookup, reason); if (!cacheable) { @@ -3042,13 +3130,17 @@ public: Future> readMultiPage(PagerEventReasons reason, unsigned int level, - Standalone> pageIDs, + VectorRef pageIDs, int priority, bool cacheable, bool noHit) override { // Use cached page if present, without triggering a cache hit. // Otherwise, read the page and return it but don't add it to the cache - debug_printf("DWALPager(%s) op=read %s noHit=%d\n", filename.c_str(), toString(pageIDs).c_str(), noHit); + debug_printf("DWALPager(%s) op=read %s reason=%s noHit=%d\n", + filename.c_str(), + toString(pageIDs).c_str(), + PagerEventReasonsStrings[(int)reason], + noHit); auto& eventReasons = g_redwoodMetrics.level(level).metrics.events; eventReasons.addEventReason(PagerEvents::CacheLookup, reason); if (!cacheable) { @@ -3126,24 +3218,6 @@ public: return readPage(reason, level, physicalID, priority, cacheable, noHit); } - Future> readMultiPageAtVersion(PagerEventReasons reason, - unsigned int level, - VectorRef logicalIDs, - int priority, - Version v, - bool cacheable, - bool noHit) { - Standalone> ids; - ids.resize(ids.arena(), logicalIDs.size()); - for (int i = 0; i < logicalIDs.size(); ++i) { - ids[i] = getPhysicalPageID(logicalIDs[i], v); - } - debug_printf("op=readMultiPageAtVersion, from logicalIDs %s to phsyicalIDs %s\n", - toString(logicalIDs).c_str(), - toString(ids).c_str()); - return readMultiPage(reason, level, ids, priority, cacheable, noHit); - } - void releaseExtentReadLock() override { concurrentExtentReads->release(); } // Read the physical extent at given pageID @@ -3161,24 +3235,26 @@ public: } // readSize may not be equal to the physical extent size (for the first and last extents) - if (!readSize) + // but otherwise use the full physical extent size + if (readSize == 0) { readSize = self->physicalExtentSize; + } - state Reference extent = Reference(new ArenaPage(self->logicalPageSize, readSize)); + state Reference extent = makeReference(readSize, readSize); // physicalReadSize is the size of disk read we intend to issue auto physicalReadSize = SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_READ_SIZE; auto parallelReads = readSize / physicalReadSize; auto lastReadSize = readSize % physicalReadSize; - debug_printf( - "DWALPager(%s) op=readPhysicalExtentStart %s readSize %d offset %d physicalReadSize %d parallelReads %d\n", - self->filename.c_str(), - toString(pageID).c_str(), - readSize, - (int64_t)pageID * (self->physicalPageSize), - physicalReadSize, - parallelReads); + debug_printf("DWALPager(%s) op=readPhysicalExtentStart %s readSize %d offset %" PRId64 + " physicalReadSize %d parallelReads %d\n", + self->filename.c_str(), + toString(pageID).c_str(), + readSize, + (int64_t)pageID * (self->physicalPageSize), + physicalReadSize, + 
parallelReads); // we split the extent read into a number of parallel disk reads based on the determined physical // disk read size. All those reads are issued in parallel and their futures are stored into the following @@ -3189,23 +3265,23 @@ public: int64_t currentOffset; for (i = 0; i < parallelReads; i++) { currentOffset = i * physicalReadSize; - debug_printf("DWALPager(%s) current offset %d\n", self->filename.c_str(), currentOffset); + debug_printf("DWALPager(%s) current offset %" PRId64 "\n", self->filename.c_str(), currentOffset); ++g_redwoodMetrics.metric.pagerDiskRead; reads.push_back( - self->pageFile->read(extent->mutate() + currentOffset, physicalReadSize, startOffset + currentOffset)); + self->pageFile->read(extent->rawData() + currentOffset, physicalReadSize, startOffset + currentOffset)); } // Handle the last read separately as it may be smaller than physicalReadSize if (lastReadSize) { currentOffset = i * physicalReadSize; - debug_printf("DWALPager(%s) iter %d current offset %d lastReadSize %d\n", + debug_printf("DWALPager(%s) iter %d current offset %" PRId64 " lastReadSize %d\n", self->filename.c_str(), i, currentOffset, lastReadSize); ++g_redwoodMetrics.metric.pagerDiskRead; reads.push_back( - self->pageFile->read(extent->mutate() + currentOffset, lastReadSize, startOffset + currentOffset)); + self->pageFile->read(extent->rawData() + currentOffset, lastReadSize, startOffset + currentOffset)); } // wait for all the parallel read futures for the given extent @@ -3214,7 +3290,7 @@ public: debug_printf("DWALPager(%s) op=readPhysicalExtentComplete %s ptr=%p bytes=%d file offset=%d\n", self->filename.c_str(), toString(pageID).c_str(), - extent->begin(), + extent->rawData(), readSize, (pageID * self->physicalPageSize)); @@ -3232,8 +3308,8 @@ public: } eventReasons.addEventReason(PagerEvents::CacheLookup, PagerEventReasons::MetaData); - LogicalPageID headPageID = pHeader->remapQueue.headPageID; - LogicalPageID tailPageID = pHeader->remapQueue.tailPageID; + LogicalPageID headPageID = header.remapQueue.headPageID; + LogicalPageID tailPageID = header.remapQueue.tailPageID; int readSize = physicalExtentSize; bool headExt = false; bool tailExt = false; @@ -3278,34 +3354,30 @@ public: // Get snapshot as of the most recent committed version of the pager Reference<IPagerSnapshot> getReadSnapshot(Version v) override; void addSnapshot(Version version, KeyRef meta) { - ASSERT(snapshots.empty() || snapshots.back().version != version); + if (snapshots.empty()) { + oldestSnapshotVersion = version; + } else { + ASSERT(snapshots.back().version != version); + } - Promise<Void> expired; - snapshots.push_back( - { version, expired, makeReference<DWALPagerSnapshot>(this, meta, version, expired.getFuture()) }); + snapshots.push_back({ version, makeReference<DWALPagerSnapshot>(this, meta, version) }); } // Set the pending oldest version to keep as of the next commit void setOldestReadableVersion(Version v) override { - ASSERT(v >= pHeader->oldestVersion); - ASSERT(v <= pHeader->committedVersion); - pHeader->oldestVersion = v; + ASSERT(v >= header.oldestVersion); + ASSERT(v <= header.committedVersion); + header.oldestVersion = v; expireSnapshots(v); }; // Get the oldest *readable* version, which is not the same as the oldest retained version as the version // returned could have been set as the oldest version in the pending commit - Version getOldestReadableVersion() const override { return pHeader->oldestVersion; }; + Version getOldestReadableVersion() const override { return header.oldestVersion; }; // Calculate the *effective* oldest version, which can be
older than the one set in the last commit since we // are allowing active snapshots to temporarily delay page reuse. - Version effectiveOldestVersion() { - if (snapshots.empty()) { - debug_printf("DWALPager(%s) snapshots list empty\n", filename.c_str()); - return pLastCommittedHeader->oldestVersion; - } - return std::min(pLastCommittedHeader->oldestVersion, snapshots.front().version); - } + Version effectiveOldestVersion() { return std::min(lastCommittedHeader.oldestVersion, oldestSnapshotVersion); } ACTOR static Future removeRemapEntry(DWALPager* self, RemappedPage p, Version oldestRetainedVersion) { // Get iterator to the versioned page map entry for the original page @@ -3481,7 +3553,7 @@ public: : SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_TOLERANCE_RATIO; // For simplicity, we assume each entry in the remap queue corresponds to one remapped page. uint64_t remapCleanupWindowEntries = - static_cast(self->remapCleanupWindowBytes / self->pHeader->pageSize); + static_cast(self->remapCleanupWindowBytes / self->header.pageSize); state uint64_t minRemapEntries = static_cast(remapCleanupWindowEntries * (1.0 - toleranceRatio)); state uint64_t maxRemapEntries = static_cast(remapCleanupWindowEntries * (1.0 + toleranceRatio)); @@ -3491,7 +3563,7 @@ public: self->filename.c_str(), oldestRetainedVersion, self->remapCleanupWindowBytes, - self->pHeader->pageSize, + self->header.pageSize, minRemapEntries, maxRemapEntries, self->remapQueue.numEntries); @@ -3592,11 +3664,11 @@ public: return Void(); } - ACTOR static Future commit_impl(DWALPager* self, Version v) { + ACTOR static Future commit_impl(DWALPager* self, Version v, Value commitRecord) { debug_printf("DWALPager(%s) commit begin %s\n", self->filename.c_str(), ::toString(v).c_str()); // Write old committed header to Page 1 - self->writeHeaderPage(1, self->lastCommittedHeaderPage); + self->writeHeaderPage(backupHeaderPageID, self->lastCommittedHeaderPage); // Trigger the remap eraser to stop and then wait for it. self->remapCleanupStop = true; @@ -3604,12 +3676,12 @@ public: wait(flushQueues(self)); - self->pHeader->committedVersion = v; - self->pHeader->remapQueue = self->remapQueue.getState(); - self->pHeader->extentFreeList = self->extentFreeList.getState(); - self->pHeader->extentUsedList = self->extentUsedList.getState(); - self->pHeader->freeList = self->freeList.getState(); - self->pHeader->delayedFreeList = self->delayedFreeList.getState(); + self->header.committedVersion = v; + self->header.remapQueue = self->remapQueue.getState(); + self->header.extentFreeList = self->extentFreeList.getState(); + self->header.extentUsedList = self->extentUsedList.getState(); + self->header.freeList = self->freeList.getState(); + self->header.delayedFreeList = self->delayedFreeList.getState(); // Wait for all outstanding writes to complete debug_printf("DWALPager(%s) waiting for outstanding writes\n", self->filename.c_str()); @@ -3626,11 +3698,15 @@ public: wait(self->pageFile->sync()); debug_printf("DWALPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), - self->pHeader->committedVersion); + self->header.committedVersion); } - // Update header on disk and sync again. - wait(self->writeHeaderPage(0, self->headerPage)); + // Update new commit header to the primary header page + self->header.userCommitRecord = commitRecord; + self->updateHeaderPage(); + + // Update primary header page on disk and sync again. 
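[Illustrative sketch, not part of this patch.] commit_impl() orders its I/O so that a crash at any point leaves at least one valid header on disk: the previous committed header goes to the backup slot, all data pages are flushed and synced, and only then is the new primary header written and synced. A schematic of that sequence with hypothetical helpers (the real code also stops remap cleanup and flushes the FIFO queues between these steps):

    #include <functional>

    struct PagerIO {
        std::function<void(int, const void*)> writeHeader; // (slot, header)
        std::function<void()> sync;                        // durability barrier
    };

    void commitSequence(PagerIO& io, const void* oldHeader, const void* newHeader,
                        const std::function<void()>& flushDataPages) {
        io.writeHeader(1, oldHeader); // backup slot: last committed header
        flushDataPages();             // queue flushes + outstanding page writes
        io.sync();                    // sync 1: data and backup header durable
        io.writeHeader(0, newHeader); // overwrite the primary header
        io.sync();                    // sync 2: this is the commit point
    }

The patch resumes below with that primary header write (writeHeaderPage(primaryHeaderPageID, ...)).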
+ wait(self->writeHeaderPage(primaryHeaderPageID, self->headerPage)); if (g_network->getCurrentTask() > TaskPriority::DiskWrite) { wait(delay(0, TaskPriority::DiskWrite)); } @@ -3639,16 +3715,16 @@ public: wait(self->pageFile->sync()); debug_printf("DWALPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), - self->pHeader->committedVersion); + self->header.committedVersion); } // Update the last committed header for use in the next commit. - self->updateCommittedHeader(); - self->addSnapshot(v, self->pHeader->getMetaKey()); + self->updateLastCommittedHeader(); + self->addSnapshot(v, self->header.userCommitRecord); // Try to expire snapshots up to the oldest version, in case some were being kept around due to being in use, // because maybe some are no longer in use. - self->expireSnapshots(self->pHeader->oldestVersion); + self->expireSnapshots(self->header.oldestVersion); // Start unmapping pages for expired versions self->remapCleanupFuture = remapCleanup(self); @@ -3659,17 +3735,15 @@ public: return Void(); } - Future commit(Version v) override { + Future commit(Version v, Value commitRecord) override { // Can't have more than one commit outstanding. ASSERT(commitFuture.isReady()); - ASSERT(v > pLastCommittedHeader->committedVersion); - commitFuture = forwardError(commit_impl(this, v), errorPromise); + ASSERT(v > lastCommittedHeader.committedVersion); + commitFuture = forwardError(commit_impl(this, v, commitRecord), errorPromise); return commitFuture; } - Key getMetaKey() const override { return pHeader->getMetaKey(); } - - void setMetaKey(KeyRef metaKey) override { pHeader->setMetaKey(metaKey); } + Value getCommitRecord() const override { return lastCommittedHeader.userCommitRecord; } ACTOR void shutdown(DWALPager* self, bool dispose) { debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str()); @@ -3729,7 +3803,7 @@ public: } else { g_network->getDiskBytes(parentDirectory(filename), free, total); } - int64_t pagerSize = pHeader->pageCount * physicalPageSize; + int64_t pagerSize = header.pageCount * physicalPageSize; // It is not exactly known how many pages on the delayed free list are usable as of right now. 
It could be // known, if each commit delayed entries that were freeable were shuffled from the delayed free queue to the @@ -3749,7 +3823,7 @@ public: } int64_t getPageCacheCount() override { return pageCache.getCount(); } - int64_t getPageCount() override { return pHeader->pageCount; } + int64_t getPageCount() override { return header.pageCount; } int64_t getExtentCacheCount() override { return extentCache.getCount(); } ACTOR static Future getUserPageCount_cleanup(DWALPager* self) { @@ -3772,7 +3846,7 @@ public: Future getUserPageCount() override { return map(getUserPageCount_cleanup(this), [=](Void) { int64_t userPages = - pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - + header.pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries - ((((remapQueue.numPages - 1) / pagesPerExtent) + 1) * pagesPerExtent) - extentFreeList.numPages - (pagesPerExtent * extentFreeList.numEntries) - extentUsedList.numPages; @@ -3781,7 +3855,7 @@ public: " remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n", filename.c_str(), userPages, - pHeader->pageCount, + header.pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, @@ -3794,7 +3868,7 @@ public: Future init() override { return recoverFuture; } - Version getLastCommittedVersion() const override { return pLastCommittedHeader->committedVersion; } + Version getLastCommittedVersion() const override { return lastCommittedHeader.committedVersion; } private: ~DWALPager() {} @@ -3802,40 +3876,43 @@ private: // Try to expire snapshots up to but not including v, but do not expire any snapshots that are in use. void expireSnapshots(Version v); -#pragma pack(push, 1) // Header is the format of page 0 of the database - struct Header { - static constexpr int FORMAT_VERSION = 8; + struct PagerCommitHeader { + constexpr static FileIdentifier file_identifier = 11836690; + constexpr static unsigned int FORMAT_VERSION = 10; + uint16_t formatVersion; uint32_t queueCount; uint32_t pageSize; int64_t pageCount; uint32_t extentSize; + Version committedVersion; + Version oldestVersion; + Value userCommitRecord; FIFOQueue::QueueState freeList; FIFOQueue::QueueState extentFreeList; // free list for extents FIFOQueue::QueueState extentUsedList; // in-use list for extents FIFOQueue::QueueState delayedFreeList; FIFOQueue::QueueState remapQueue; - Version committedVersion; - Version oldestVersion; - int32_t metaKeySize; - KeyRef getMetaKey() const { return KeyRef((const uint8_t*)(this + 1), metaKeySize); } - - void setMetaKey(StringRef key) { - ASSERT(key.size() < (smallestPhysicalBlock - sizeof(Header))); - metaKeySize = key.size(); - if (key.size() > 0) { - memcpy(this + 1, key.begin(), key.size()); - } + template + void serialize(Ar& ar) { + serializer(ar, + formatVersion, + queueCount, + pageSize, + pageCount, + extentSize, + committedVersion, + oldestVersion, + userCommitRecord, + freeList, + extentFreeList, + extentUsedList, + delayedFreeList, + remapQueue); } - - int size() const { return sizeof(Header) + metaKeySize; } - - private: - Header(); }; -#pragma pack(pop) ACTOR static Future clearRemapQueue_impl(DWALPager* self) { // Wait for outstanding commit. 
@@ -3850,7 +3927,7 @@ private: state int attempt = 0; for (attempt = 0; attempt < 2; attempt++) { self->setOldestReadableVersion(self->getLastCommittedVersion()); - wait(self->commit(self->getLastCommittedVersion() + 1)); + wait(self->commit(self->getLastCommittedVersion() + 1, self->getCommitRecord())); } ASSERT(self->remapQueue.numEntries == 0); @@ -3870,7 +3947,7 @@ private: // Physical page sizes will always be a multiple of 4k because AsyncFileNonDurable requires // this in simulation, and it also makes sense for current SSDs. // Allowing a smaller 'logical' page size is very useful for testing. - static constexpr int smallestPhysicalBlock = 4096; + constexpr static int smallestPhysicalBlock = 4096; int physicalPageSize; int logicalPageSize; // In simulation testing it can be useful to use a small logical page size @@ -3878,13 +3955,18 @@ private: int physicalExtentSize; int pagesPerExtent; + std::shared_ptr keyProvider; + PriorityMultiLock ioLock; int64_t pageCacheBytes; // The header will be written to / read from disk as a smallestPhysicalBlock sized chunk. Reference headerPage; - Header* pHeader; + PagerCommitHeader header; + + Reference lastCommittedHeaderPage; + PagerCommitHeader lastCommittedHeader; // Pages - pages known to be in the file, truncations complete to that size int64_t filePageCount; @@ -3897,9 +3979,6 @@ private: int desiredExtentSize; Version recoveryVersion; - Reference lastCommittedHeaderPage; - Header* pLastCommittedHeader; - std::string filename; bool memoryOnly; @@ -3935,7 +4014,6 @@ private: struct SnapshotEntry { Version version; - Promise expired; Reference snapshot; }; @@ -3948,14 +4026,15 @@ private: // TODO: Better data structure PageToVersionedMapT remappedPages; + // Readable snapshots in version order std::deque snapshots; + Version oldestSnapshotVersion; }; // Prevents pager from reusing freed pages from version until the snapshot is destroyed class DWALPagerSnapshot : public IPagerSnapshot, public ReferenceCounted { public: - DWALPagerSnapshot(DWALPager* pager, Key meta, Version version, Future expiredFuture) - : pager(pager), expired(expiredFuture), version(version), metaKey(meta) {} + DWALPagerSnapshot(DWALPager* pager, Key meta, Version version) : pager(pager), version(version), metaKey(meta) {} ~DWALPagerSnapshot() override {} Future> getPhysicalPage(PagerEventReasons reason, @@ -3964,23 +4043,19 @@ public: int priority, bool cacheable, bool noHit) override { - if (expired.isError()) { - throw expired.getError(); - } + return map(pager->readPageAtVersion(reason, level, pageID, priority, version, cacheable, noHit), [=](Reference p) { return Reference(std::move(p)); }); } Future> getMultiPhysicalPage(PagerEventReasons reason, unsigned int level, - VectorRef pageIDs, + VectorRef pageIDs, int priority, bool cacheable, bool noHit) override { - if (expired.isError()) { - throw expired.getError(); - } - return map(pager->readMultiPageAtVersion(reason, level, pageIDs, priority, version, cacheable, noHit), + + return map(pager->readMultiPage(reason, level, pageIDs, priority, cacheable, noHit), [=](Reference p) { return Reference(std::move(p)); }); } @@ -3993,7 +4068,6 @@ public: void delref() override { ReferenceCounted::delref(); } DWALPager* pager; - Future expired; Version version; Key metaKey; }; @@ -4003,22 +4077,21 @@ void DWALPager::expireSnapshots(Version v) { filename.c_str(), v, (int)snapshots.size()); + + // While there is more than one snapshot and the front snapshot is older than v and has no other reference holders while (snapshots.size() 
> 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) { debug_printf("DWALPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), snapshots.front().version, snapshots.front().snapshot->isSoleOwner()); - // The snapshot contract could be made such that the expired promise isn't need anymore. In practice it - // probably is already not needed but it will gracefully handle the case where a user begins a page read - // with a snapshot reference, keeps the page read future, and drops the snapshot reference. - snapshots.front().expired.sendError(transaction_too_old()); + + // Expire the snapshot and update the oldest snapshot version snapshots.pop_front(); + oldestSnapshotVersion = snapshots.front().version; } } Reference DWALPager::getReadSnapshot(Version v) { - ASSERT(!snapshots.empty()); - auto i = std::upper_bound(snapshots.begin(), snapshots.end(), v, SnapshotEntryLessThanVersion()); if (i == snapshots.begin()) { throw version_invalid(); @@ -4119,12 +4192,13 @@ struct SplitStringRef { } }; -// A BTree "page id" is actually a list of LogicalPageID's whose contents should be concatenated together. -// NOTE: Uses host byte order -typedef VectorRef BTreePageIDRef; +// A BTree node link is a list of LogicalPageID's whose contents should be concatenated together. +typedef VectorRef BTreeNodeLinkRef; +typedef Standalone BTreeNodeLink; + constexpr LogicalPageID maxPageID = (LogicalPageID)-1; -std::string toString(BTreePageIDRef id) { +std::string toString(BTreeNodeLinkRef id) { return std::string("BTreePageID") + toString(id.begin(), id.end()); } @@ -4149,20 +4223,20 @@ struct RedwoodRecordRef { // RedwoodRecordRefs are used for both internal and leaf pages of the BTree. // Boundary records in internal pages are made from leaf records. // These functions make creating and working with internal page records more convenient. 
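[Illustrative sketch, not part of this patch.] As the getChildPage()/setChildPage() hunks just below show, an internal record's value is nothing more than the child node's page ID list reinterpreted as bytes (host byte order, per the comment being removed above). The round trip, with simplified standalone types:

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    typedef uint32_t LogicalPageID;

    std::string packChildLink(const std::vector<LogicalPageID>& ids) {
        return std::string(reinterpret_cast<const char*>(ids.data()),
                           ids.size() * sizeof(LogicalPageID));
    }

    std::vector<LogicalPageID> unpackChildLink(const std::string& value) {
        assert(value.size() % sizeof(LogicalPageID) == 0);
        std::vector<LogicalPageID> ids(value.size() / sizeof(LogicalPageID));
        std::memcpy(ids.data(), value.data(), value.size());
        return ids;
    }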
- inline BTreePageIDRef getChildPage() const { + inline BTreeNodeLinkRef getChildPage() const { ASSERT(value.present()); - return BTreePageIDRef((LogicalPageID*)value.get().begin(), value.get().size() / sizeof(LogicalPageID)); + return BTreeNodeLinkRef((LogicalPageID*)value.get().begin(), value.get().size() / sizeof(LogicalPageID)); } - inline void setChildPage(BTreePageIDRef id) { + inline void setChildPage(BTreeNodeLinkRef id) { value = ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)); } - inline void setChildPage(Arena& arena, BTreePageIDRef id) { + inline void setChildPage(Arena& arena, BTreeNodeLinkRef id) { value = ValueRef(arena, (const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID)); } - inline RedwoodRecordRef withPageID(BTreePageIDRef id) const { + inline RedwoodRecordRef withPageID(BTreeNodeLinkRef id) const { return RedwoodRecordRef(key, ValueRef((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID))); } @@ -4242,7 +4316,7 @@ struct RedwoodRecordRef { } LengthFormat3; }; - static constexpr int LengthFormatSizes[] = { sizeof(LengthFormat0), + constexpr static int LengthFormatSizes[] = { sizeof(LengthFormat0), sizeof(LengthFormat1), sizeof(LengthFormat2), sizeof(LengthFormat3) }; @@ -4595,26 +4669,31 @@ struct BTreePage { #pragma pack(push, 1) struct { + // treeOffset allows for newer versions to have additional fields but older code to read them + uint8_t treeOffset; uint8_t height; uint32_t kvBytes; }; + #pragma pack(pop) - int size() const { - const BinaryTree* t = tree(); - return (uint8_t*)t - (uint8_t*)this + t->size(); + void init(unsigned int height, unsigned int kvBytes) { + treeOffset = sizeof(BTreePage); + this->height = height; + this->kvBytes = kvBytes; } + int size() const { return treeOffset + tree()->size(); } + + uint8_t* treeBuffer() const { return (uint8_t*)this + treeOffset; } + BinaryTree* tree() { return (BinaryTree*)treeBuffer(); } + BinaryTree* tree() const { return (BinaryTree*)treeBuffer(); } + ValueTree* valueTree() const { return (ValueTree*)treeBuffer(); } + bool isLeaf() const { return height == 1; } - BinaryTree* tree() { return (BinaryTree*)(this + 1); } - - BinaryTree* tree() const { return (BinaryTree*)(this + 1); } - - ValueTree* valueTree() const { return (ValueTree*)(this + 1); } - std::string toString(const char* context, - BTreePageIDRef id, + BTreeNodeLinkRef id, Version ver, const RedwoodRecordRef& lowerBound, const RedwoodRecordRef& upperBound) const { @@ -4634,8 +4713,7 @@ struct BTreePage { if (tree()->numItems > 0) { // This doesn't use the cached reader for the page because it is only for debugging purposes, // a cached reader may not exist - BinaryTree::DecodeCache cache(lowerBound, upperBound); - BinaryTree::Cursor c(&cache, tree()); + BinaryTree::Cursor c(makeReference(lowerBound, upperBound), tree()); c.moveFirst(); ASSERT(c.valid()); @@ -4684,32 +4762,6 @@ struct BoundaryRefAndPage { } }; -#pragma pack(push, 1) -template -struct InPlaceArray { - SizeT count; - - const T* begin() const { return (T*)(this + 1); } - - T* begin() { return (T*)(this + 1); } - - const T* end() const { return begin() + count; } - - T* end() { return begin() + count; } - - VectorRef get() { return VectorRef(begin(), count); } - - void set(VectorRef v, int availableSpace) { - ASSERT(sizeof(T) * v.size() <= availableSpace); - count = v.size(); - memcpy(begin(), v.begin(), sizeof(T) * v.size()); - } - - int size() const { return count; } - int sizeBytes() const { return count * sizeof(T); } -}; -#pragma pack(pop) - // 
DecodeBoundaryVerifier provides simulation-only verification of DeltaTree boundaries between // reads and writes by using a static structure to track boundaries used during DeltaTree generation // for all writes and updates across cold starts and virtual process restarts. @@ -4751,7 +4803,7 @@ struct DecodeBoundaryVerifier { return boundarySamples[deterministicRandom()->randomInt(0, boundarySamples.size())]; } - void update(BTreePageIDRef id, Version v, Key lowerBound, Key upperBound) { + void update(BTreeNodeLinkRef id, Version v, Key lowerBound, Key upperBound) { sampleBoundary(lowerBound); sampleBoundary(upperBound); debug_printf("decodeBoundariesUpdate %s %s '%s' to '%s'\n", @@ -4834,7 +4886,7 @@ public: struct LazyClearQueueEntry { uint8_t height; Version version; - Standalone pageID; + BTreeNodeLink pageID; bool operator<(const LazyClearQueueEntry& rhs) const { return version < rhs.version; } @@ -4844,7 +4896,7 @@ public: version = *(Version*)src; src += sizeof(Version); int count = *src++; - pageID = BTreePageIDRef((LogicalPageID*)src, count); + pageID = BTreeNodeLinkRef((LogicalPageID*)src, count); return bytesNeeded(); } @@ -4895,31 +4947,30 @@ public: typedef std::unordered_map ParentInfoMapT; -#pragma pack(push, 1) - struct MetaKey { - static constexpr int FORMAT_VERSION = 15; + struct BTreeCommitHeader { + constexpr static FileIdentifier file_identifier = 10847329; + constexpr static unsigned int FORMAT_VERSION = 17; + // This serves as the format version for the entire tree, individual pages will not be versioned - uint16_t formatVersion; + uint32_t formatVersion; + EncodingType encodingType; uint8_t height; LazyClearQueueT::QueueState lazyDeleteQueue; - InPlaceArray root; - - KeyRef asKeyRef() const { return KeyRef((uint8_t*)this, sizeof(MetaKey) + root.sizeBytes()); } - - void fromKeyRef(KeyRef k) { - memcpy(this, k.begin(), k.size()); - ASSERT(formatVersion == FORMAT_VERSION); - } + BTreeNodeLink root; std::string toString() { return format("{formatVersion=%d height=%d root=%s lazyDeleteQueue=%s}", (int)formatVersion, (int)height, - ::toString(root.get()).c_str(), + ::toString(root).c_str(), lazyDeleteQueue.toString().c_str()); } + + template + void serialize(Ar& ar) { + serializer(ar, formatVersion, encodingType, height, lazyDeleteQueue, root); + } }; -#pragma pack(pop) // All async opts on the btree are based on pager reads, writes, and commits, so // we can mostly forward these next few functions to the pager @@ -4978,8 +5029,26 @@ public: Version getLastCommittedVersion() const { return m_pager->getLastCommittedVersion(); } - VersionedBTree(IPager2* pager, std::string name) - : m_pager(pager), m_pBuffer(nullptr), m_mutationCount(0), m_name(name), m_pHeader(nullptr), m_headerSpace(0) { + // VersionedBTree takes ownership of pager + VersionedBTree(IPager2* pager, + std::string name, + EncodingType defaultEncodingType, + std::shared_ptr keyProvider) + : m_pager(pager), m_encodingType(defaultEncodingType), m_enforceEncodingType(false), m_keyProvider(keyProvider), + m_pBuffer(nullptr), m_mutationCount(0), m_name(name) { + + // For encrypted encoding types, enforce that BTree nodes read from disk use the default encoding type + // This prevents an attack where an encrypted page is replaced by an attacker with an unencrypted page + // or an encrypted page fabricated using a compromised scheme. 
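A minimal sketch of the enforcement pairing the comment above describes, using only names that appear in this diff; this is illustrative, not the verbatim implementation. The constructor records that the configured encoding is encrypted, and readPage() (later in this diff) then rejects any node that comes back from disk with a different encoding:

    // Constructor side: an encrypted encoding type implies strict read-side checking
    if (ArenaPage::isEncodingTypeEncrypted(m_encodingType)) {
        ASSERT(keyProvider != nullptr); // cannot decrypt without a key source
        m_enforceEncodingType = true;
    }
    // Read side (see readPage below): a substituted plaintext or re-encoded page fails fast
    if (m_enforceEncodingType && page->getEncodingType() != m_encodingType) {
        throw unexpected_encoding_type();
    }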
+ if (ArenaPage::isEncodingTypeEncrypted(m_encodingType)) { + ASSERT(keyProvider != nullptr); + m_enforceEncodingType = true; + } + + // If key provider isn't given, instantiate the null provider + if (m_keyProvider == nullptr) { + m_keyProvider = std::make_shared(); + } m_pBoundaryVerifier = DecodeBoundaryVerifier::getVerifier(name); m_pDecodeCacheMemory = m_pager->getPageCachePenaltySource(); @@ -4988,6 +5057,20 @@ public: m_latestCommit = m_init; } + ACTOR static Future> makeEmptyRoot(VersionedBTree* self) { + state Reference page = self->m_pager->newPageBuffer(); + page->init(self->m_encodingType, PageType::BTreeNode, 1); + if (page->isEncrypted()) { + EncryptionKey k = wait(self->m_keyProvider->getByRange(dbBegin.key, dbEnd.key)); + page->encryptionKey = k; + } + + BTreePage* btpage = (BTreePage*)page->mutateData(); + btpage->init(1, 0); + btpage->tree()->build(page->dataSize() - sizeof(BTreePage), nullptr, nullptr, nullptr, nullptr); + return page; + } + void toTraceEvent(TraceEvent& e) const { m_pager->toTraceEvent(e); m_lazyClearQueue.toTraceEvent(e, "LazyClearQueue"); @@ -5017,7 +5100,8 @@ public: // Start reading the page, without caching entries.emplace_back(q.get(), - self->readPage(PagerEventReasons::LazyClear, + self->readPage(self, + PagerEventReasons::LazyClear, q.get().height, snapshot.getPtr(), q.get().pageID, @@ -5031,7 +5115,7 @@ public: for (i = 0; i < entries.size(); ++i) { Reference p = wait(entries[i].second); const LazyClearQueueEntry& entry = entries[i].first; - const BTreePage& btPage = *(BTreePage*)p->begin(); + const BTreePage& btPage = *(const BTreePage*)p->data(); ASSERT(btPage.height == entry.height); auto& metrics = g_redwoodMetrics.level(entry.height).metrics; @@ -5042,13 +5126,13 @@ public: // Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses // RedwoodRecordRef::DeltaValueOnly as the delta type type to skip key decoding - BTreePage::ValueTree::DecodeCache cache(dbBegin, dbEnd); - BTreePage::ValueTree::Cursor c(&cache, btPage.valueTree()); + BTreePage::ValueTree::Cursor c(makeReference(dbBegin, dbEnd), + btPage.valueTree()); ASSERT(c.moveFirst()); Version v = entry.version; while (1) { if (c.get().value.present()) { - BTreePageIDRef btChildPageID = c.get().getChildPage(); + BTreeNodeLinkRef btChildPageID = c.get().getChildPage(); // If this page is height 2, then the children are leaves so free them directly if (entry.height == 2) { debug_printf("LazyClear: freeing leaf child %s\n", toString(btChildPageID).c_str()); @@ -5098,11 +5182,8 @@ public: ACTOR static Future init_impl(VersionedBTree* self) { wait(self->m_pager->init()); self->m_pBuffer.reset(new MutationBuffer()); - // TODO: Get actual max MetaKey size limit from Pager - self->m_headerSpace = self->m_pager->getUsablePageSize(); - self->m_pHeader = (MetaKey*)new uint8_t[self->m_headerSpace]; - self->m_blockSize = self->m_pager->getUsablePageSize(); + self->m_blockSize = self->m_pager->getLogicalPageSize(); self->m_newOldestVersion = self->m_pager->getOldestReadableVersion(); debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n", @@ -5114,44 +5195,64 @@ public: self->m_pBoundaryVerifier->removeAfterVersion(self->getLastCommittedVersion()); } - state Key meta = self->m_pager->getMetaKey(); - if (meta.size() == 0) { + state Value btreeHeader = self->m_pager->getCommitRecord(); + if (btreeHeader.size() == 0) { // Create new BTree - self->m_pHeader->formatVersion = MetaKey::FORMAT_VERSION; + self->m_header.formatVersion = 
BTreeCommitHeader::FORMAT_VERSION; + self->m_header.encodingType = self->m_encodingType; + self->m_header.height = 1; + LogicalPageID id = wait(self->m_pager->newPageID()); - BTreePageIDRef newRoot((LogicalPageID*)&id, 1); - debug_printf("new root %s\n", toString(newRoot).c_str()); - self->m_pHeader->root.set(newRoot, self->m_headerSpace - sizeof(MetaKey)); - self->m_pHeader->height = 1; - Reference page = self->m_pager->newPageBuffer(); - self->makeEmptyRoot(page); - self->m_pager->updatePage(PagerEventReasons::MetaData, nonBtreeLevel, newRoot, page); + self->m_header.root = BTreeNodeLinkRef((LogicalPageID*)&id, 1); + debug_printf("new root %s\n", toString(self->m_header.root).c_str()); + + Reference page = wait(makeEmptyRoot(self)); + + // Newly allocated page so logical id = physical id and it's a new empty root so no parent + page->setLogicalPageInfo(self->m_header.root.front(), invalidLogicalPageID); + self->m_pager->updatePage(PagerEventReasons::MetaData, nonBtreeLevel, self->m_header.root, page); LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); self->m_lazyClearQueue.create( self->m_pager, newQueuePage, "LazyClearQueue", self->m_pager->newLastQueueID(), false); - self->m_pHeader->lazyDeleteQueue = self->m_lazyClearQueue.getState(); - self->m_pager->setMetaKey(self->m_pHeader->asKeyRef()); + self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState(); debug_printf("BTree created (but not committed)\n"); } else { - self->m_pHeader->fromKeyRef(meta); - self->m_lazyClearQueue.recover(self->m_pager, self->m_pHeader->lazyDeleteQueue, "LazyClearQueueRecovered"); + self->m_header = ObjectReader::fromStringRef(btreeHeader, Unversioned()); + + if (self->m_header.formatVersion != BTreeCommitHeader::FORMAT_VERSION) { + Error e = unsupported_format_version(); + TraceEvent(SevWarn, "RedwoodBTreeVersionUnsupported") + .error(e) + .detail("Version", self->m_header.formatVersion) + .detail("ExpectedVersion", BTreeCommitHeader::FORMAT_VERSION); + throw e; + } + + self->m_lazyClearQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyClearQueueRecovered"); debug_printf("BTree recovered.\n"); + + if (self->m_header.encodingType != self->m_encodingType) { + TraceEvent(SevWarn, "RedwoodBTreeNodeEncodingMismatch") + .detail("InstanceName", self->m_pager->getName()) + .detail("EncodingFound", self->m_header.encodingType) + .detail("EncodingDesired", self->m_encodingType); + } } self->m_lazyClearActor = 0; TraceEvent e(SevInfo, "RedwoodRecoveredBTree"); e.detail("FileName", self->m_name); - e.detail("OpenedExisting", meta.size() != 0); + e.detail("OpenedExisting", btreeHeader.size() != 0); e.detail("LatestVersion", self->m_pager->getLastCommittedVersion()); self->m_lazyClearQueue.toTraceEvent(e, "LazyClearQueue"); e.log(); debug_printf("Recovered btree at version %" PRId64 ": %s\n", self->m_pager->getLastCommittedVersion(), - self->m_pHeader->toString().c_str()); + self->m_header.toString().c_str()); return Void(); } @@ -5164,14 +5265,12 @@ public: // uncommitted writes so it should not be committed. m_init.cancel(); m_latestCommit.cancel(); - - if (m_pHeader != nullptr) { - delete[](uint8_t*) m_pHeader; - } } Future commit(Version v) { return commit_impl(this, v); } + // Clear all btree data, allow pager remap to fully process its queue, and verify final + // page counts in pager and queues. 
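The recovery logic above replaces the old packed MetaKey with a serialized commit header. A condensed sketch of the round trip, assuming the ObjectWriter/ObjectReader flow shown in this diff (the explicit template argument mirrors the assignment above):

    // Commit side: the header travels inside the pager's commit record
    wait(m_pager->commit(writeVersion, ObjectWriter::toValue(m_header, Unversioned())));

    // Recovery side: parse it back, then validate before trusting any field
    BTreeCommitHeader h =
        ObjectReader::fromStringRef<BTreeCommitHeader>(m_pager->getCommitRecord(), Unversioned());
    if (h.formatVersion != BTreeCommitHeader::FORMAT_VERSION) {
        throw unsupported_format_version();
    }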
ACTOR static Future clearAllAndCheckSanity_impl(VersionedBTree* self) { // Clear and commit debug_printf("Clearing tree.\n"); @@ -5197,8 +5296,8 @@ public: ASSERT(s.numPages == 1); // The btree should now be a single non-oversized root page. - ASSERT(self->m_pHeader->height == 1); - ASSERT(self->m_pHeader->root.count == 1); + ASSERT(self->m_header.height == 1); + ASSERT(self->m_header.root.size() == 1); // Let pager do more commits to finish all cleanup of old pages wait(self->m_pager->clearRemapQueue()); @@ -5210,7 +5309,7 @@ public: // From the pager's perspective the only pages that should be in use are the btree root and // the previously mentioned lazy delete queue page. int64_t userPageCount = wait(self->m_pager->getUserPageCount()); - debug_printf("clearAllAndCheckSanity: userPageCount: %d\n", userPageCount); + debug_printf("clearAllAndCheckSanity: userPageCount: %" PRId64 "\n", userPageCount); ASSERT(userPageCount == 2); return Void(); @@ -5428,6 +5527,9 @@ private: */ IPager2* m_pager; + EncodingType m_encodingType; + bool m_enforceEncodingType; + std::shared_ptr m_keyProvider; // Counter to update with DecodeCache memory usage int64_t* m_pDecodeCacheMemory = nullptr; @@ -5452,36 +5554,39 @@ private: int m_blockSize; ParentInfoMapT childUpdateTracker; - // MetaKey has a variable size, it can be as large as m_headerSpace - MetaKey* m_pHeader; - int m_headerSpace; - + BTreeCommitHeader m_header; LazyClearQueueT m_lazyClearQueue; Future m_lazyClearActor; bool m_lazyClearStop; - // Describes a range of a vector of records that should be built into a BTreePage + // Describes a range of a vector of records that should be built into a single BTreePage struct PageToBuild { - PageToBuild(int index, int blockSize) + PageToBuild(int index, int blockSize, EncodingType t) : startIndex(index), count(0), pageSize(blockSize), - bytesLeft(blockSize - sizeof(BTreePage) - sizeof(BTreePage::BinaryTree)), largeDeltaTree(pageSize > BTreePage::BinaryTree::SmallSizeLimit), blockSize(blockSize), blockCount(1), - kvBytes(0) {} + kvBytes(0) { + + // Subtract Page header overhead, BTreePage overhead, and DeltaTree (BTreePage::BinaryTree) overhead.
+ bytesLeft = ArenaPage::getUsableSize(blockSize, t) - sizeof(BTreePage) - sizeof(BTreePage::BinaryTree); + } + + PageToBuild next(EncodingType t) { return PageToBuild(endIndex(), blockSize, t); } int startIndex; // Index of the first record int count; // Number of records added to the page - int pageSize; // Page size required to hold a BTreePage of the added records, which is a multiple of blockSize + int pageSize; // Page or Multipage size required to hold a BTreePage of the added records, which is a multiple + // of blockSize int bytesLeft; // Bytes in pageSize that are unused by the BTreePage so far bool largeDeltaTree; // Whether or not the tree in the generated page is in the 'large' size range int blockSize; // Base block size by which pageSize can be incremented int blockCount; // The number of blocks in pageSize int kvBytes; // The amount of user key/value bytes added to the page - // Number of bytes used by the generated/serialized BTreePage - int size() const { return pageSize - bytesLeft; } + // Number of bytes used by the generated/serialized BTreePage, including all headers + int usedBytes() const { return pageSize - bytesLeft; } // Used fraction of pageSize bytes - double usedFraction() const { return (double)size() / pageSize; } + double usedFraction() const { return (double)usedBytes() / pageSize; } // Unused fraction of pageSize bytes double slackFraction() const { return (double)bytesLeft / pageSize; } @@ -5500,7 +5605,7 @@ private: "{start=%d count=%d used %d/%d bytes (%.2f%% slack) kvBytes=%d blocks=%d blockSize=%d large=%d}", startIndex, count, - size(), + usedBytes(), pageSize, slackFraction() * 100, kvBytes, @@ -5571,12 +5676,12 @@ private: }; // Scans a vector of records and decides on page split points, returning a vector of 1+ pages to build - static std::vector splitPages(const RedwoodRecordRef* lowerBound, - const RedwoodRecordRef* upperBound, - int prefixLen, - VectorRef records, - unsigned int height, - int blockSize) { + std::vector splitPages(const RedwoodRecordRef* lowerBound, + const RedwoodRecordRef* upperBound, + int prefixLen, + VectorRef records, + unsigned int height) { + debug_printf("splitPages height=%d records=%d\n\tlowerBound=%s\n\tupperBound=%s\n", height, records.size(), @@ -5597,7 +5702,7 @@ private: deltaSizes[i] = records[i].deltaSize(records[i - 1], prefixLen, true); } - PageToBuild p(0, blockSize); + PageToBuild p(0, m_blockSize, m_encodingType); for (int i = 0; i < records.size(); ++i) { bool force = p.count < minRecords || p.slackFraction() > maxSlack; @@ -5613,7 +5718,7 @@ private: if (!p.addRecord(records[i], deltaSizes[i], force)) { pages.push_back(p); - p = PageToBuild(p.endIndex(), blockSize); + p = p.next(m_encodingType); p.addRecord(records[i], deltaSizes[i], true); } } @@ -5625,9 +5730,9 @@ private: debug_printf(" Before shift: %s\n", ::toString(pages).c_str()); // If page count is > 1, try to balance slack between last two pages - // The buggify disables this balancing as this will result in more edge - // cases of pages with very few records. 
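To make the space accounting above concrete, a worked example with hypothetical numbers (the actual per-block overhead depends on the encoding type):

    int blockSize  = 8192;                          // base block size
    int blockCount = 2;                             // a 2-block node
    int pageSize   = blockSize * blockCount;        // 16384
    // Initial bytesLeft per the constructor above, with hypothetical overheads:
    //   ArenaPage::getUsableSize(8192, t)  -> e.g. 8152 (pager/encoding headers deducted)
    //   minus the sizeof(BTreePage) and sizeof(BTreePage::BinaryTree) headers
    int bytesLeft  = 3000;                          // unused space after adding records
    int usedBytes  = pageSize - bytesLeft;          // usedBytes() == 13384
    double slack   = (double)bytesLeft / pageSize;  // slackFraction() ~= 0.18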
- if (pages.size() > 1 && !BUGGIFY) { + // In simulation, disable this balance half the time to create more edge cases + // of underfilled pages + if (pages.size() > 1 && !(g_network->isSimulated() && deterministicRandom()->coinflip())) { PageToBuild& a = pages[pages.size() - 2]; PageToBuild& b = pages.back(); @@ -5646,13 +5751,6 @@ private: return pages; } - void makeEmptyRoot(Reference page) { - BTreePage* btpage = (BTreePage*)page->begin(); - btpage->height = 1; - btpage->kvBytes = 0; - btpage->tree()->build(page->size(), nullptr, nullptr, nullptr, nullptr); - } - // Writes entries to 1 or more pages and return a vector of boundary keys with their ArenaPage(s) ACTOR static Future>> writePages(VersionedBTree* self, const RedwoodRecordRef* lowerBound, @@ -5660,7 +5758,8 @@ private: VectorRef entries, unsigned int height, Version v, - BTreePageIDRef previousID) { + BTreeNodeLinkRef previousID, + LogicalPageID parentID) { ASSERT(entries.size() > 0); state Standalone> records; @@ -5669,7 +5768,7 @@ private: state int prefixLen = lowerBound->getCommonPrefixLen(*upperBound); state std::vector pagesToBuild = - splitPages(lowerBound, upperBound, prefixLen, entries, height, self->m_blockSize); + self->splitPages(lowerBound, upperBound, prefixLen, entries, height); debug_printf("splitPages returning %s\n", toString(pagesToBuild).c_str()); // Lower bound of the page being added to @@ -5680,9 +5779,11 @@ private: state int pageIndex; for (pageIndex = 0; pageIndex < pagesToBuild.size(); ++pageIndex) { - auto& p = pagesToBuild[pageIndex]; - debug_printf("building page %d of %d %s\n", pageIndex + 1, pagesToBuild.size(), p.toString().c_str()); - ASSERT(p.count != 0); + debug_printf("building page %d of %zu %s\n", + pageIndex + 1, + pagesToBuild.size(), + pagesToBuild[pageIndex].toString().c_str()); + ASSERT(pagesToBuild[pageIndex].count != 0); // For internal pages, skip first entry if child link is null. Such links only exist // to maintain a borrow-able prefix for the previous subtree after a subtree deletion. @@ -5690,7 +5791,8 @@ private: // being built now will serve as the previous subtree's upper boundary as it is the same // key as entries[p.startIndex] and there is no need to actually store the null link in // the new page. - if (height != 1 && !entries[p.startIndex].value.present()) { + if (height != 1 && !entries[pagesToBuild[pageIndex].startIndex].value.present()) { + auto& p = pagesToBuild[pageIndex]; p.kvBytes -= entries[p.startIndex].key.size(); ++p.startIndex; --p.count; @@ -5711,7 +5813,7 @@ private: } // Use the next entry as the upper bound, or upperBound if there are no more entries beyond this page - int endIndex = p.endIndex(); + int endIndex = pagesToBuild[pageIndex].endIndex(); bool lastPage = endIndex == entries.size(); pageUpperBound = lastPage ? upperBound->withoutValue() : entries[endIndex].withoutValue(); @@ -5721,20 +5823,19 @@ private: pageUpperBound.truncate(commonPrefix + 1); } - state Reference pages; - BTreePage* btPage; - - if (p.blockCount == 1) { - Reference page = self->m_pager->newPageBuffer(); - btPage = (BTreePage*)page->mutate(); - pages = std::move(page); - } else { - ASSERT(p.blockCount > 1); - btPage = (BTreePage*)new uint8_t[p.pageSize]; + // Create and init page here otherwise many variables must become state vars + state Reference page = self->m_pager->newPageBuffer(pagesToBuild[pageIndex].blockCount); + page->init(self->m_encodingType, + (pagesToBuild[pageIndex].blockCount == 1) ? 
PageType::BTreeNode : PageType::BTreeSuperNode, + height); + if (page->isEncrypted()) { + EncryptionKey k = wait(self->m_keyProvider->getByRange(pageLowerBound.key, pageUpperBound.key)); + page->encryptionKey = k; } - btPage->height = height; - btPage->kvBytes = p.kvBytes; + auto& p = pagesToBuild[pageIndex]; + BTreePage* btPage = (BTreePage*)page->mutateData(); + btPage->init(height, p.kvBytes); g_redwoodMetrics.kvSizeWritten->sample(p.kvBytes); debug_printf("Building tree for %s\nlower: %s\nupper: %s\n", @@ -5742,9 +5843,13 @@ private: pageLowerBound.toString(false).c_str(), pageUpperBound.toString(false).c_str()); - int deltaTreeSpace = p.pageSize - sizeof(BTreePage); + int deltaTreeSpace = page->dataSize() - sizeof(BTreePage); + debug_printf("Building tree at %p deltaTreeSpace %d p.usedBytes=%d\n", + btPage->tree(), + deltaTreeSpace, + p.usedBytes()); state int written = btPage->tree()->build( - deltaTreeSpace, &entries[p.startIndex], &entries[endIndex], &pageLowerBound, &pageUpperBound); + deltaTreeSpace, &entries[p.startIndex], &entries[p.endIndex()], &pageLowerBound, &pageUpperBound); if (written > deltaTreeSpace) { debug_printf("ERROR: Wrote %d bytes to page %s deltaTreeSpace=%d\n", @@ -5764,55 +5869,56 @@ private: metrics.buildStoredPctSketch->samplePercentage(p.kvFraction()); metrics.buildItemCountSketch->sampleRecordCounter(p.count); - // Create chunked pages - // TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled. - if (p.blockCount != 1) { - // Mark the slack in the page buffer as defined - VALGRIND_MAKE_MEM_DEFINED(((uint8_t*)btPage) + written, (p.blockCount * p.blockSize) - written); - Reference page = self->m_pager->newPageBuffer(p.blockCount); - const uint8_t* rptr = (const uint8_t*)btPage; - for (int b = 0; b < p.blockCount; ++b) { - memcpy(page->mutate() + b * p.blockSize, rptr, p.blockSize); - rptr += p.blockSize; - } - pages = std::move(page); - delete[](uint8_t*) btPage; - } - // Write this btree page, which is made of 1 or more pager pages. - state BTreePageIDRef childPageID; + state BTreeNodeLinkRef childPageID; // If we are only writing 1 BTree node and its block count is 1 and the original node also had 1 block // then try to update the page atomically so its logical page ID does not change if (pagesToBuild.size() == 1 && p.blockCount == 1 && previousID.size() == 1) { + page->setLogicalPageInfo(previousID.front(), parentID); LogicalPageID id = wait( - self->m_pager->atomicUpdatePage(PagerEventReasons::Commit, height, previousID.front(), pages, v)); + self->m_pager->atomicUpdatePage(PagerEventReasons::Commit, height, previousID.front(), page, v)); childPageID.push_back(records.arena(), id); } else { - // Either the original node is being split, or it's not but it has changed BTreePageID size or - // it is now a multi-page node (and maybe was before as well) - // Either way, there is no point in reusing any of the original page IDs because either - // the parent must be rewritten anyway to account for the change in child link count or size, - // or the parent would be rewritten because it the same cost or cheaper than incurring the second - // write of 2 or more pages of a multi-page node. + // Either the original node is being split, or only a single node was produced but + // its size is > 1 block. + // + // If the node was split, then the parent must be updated anyway to add + // any new children so there is no reason to do an atomic update on this node to + // preserve any page IDs.
+ // + // If the node was not split, then there is still good reason to update the parent + // node to point to new page IDs rather than use atomic update on this multi-block + // node to preserve its page IDs. The parent node is almost certainly 1 block, so + // updating it will be 1 or 2 writes depending on whether or not its second write + // can be avoided. Updating this N>=2 block node atomically, however, will be N + // writes plus possibly another N writes if the second writes cannot be avoided. + // + // Therefore, assuming the parent is 1 block which is almost certain, the worst + // case number of writes for updating the parent is equal to the best case number + // of writes for doing an atomic update of this multi-block node. + // + // Additionally, if we did choose to atomically update this multi-block node here + // then the "multiple sibling updates -> single parent update" conversion optimization + // would likely convert the changes to a parent update anyway, skipping the second writes + // on this multi-block node. It is more efficient to just take this path directly. // Free the old IDs, but only once (before the first output record is added). if (records.empty()) { self->freeBTreePage(height, previousID, v); } - state Standalone> emptyPages; - emptyPages.resize(emptyPages.arena(), p.blockCount); + childPageID.resize(records.arena(), p.blockCount); state int i = 0; - for (i = 0; i < emptyPages.size(); ++i) { + for (i = 0; i < childPageID.size(); ++i) { LogicalPageID id = wait(self->m_pager->newPageID()); - emptyPages[i] = id; - } - debug_printf("writePages: newPages %s", toString(emptyPages).c_str()); - self->m_pager->updatePage(PagerEventReasons::Commit, height, emptyPages, pages); - for (const LogicalPageID id : emptyPages) { - childPageID.push_back(records.arena(), id); + childPageID[i] = id; } + debug_printf("writePages: newPages %s", toString(childPageID).c_str()); + + // Newly allocated page so logical id = physical id + page->setLogicalPageInfo(childPageID.front(), parentID); + self->m_pager->updatePage(PagerEventReasons::Commit, height, childPageID, page); } if (self->m_pBoundaryVerifier != nullptr) { @@ -5857,15 +5963,15 @@ private: Version version, Standalone> records, unsigned int height) { - debug_printf("buildNewRoot start version %" PRId64 ", %lu records\n", version, records.size()); + debug_printf("buildNewRoot start version %" PRId64 ", %d records\n", version, records.size()); // While there are multiple child pages for this version we must write new tree levels.
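The write-cost argument in the comment block above reduces to simple arithmetic; a back-of-envelope version with a hypothetical block count:

    int N = 3;                    // blocks in the multi-block node, N >= 2
    int parentRewriteWorst = 2;   // 1-block parent: 1 write, or 2 if its second write can't be avoided
    int atomicUpdateBest   = N;   // atomic multi-block update: N writes, up to 2 * N with second writes
    // For any N >= 2, parentRewriteWorst <= atomicUpdateBest, so allocating fresh
    // page IDs and letting the parent be rewritten never costs more writes.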
while (records.size() > 1) { - self->m_pHeader->height = ++height; + self->m_header.height = ++height; ASSERT(height < std::numeric_limits::max()); - Standalone> newRecords = - wait(writePages(self, &dbBegin, &dbEnd, records, height, version, BTreePageIDRef())); - debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", + Standalone> newRecords = wait( + writePages(self, &dbBegin, &dbEnd, records, height, version, BTreeNodeLinkRef(), invalidLogicalPageID)); + debug_printf("Wrote a new root level at version %" PRId64 " height %d size %d pages\n", version, height, newRecords.size()); @@ -5875,10 +5981,11 @@ private: return records; } - ACTOR static Future> readPage(PagerEventReasons reason, + ACTOR static Future> readPage(VersionedBTree* self, + PagerEventReasons reason, unsigned int level, IPagerSnapshot* snapshot, - BTreePageIDRef id, + BTreeNodeLinkRef id, int priority, bool forLazyClear, bool cacheable) { @@ -5900,11 +6007,23 @@ private: page = std::move(p); } debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); - const BTreePage* btPage = (const BTreePage*)page->begin(); + const BTreePage* btPage = (const BTreePage*)page->data(); auto& metrics = g_redwoodMetrics.level(btPage->height).metrics; metrics.pageRead += 1; metrics.pageReadExt += (id.size() - 1); + // If BTree encryption is enabled, pages read must be encrypted using the desired encryption type + if (self->m_enforceEncodingType && (page->getEncodingType() != self->m_encodingType)) { + Error e = unexpected_encoding_type(); + TraceEvent(SevError, "RedwoodBTreeUnexpectedNodeEncoding") + .error(e) + .detail("PhysicalPageID", page->getPhysicalPageID()) + .detail("IsEncrypted", page->isEncrypted()) + .detail("EncodingTypeFound", page->getEncodingType()) + .detail("EncodingTypeExpected", self->m_encodingType); + throw e; + } + return std::move(page); } @@ -5912,40 +6031,46 @@ private: inline BTreePage::BinaryTree::Cursor getCursor(const ArenaPage* page, const RedwoodRecordRef& lowerBound, const RedwoodRecordRef& upperBound) { - if (page->userData == nullptr) { - debug_printf("Creating DecodeCache for ptr=%p lower=%s upper=%s %s\n", - page->begin(), + + Reference cache; + + if (page->extra.valid()) { + cache = page->extra.getReference(); + } else { + cache = makeReference(lowerBound, upperBound, m_pDecodeCacheMemory); + + debug_printf("Created DecodeCache for ptr=%p lower=%s upper=%s %s\n", + page->data(), lowerBound.toString(false).c_str(), upperBound.toString(false).c_str(), - ((BTreePage*)page->begin()) + ((BTreePage*)page->data()) ->toString("cursor", - lowerBound.value.present() ? lowerBound.getChildPage() : BTreePageIDRef(), + lowerBound.value.present() ? 
lowerBound.getChildPage() : BTreeNodeLinkRef(), -1, lowerBound, upperBound) .c_str()); - BTreePage::BinaryTree::DecodeCache* cache = - new BTreePage::BinaryTree::DecodeCache(lowerBound, upperBound, m_pDecodeCacheMemory); - page->userData = cache; - page->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; + // Store decode cache into page based on height + if (((BTreePage*)page->data())->height >= SERVER_KNOBS->REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT) { + page->extra = cache; + } } - return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, - ((BTreePage*)page->begin())->tree()); + return BTreePage::BinaryTree::Cursor(cache, ((BTreePage*)page->mutateData())->tree()); } // Get cursor into a BTree node from a child link inline BTreePage::BinaryTree::Cursor getCursor(const ArenaPage* page, const BTreePage::BinaryTree::Cursor& link) { - if (page->userData == nullptr) { + if (!page->extra.valid()) { return getCursor(page, link.get(), link.next().getOrUpperBound()); } - return BTreePage::BinaryTree::Cursor((BTreePage::BinaryTree::DecodeCache*)page->userData, - ((BTreePage*)page->begin())->tree()); + return BTreePage::BinaryTree::Cursor(page->extra.getReference(), + ((BTreePage*)page->mutateData())->tree()); } - static void preLoadPage(IPagerSnapshot* snapshot, BTreePageIDRef pageIDs, int priority) { + static void preLoadPage(IPagerSnapshot* snapshot, BTreeNodeLinkRef pageIDs, int priority) { g_redwoodMetrics.metric.btreeLeafPreload += 1; g_redwoodMetrics.metric.btreeLeafPreloadExt += (pageIDs.size() - 1); if (pageIDs.size() == 1) { @@ -5957,7 +6082,7 @@ private: } } - void freeBTreePage(int height, BTreePageIDRef btPageID, Version v) { + void freeBTreePage(int height, BTreeNodeLinkRef btPageID, Version v) { // Free individual pages at v for (LogicalPageID id : btPageID) { m_pager->freePage(id, v); @@ -5973,17 +6098,19 @@ private: // If oldID size is 1, attempts to keep logical page ID via an atomic page update. 
// Returns resulting BTreePageID which might be the same as the input // updateBTreePage is only called from commitSubTree function so write reason is always btree commit - ACTOR static Future updateBTreePage(VersionedBTree* self, - BTreePageIDRef oldID, - Arena* arena, - Reference page, - Version writeVersion) { - state BTreePageIDRef newID; + ACTOR static Future updateBTreePage(VersionedBTree* self, + BTreeNodeLinkRef oldID, + LogicalPageID parentID, + Arena* arena, + Reference page, + Version writeVersion) { + state BTreeNodeLinkRef newID; newID.resize(*arena, oldID.size()); if (REDWOOD_DEBUG) { - BTreePage* btPage = (BTreePage*)page->begin(); - BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData; + const BTreePage* btPage = (const BTreePage*)page->mutateData(); + BTreePage::BinaryTree::DecodeCache* cache = page->extra.getPtr(); + debug_printf_always( "updateBTreePage(%s, %s) start, page:\n%s\n", ::toString(oldID).c_str(), @@ -5994,31 +6121,28 @@ private: .c_str()); } - state unsigned int height = (unsigned int)((BTreePage*)page->begin())->height; + state unsigned int height = (unsigned int)((const BTreePage*)page->data())->height; if (oldID.size() == 1) { + page->setLogicalPageInfo(oldID.front(), parentID); LogicalPageID id = wait( self->m_pager->atomicUpdatePage(PagerEventReasons::Commit, height, oldID.front(), page, writeVersion)); newID.front() = id; return newID; } - state Standalone> emptyPages; + state int i = 0; - emptyPages.resize(emptyPages.arena(), oldID.size()); for (i = 0; i < oldID.size(); ++i) { LogicalPageID id = wait(self->m_pager->newPageID()); - emptyPages[i] = id; + newID[i] = id; } debug_printf("updateBTreePage(%s, %s): newPages %s", ::toString(oldID).c_str(), ::toString(writeVersion).c_str(), - toString(emptyPages).c_str()); + toString(newID).c_str()); - self->m_pager->updatePage(PagerEventReasons::Commit, height, emptyPages, page); - i = 0; - for (const LogicalPageID id : emptyPages) { - newID[i] = id; - ++i; - } + // Newly allocated page so logical id = physical id + page->setLogicalPageInfo(newID.front(), parentID); + self->m_pager->updatePage(PagerEventReasons::Commit, height, newID, page); if (self->m_pBoundaryVerifier != nullptr) { self->m_pBoundaryVerifier->update(writeVersion, oldID.front(), newID.front()); @@ -6030,14 +6154,13 @@ private: // Copy page to a new page which shares the same DecodeCache with the old page static Reference clonePageForUpdate(Reference page) { - Reference newPage = page->cloneContents(); + Reference newPage = page->clone(); - BTreePage::BinaryTree::DecodeCache* cache = (BTreePage::BinaryTree::DecodeCache*)page->userData; - cache->addref(); - newPage->userData = cache; - newPage->userDataDestructor = [](void* cache) { ((BTreePage::BinaryTree::DecodeCache*)cache)->delref(); }; + if (page->extra.valid()) { + newPage->extra = page->extra.getReference(); + } - debug_printf("cloneForUpdate(%p -> %p size=%d\n", page->begin(), newPage->begin(), page->size()); + debug_printf("cloneForUpdate(%p -> %p size=%d\n", page->data(), newPage->data(), page->dataSize()); return newPage; } @@ -6107,7 +6230,7 @@ private: } // Page was updated in-place through edits and written to maybeNewID - void updatedInPlace(BTreePageIDRef maybeNewID, BTreePage* btPage, int capacity) { + void updatedInPlace(BTreeNodeLinkRef maybeNewID, BTreePage* btPage, int capacity) { inPlaceUpdate = true; auto& metrics = g_redwoodMetrics.level(btPage->height); @@ -6200,7 +6323,7 @@ private: bool changesMade; ParentInfo* parentInfo; - 
BTreePage* btPage() const { return (BTreePage*)page->begin(); } + BTreePage* btPage() const { return (BTreePage*)page->mutateData(); } bool empty() const { if (updating) { @@ -6223,7 +6346,7 @@ private: int i = 0; if (updating) { // Update must be done in the new tree, not the original tree where the end cursor will be from - end.tree = btPage()->tree(); + end.switchTree(btPage()->tree()); // TODO: insert recs in a random order to avoid new subtree being entirely right child links while (i != recs.size()) { @@ -6295,7 +6418,7 @@ private: if (c != u.cEnd) { cloneForUpdate(); // must point c to the tree to erase from - c.tree = btPage()->tree(); + c.switchTree(btPage()->tree()); } while (c != u.cEnd) { @@ -6349,7 +6472,8 @@ private: ACTOR static Future commitSubtree( VersionedBTree* self, CommitBatch* batch, - BTreePageIDRef rootID, + BTreeNodeLinkRef rootID, + LogicalPageID parentID, unsigned int height, MutationBuffer::const_iterator mBegin, // greatest mutation boundary <= subtreeLowerBound->key MutationBuffer::const_iterator mEnd, // least boundary >= subtreeUpperBound->key @@ -6382,8 +6506,8 @@ private: } } - state Reference page = - wait(readPage(PagerEventReasons::Commit, height, batch->snapshot.getPtr(), rootID, height, false, true)); + state Reference page = wait( + readPage(self, PagerEventReasons::Commit, height, batch->snapshot.getPtr(), rootID, height, false, true)); // If the page exists in the cache, it must be copied before modification. // That copy will be referenced by pageCopy, as page must stay in scope in case anything references its @@ -6391,7 +6515,7 @@ private: // If the page is not in the cache, then no copy is needed so we will initialize pageCopy to page state Reference pageCopy; - state BTreePage* btPage = (BTreePage*)page->begin(); + state BTreePage* btPage = (BTreePage*)page->mutateData(); ASSERT(height == btPage->height); ++g_redwoodMetrics.level(height).metrics.pageCommitStart; @@ -6433,7 +6557,7 @@ private: auto copyForUpdate = [&]() { if (!pageCopy.isValid()) { pageCopy = clonePageForUpdate(page); - btPage = (BTreePage*)pageCopy->begin(); + btPage = (BTreePage*)pageCopy->mutateData(); cursor.switchTree(btPage->tree()); } }; @@ -6627,7 +6751,7 @@ private: // If we don't have to remove the records and we are updating, do nothing. // If we do have to remove the records and we are not updating, do nothing. if (remove != updatingDeltaTree) { - debug_printf("%s Ignoring remaining records, remove=%d updating=%d\n", + debug_printf("%s Ignoring remaining records, remove=%d updatingDeltaTree=%d\n", context.c_str(), remove, updatingDeltaTree); @@ -6678,8 +6802,12 @@ private: debug_print(addPrefix(context, update->toString())); } else { // Otherwise update it. 
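The copyForUpdate lambda above captures the copy-on-write discipline for cached pages; taken in isolation, the sequence is (names from this diff):

    if (!pageCopy.isValid()) {
        pageCopy = clonePageForUpdate(page);          // the clone shares the DecodeCache via page->extra
        btPage = (BTreePage*)pageCopy->mutateData();  // all further edits target the copy
        cursor.switchTree(btPage->tree());            // the cursor must follow the copied tree
    }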
- BTreePageIDRef newID = wait(self->updateBTreePage( - self, rootID, &update->newLinks.arena(), pageCopy.castTo(), batch->writeVersion)); + BTreeNodeLinkRef newID = wait(self->updateBTreePage(self, + rootID, + parentID, + &update->newLinks.arena(), + pageCopy.castTo(), + batch->writeVersion)); debug_printf("%s Leaf node updated in-place at version %s, new contents:\n", context.c_str(), @@ -6715,7 +6843,8 @@ private: merged, height, batch->writeVersion, - rootID)); + rootID, + parentID)); // Put new links into update and tell update that pages were rebuilt update->rebuilt(entries); @@ -6758,7 +6887,7 @@ private: } } - BTreePageIDRef pageID = cursor.get().getChildPage(); + BTreeNodeLinkRef pageID = cursor.get().getChildPage(); ASSERT(!pageID.empty()); // The decode upper bound is always the next key after the child link, or the decode upper bound for @@ -6889,17 +7018,18 @@ private: debug_printf("%s Recursing for %s\n", context.c_str(), toString(pageID).c_str()); debug_print(addPrefix(context, u.toString())); - recursions.push_back(self->commitSubtree(self, batch, pageID, height - 1, mBegin, mEnd, &u)); + recursions.push_back( + self->commitSubtree(self, batch, pageID, rootID.front(), height - 1, mBegin, mEnd, &u)); } - debug_printf( - "%s Recursions from internal page started. pageSize=%d level=%d children=%d slices=%d recursions=%d\n", - context.c_str(), - btPage->size(), - btPage->height, - btPage->tree()->numItems, - slices.size(), - recursions.size()); + debug_printf("%s Recursions from internal page started. pageSize=%d level=%d children=%d slices=%zu " + "recursions=%zu\n", + context.c_str(), + btPage->size(), + btPage->height, + btPage->tree()->numItems, + slices.size(), + recursions.size()); wait(waitForAll(recursions)); debug_printf("%s Recursions done, processing slice updates.\n", context.c_str()); @@ -6986,11 +7116,14 @@ private: auto& stats = g_redwoodMetrics.level(height); while (cursor.valid()) { if (cursor.get().value.present()) { - for (auto& p : cursor.get().getChildPage()) { + BTreeNodeLinkRef child = cursor.get().getChildPage(); + // Nodes larger than 1 page are never remapped. + if (child.size() == 1) { + LogicalPageID& p = child.front(); if (parentInfo->maybeUpdated(p)) { - LogicalPageID newID = + PhysicalPageID newID = self->m_pager->detachRemappedPage(p, batch->writeVersion); - if (newID != invalidLogicalPageID) { + if (newID != invalidPhysicalPageID) { debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID); if (self->m_pBoundaryVerifier != nullptr) { self->m_pBoundaryVerifier->update(batch->writeVersion, p, newID); @@ -7014,11 +7147,12 @@ private: } } - BTreePageIDRef newID = wait(self->updateBTreePage(self, - rootID, - &update->newLinks.arena(), - pageCopy.castTo(), - batch->writeVersion)); + BTreeNodeLinkRef newID = wait(self->updateBTreePage(self, + rootID, + parentID, + &update->newLinks.arena(), + pageCopy.castTo(), + batch->writeVersion)); debug_printf( "%s commitSubtree(): Internal node updated in-place at version %s, new contents:\n", context.c_str(), @@ -7042,24 +7176,24 @@ private: auto& stats = g_redwoodMetrics.level(height); for (auto& rec : modifier.rebuild) { if (rec.value.present()) { - BTreePageIDRef oldPages = rec.getChildPage(); - BTreePageIDRef newPages; - for (int i = 0; i < oldPages.size(); ++i) { - LogicalPageID p = oldPages[i]; + BTreeNodeLinkRef oldPages = rec.getChildPage(); + + // Nodes larger than 1 page are never remapped. 
+ if (oldPages.size() == 1) { + LogicalPageID p = oldPages.front(); if (parentInfo->maybeUpdated(p)) { - LogicalPageID newID = + PhysicalPageID newID = self->m_pager->detachRemappedPage(p, batch->writeVersion); - if (newID != invalidLogicalPageID) { - // Rebuild record values reference original page memory so make a copy - if (newPages.empty()) { - newPages = BTreePageIDRef(modifier.rebuild.arena(), oldPages); - rec.setChildPage(newPages); - } + if (newID != invalidPhysicalPageID) { + // Records in the rebuild vector reference original page memory so make + // a new child link in the rebuild arena + BTreeNodeLinkRef newPages = BTreeNodeLinkRef( + modifier.rebuild.arena(), BTreeNodeLinkRef(&newID, 1)); + rec.setChildPage(newPages); debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID); if (self->m_pBoundaryVerifier != nullptr) { self->m_pBoundaryVerifier->update(batch->writeVersion, p, newID); } - newPages[i] = newID; ++stats.metrics.detachChild; } } @@ -7076,7 +7210,8 @@ private: modifier.rebuild, height, batch->writeVersion, - rootID)); + rootID, + parentID)); update->rebuilt(newChildEntries); debug_printf("%s Internal page rebuilt, returning slice:\n", context.c_str()); @@ -7131,9 +7266,9 @@ private: batch.snapshot = self->m_pager->getReadSnapshot(batch.readVersion); - state Standalone rootPageID = self->m_pHeader->root.get(); + state BTreeNodeLink rootNodeLink = self->m_header.root; state InternalPageSliceUpdate all; - state RedwoodRecordRef rootLink = dbBegin.withPageID(rootPageID); + state RedwoodRecordRef rootLink = dbBegin.withPageID(rootNodeLink); all.subtreeLowerBound = rootLink; all.decodeLowerBound = rootLink; all.subtreeUpperBound = dbEnd; @@ -7144,47 +7279,46 @@ private: --mBegin; MutationBuffer::const_iterator mEnd = batch.mutations->lower_bound(all.subtreeUpperBound.key); - wait(commitSubtree(self, &batch, rootPageID, self->m_pHeader->height, mBegin, mEnd, &all)); + wait( + commitSubtree(self, &batch, rootNodeLink, invalidLogicalPageID, self->m_header.height, mBegin, mEnd, &all)); // If the old root was deleted, write a new empty tree root node and free the old roots if (all.childrenChanged) { if (all.newLinks.empty()) { debug_printf("Writing new empty root.\n"); LogicalPageID newRootID = wait(self->m_pager->newPageID()); - Reference page = self->m_pager->newPageBuffer(); - self->makeEmptyRoot(page); - self->m_pHeader->height = 1; - VectorRef rootID((LogicalPageID*)&newRootID, 1); - self->m_pager->updatePage(PagerEventReasons::Commit, self->m_pHeader->height, rootID, page); - rootPageID = BTreePageIDRef((LogicalPageID*)&newRootID, 1); + rootNodeLink = BTreeNodeLinkRef((LogicalPageID*)&newRootID, 1); + self->m_header.height = 1; + + Reference page = wait(makeEmptyRoot(self)); + // Newly allocated page so logical id = physical id and there is no parent as this is a new root + page->setLogicalPageInfo(rootNodeLink.front(), invalidLogicalPageID); + self->m_pager->updatePage(PagerEventReasons::Commit, self->m_header.height, rootNodeLink, page); } else { Standalone> newRootRecords(all.newLinks, all.newLinks.arena()); if (newRootRecords.size() == 1) { - rootPageID = newRootRecords.front().getChildPage(); + rootNodeLink = newRootRecords.front().getChildPage(); } else { // If the new root level's size is not 1 then build new root level(s) Standalone> newRootPage = - wait(buildNewRoot(self, batch.writeVersion, newRootRecords, self->m_pHeader->height)); - rootPageID = newRootPage.front().getChildPage(); + wait(buildNewRoot(self, batch.writeVersion, newRootRecords, 
self->m_header.height)); + rootNodeLink = newRootPage.front().getChildPage(); } } } - debug_printf("new root %s\n", toString(rootPageID).c_str()); - self->m_pHeader->root.set(rootPageID, self->m_headerSpace - sizeof(MetaKey)); + debug_printf("new root %s\n", toString(rootNodeLink).c_str()); + self->m_header.root = rootNodeLink; self->m_lazyClearStop = true; wait(success(self->m_lazyClearActor)); debug_printf("Lazy delete freed %u pages\n", self->m_lazyClearActor.get()); wait(self->m_lazyClearQueue.flush()); - self->m_pHeader->lazyDeleteQueue = self->m_lazyClearQueue.getState(); - - debug_printf("Setting metakey\n"); - self->m_pager->setMetaKey(self->m_pHeader->asKeyRef()); + self->m_header.lazyDeleteQueue = self->m_lazyClearQueue.getState(); debug_printf("%s: Committing pager %" PRId64 "\n", self->m_name.c_str(), writeVersion); - wait(self->m_pager->commit(writeVersion)); + wait(self->m_pager->commit(writeVersion, ObjectWriter::toValue(self->m_header, Unversioned()))); debug_printf("%s: Committed version %" PRId64 "\n", self->m_name.c_str(), writeVersion); ++g_redwoodMetrics.metric.opCommit; @@ -7204,10 +7338,10 @@ public: Reference page; BTreePage::BinaryTree::Cursor cursor; #if REDWOOD_DEBUG - Standalone id; + BTreeNodeLink id; #endif - const BTreePage* btPage() const { return (BTreePage*)page->begin(); }; + const BTreePage* btPage() const { return (const BTreePage*)page->data(); }; }; private: @@ -7238,7 +7372,7 @@ public: r += format("\n\t[Level=%d ID=%s ptr=%p Cursor=%s] ", height, id.c_str(), - path[i].page->begin(), + path[i].page->data(), path[i].cursor.valid() ? path[i].cursor.get().toString(path[i].btPage()->isLeaf()).c_str() : ""); if (height <= dumpHeight) { @@ -7270,7 +7404,8 @@ public: Future pushPage(const BTreePage::BinaryTree::Cursor& link) { debug_printf("pushPage(link=%s)\n", link.get().toString(false).c_str()); - return map(readPage(reason, + return map(readPage(btree, + reason, path.back().btPage()->height - 1, pager.getPtr(), link.get().getChildPage(), @@ -7294,9 +7429,9 @@ public: }); } - Future pushPage(BTreePageIDRef id) { + Future pushPage(BTreeNodeLinkRef id) { debug_printf("pushPage(root=%s)\n", ::toString(id).c_str()); - return map(readPage(reason, btree->m_pHeader->height, pager.getPtr(), id, ioMaxPriority, false, true), + return map(readPage(btree, reason, btree->m_header.height, pager.getPtr(), id, ioMaxPriority, false, true), [=](Reference p) { #if REDWOOD_DEBUG path.push_back({ p, btree->getCursor(p.getPtr(), dbBegin, dbEnd), id }); @@ -7311,7 +7446,7 @@ public: Future init(VersionedBTree* btree_in, PagerEventReasons reason_in, Reference pager_in, - BTreePageIDRef root) { + BTreeNodeLink root) { btree = btree_in; reason = reason_in; pager = pager_in; @@ -7424,7 +7559,7 @@ public: // Prefetch the sibling if the link is not null if (c.get().value.present()) { - BTreePageIDRef childPage = c.get().getChildPage(); + BTreeNodeLinkRef childPage = c.get().getChildPage(); if (childPage.size() > 0) preLoadPage(pager.getPtr(), childPage, ioLeafPriority); recordsRead += estRecordsPerPage; @@ -7489,13 +7624,13 @@ public: // The last entry in an internal page could be a null link, if so move back if (!forward && !entry.cursor.get().value.present()) { - ASSERT(entry.cursor.movePrev()); - ASSERT(entry.cursor.get().value.present()); + UNSTOPPABLE_ASSERT(entry.cursor.movePrev()); + UNSTOPPABLE_ASSERT(entry.cursor.get().value.present()); } wait(self->pushPage(entry.cursor)); auto& newEntry = self->path.back(); - ASSERT(forward ? 
newEntry.cursor.moveFirst() : newEntry.cursor.moveLast()); + UNSTOPPABLE_ASSERT(forward ? newEntry.cursor.moveFirst() : newEntry.cursor.moveLast()); } self->valid = true; @@ -7511,11 +7646,21 @@ public: Future initBTreeCursor(BTreeCursor* cursor, Version snapshotVersion, PagerEventReasons reason) { Reference snapshot = m_pager->getReadSnapshot(snapshotVersion); - // This is a ref because snapshot will continue to hold the metakey value memory - KeyRef m = snapshot->getMetaKey(); + BTreeNodeLinkRef root; + // Parse the metakey just once to get the root pointer and store it as the extra object + if (!snapshot->extra.valid()) { + KeyRef m = snapshot->getMetaKey(); + if (!m.empty()) { + BTreeCommitHeader h = ObjectReader::fromStringRef(m, Unversioned()); + root = h.root; + // Copy the BTreeNodeLink but keep the same arena and BTreeNodeLinkRef + snapshot->extra = new BTreeNodeLink(h.root, h.root.arena()); + } + } else { + root = *snapshot->extra.getPtr(); + } - return cursor->init( - this, reason, snapshot, m.size() == 0 ? BTreePageIDRef() : ((MetaKey*)m.begin())->root.get()); + return cursor->init(this, reason, snapshot, root); } }; @@ -7526,8 +7671,8 @@ RedwoodRecordRef VersionedBTree::dbEnd(LiteralStringRef("\xff\xff\xff\xff\xff")) class KeyValueStoreRedwood : public IKeyValueStore { public: - KeyValueStoreRedwood(std::string filePrefix, UID logID) - : m_filename(filePrefix), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS, 0), + KeyValueStoreRedwood(std::string filename, UID logID) + : m_filename(filename), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS, 0), prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) { int pageSize = @@ -7547,15 +7692,24 @@ public: : 100 * 1024 * 1024) // 100M : SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_WINDOW_BYTES; + EncodingType encodingType = EncodingType::XXHash64; + + // Deterministically enable encryption based on uid + if (g_network->isSimulated() && logID.hash() % 2 == 0) { + encodingType = EncodingType::XOREncryption; + m_keyProvider = std::make_shared(filename); + } + IPager2* pager = new DWALPager(pageSize, extentSize, - filePrefix, + filename, pageCacheBytes, remapCleanupWindowBytes, SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS, false, + m_keyProvider, m_error); - m_tree = new VersionedBTree(pager, filePrefix); + m_tree = new VersionedBTree(pager, filename, encodingType, m_keyProvider); m_init = catchError(init_impl(this)); } @@ -7573,6 +7727,23 @@ public: ACTOR void shutdown(KeyValueStoreRedwood* self, bool dispose) { TraceEvent(SevInfo, "RedwoodShutdown").detail("Filename", self->m_filename).detail("Dispose", dispose); + + // In simulation, if the instance is being disposed of then sometimes run destructive sanity check. + if (g_network->isSimulated() && dispose && BUGGIFY) { + // Only proceed if the last commit is a success, but don't throw if it's not because shutdown + // should not throw. + wait(ready(self->m_lastCommit)); + if (!self->m_lastCommit.isError()) { + // Run the destructive sanity check, but don't throw. + ErrorOr err = wait(errorOr(self->m_tree->clearAllAndCheckSanity())); + // If the test threw an error, it must be an injected fault or something has gone wrong. + ASSERT(!err.isError() || err.getError().isInjectedFault()); + } + } else { + // The KVS user shouldn't be holding a commit future anymore so self shouldn't either. 
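The m_lastCommit bookkeeping introduced here lets shutdown() inspect the outcome of the final commit without rethrowing it; a sketch of the two ends of the handshake, using only calls from this diff:

    m_lastCommit = catchError(m_tree->commit(m_nextCommitVersion)); // commit() retains its own copy

    // shutdown(): ready() completes without throwing the stored error
    wait(ready(self->m_lastCommit));
    if (!self->m_lastCommit.isError()) {
        // only now is the destructive clearAllAndCheckSanity() path safe to attempt
    }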
+ self->m_lastCommit = Void(); + } + if (self->m_error.canBeSet()) { self->m_error.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress } @@ -7595,11 +7766,11 @@ public: Future onClosed() const override { return m_closed.getFuture(); } Future commit(bool sequential = false) override { - Future c = m_tree->commit(m_nextCommitVersion); + m_lastCommit = catchError(m_tree->commit(m_nextCommitVersion)); // Currently not keeping history m_tree->setOldestReadableVersion(m_nextCommitVersion); ++m_nextCommitVersion; - return catchError(c); + return m_lastCommit; } KeyValueStoreType getType() const override { return KeyValueStoreType::SSD_REDWOOD_V1; } @@ -7814,6 +7985,8 @@ private: PriorityMultiLock m_concurrentReads; bool prefetch; Version m_nextCommitVersion; + std::shared_ptr m_keyProvider; + Future m_lastCommit = Void(); template inline Future catchError(Future f) { @@ -8166,7 +8339,7 @@ ACTOR Future verify(VersionedBTree* btree, printf("Verified version %" PRId64 "\n", v); } } catch (Error& e) { - if (e.code() != error_code_end_of_stream && e.code() != error_code_transaction_too_old) { + if (e.code() != error_code_end_of_stream) { throw; } } @@ -8175,32 +8348,24 @@ ACTOR Future verify(VersionedBTree* btree, // Does a random range read, doesn't trap/report errors ACTOR Future randomReader(VersionedBTree* btree, int64_t* pRecordsRead) { - try { - state VersionedBTree::BTreeCursor cur; + state VersionedBTree::BTreeCursor cur; - loop { - wait(yield()); - if (!cur.intialized() || deterministicRandom()->random01() > .01) { - wait(btree->initBTreeCursor(&cur, btree->getLastCommittedVersion(), PagerEventReasons::RangeRead)); - } - - state KeyValue kv = randomKV(10, 0); - wait(cur.seekGTE(kv.key)); - state int c = deterministicRandom()->randomInt(0, 100); - state bool direction = deterministicRandom()->coinflip(); - while (cur.isValid() && c-- > 0) { - ++*pRecordsRead; - wait(success(direction ? cur.moveNext() : cur.movePrev())); - wait(yield()); - } + loop { + wait(yield()); + if (!cur.intialized() || deterministicRandom()->random01() > .01) { + wait(btree->initBTreeCursor(&cur, btree->getLastCommittedVersion(), PagerEventReasons::RangeRead)); } - } catch (Error& e) { - if (e.code() != error_code_transaction_too_old) { - throw e; + + state KeyValue kv = randomKV(10, 0); + wait(cur.seekGTE(kv.key)); + state int c = deterministicRandom()->randomInt(0, 100); + state bool direction = deterministicRandom()->coinflip(); + while (cur.isValid() && c-- > 0) { + ++*pRecordsRead; + wait(success(direction ? cur.moveNext() : cur.movePrev())); + wait(yield()); } } - - return Void(); } struct IntIntPair { @@ -8462,7 +8627,7 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { // Test pageID stuff. 
{ LogicalPageID ids[] = { 1, 5 }; - BTreePageIDRef id(ids, 2); + BTreeNodeLinkRef id(ids, 2); RedwoodRecordRef r; r.setChildPage(id); ASSERT(r.getChildPage() == id); @@ -8815,8 +8980,7 @@ TEST_CASE("Lredwood/correctness/unit/deltaTree/RedwoodRecordRef2") { largeTree); debug_printf("Data(%p): %s\n", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str()); - DeltaTree2::DecodeCache cache(prev, next); - DeltaTree2::Cursor c(&cache, tree); + DeltaTree2::Cursor c(makeReference::DecodeCache>(prev, next), tree); // Test delete/insert behavior for each item, making no net changes printf("Testing seek/delete/insert for existing keys with random values\n"); @@ -8846,9 +9010,9 @@ TEST_CASE("Lredwood/correctness/unit/deltaTree/RedwoodRecordRef2") { DeltaTree2::Cursor fwd = c; DeltaTree2::Cursor rev = c; - DeltaTree2::DecodeCache cacheValuesOnly(prev, next); DeltaTree2::Cursor fwdValueOnly( - &cacheValuesOnly, (DeltaTree2*)tree); + makeReference::DecodeCache>(prev, next), + (DeltaTree2*)tree); printf("Verifying tree contents using forward, reverse, and value-only iterators\n"); ASSERT(fwd.moveFirst()); @@ -8898,8 +9062,8 @@ TEST_CASE("Lredwood/correctness/unit/deltaTree/RedwoodRecordRef2") { ASSERT(i == items.size()); { - DeltaTree2::DecodeCache cache(prev, next); - DeltaTree2::Cursor c(&cache, tree); + DeltaTree2::Cursor c(makeReference::DecodeCache>(prev, next), + tree); printf("Doing 20M random seeks using the same cursor from the same mirror.\n"); double start = timer(); @@ -8995,12 +9159,12 @@ TEST_CASE("Lredwood/correctness/unit/deltaTree/IntIntPair") { DeltaTree2* tree2 = (DeltaTree2*)new uint8_t[bufferSize]; int builtSize2 = tree2->build(bufferSize, &items[0], &items[0] + items.size(), &lowerBound, &upperBound); ASSERT(builtSize2 <= bufferSize); - DeltaTree2::DecodeCache cache(lowerBound, upperBound); - DeltaTree2::Cursor cur2(&cache, tree2); + auto cache = makeReference::DecodeCache>(lowerBound, upperBound); + DeltaTree2::Cursor cur2(cache, tree2); auto printItems = [&] { for (int k = 0; k < items.size(); ++k) { - debug_printf("%d/%d %s\n", k + 1, items.size(), items[k].toString().c_str()); + debug_printf("%d/%zu %s\n", k + 1, items.size(), items[k].toString().c_str()); } }; @@ -9066,8 +9230,8 @@ TEST_CASE("Lredwood/correctness/unit/deltaTree/IntIntPair") { auto scanAndVerify2 = [&]() { printf("Verify DeltaTree2 contents.\n"); - DeltaTree2::Cursor fwd(&cache, tree2); - DeltaTree2::Cursor rev(&cache, tree2); + DeltaTree2::Cursor fwd(cache, tree2); + DeltaTree2::Cursor rev(cache, tree2); ASSERT(fwd.moveFirst()); ASSERT(rev.moveLast()); @@ -9152,7 +9316,7 @@ TEST_CASE("Lredwood/correctness/unit/deltaTree/IntIntPair") { // Create a new mirror, decoding the tree from scratch since insert() modified both the tree and the mirror r = DeltaTree::Mirror(tree, &lowerBound, &upperBound); - cache.clear(); + cache->clear(); scanAndVerify(); scanAndVerify2(); @@ -9211,7 +9375,7 @@ TEST_CASE("Lredwood/correctness/unit/deltaTree/IntIntPair") { printf("Verifying seek behaviors\n"); DeltaTree::Cursor s = r.getCursor(); - DeltaTree2::Cursor s2(&cache, tree2); + DeltaTree2::Cursor s2(cache, tree2); // SeekLTE to each element for (int i = 0; i < items.size(); ++i) { @@ -9534,13 +9698,13 @@ TEST_CASE(":/redwood/performance/mutationBuffer") { TEST_CASE(":/redwood/pager/ArenaPage") { Arena x; printf("Making p\n"); - Reference p(new ArenaPage(4096, 4096)); - printf("Made p=%p\n", p->begin()); + Reference p = makeReference(4096, 4096); + printf("Made p=%p\n", p->data()); printf("Clearing p\n"); p.clear(); 
printf("Making p\n"); - p = Reference(new ArenaPage(4096, 4096)); - printf("Made p=%p\n", p->begin()); + p = makeReference(4096, 4096); + printf("Made p=%p\n", p->data()); printf("Making x depend on p\n"); x.dependsOn(p->getArena()); printf("Clearing p\n"); @@ -9606,12 +9770,20 @@ TEST_CASE("Lredwood/correctness/btree") { state int64_t maxPageOps = params.getInt("maxPageOps").orDefault((shortTest || serialTest) ? 50e3 : 1e6); state int maxVerificationMapEntries = params.getInt("maxVerificationMapEntries").orDefault(300e3); state int maxColdStarts = params.getInt("maxColdStarts").orDefault(300); - // Max number of records in the BTree or the versioned written map to visit state int64_t maxRecordsRead = params.getInt("maxRecordsRead").orDefault(300e6); + state EncodingType encodingType = EncodingType::XXHash64; + state std::shared_ptr keyProvider; + + if (deterministicRandom()->coinflip()) { + encodingType = EncodingType::XOREncryption; + keyProvider = std::make_shared(file); + } + printf("\n"); printf("file: %s\n", file.c_str()); + printf("encodingType: %d\n", encodingType); printf("maxPageOps: %" PRId64 "\n", maxPageOps); printf("maxVerificationMapEntries: %d\n", maxVerificationMapEntries); printf("maxRecordsRead: %" PRId64 "\n", maxRecordsRead); @@ -9641,9 +9813,15 @@ TEST_CASE("Lredwood/correctness/btree") { deleteFile(file); printf("Initializing...\n"); - pager = new DWALPager( - pageSize, extentSize, file, pageCacheBytes, remapCleanupWindowBytes, concurrentExtentReads, pagerMemoryOnly); - state VersionedBTree* btree = new VersionedBTree(pager, file); + pager = new DWALPager(pageSize, + extentSize, + file, + pageCacheBytes, + remapCleanupWindowBytes, + concurrentExtentReads, + pagerMemoryOnly, + keyProvider); + state VersionedBTree* btree = new VersionedBTree(pager, file, encodingType, keyProvider); wait(btree->init()); state DecodeBoundaryVerifier* pBoundaries = DecodeBoundaryVerifier::getVerifier(file); @@ -9652,7 +9830,7 @@ TEST_CASE("Lredwood/correctness/btree") { state std::set keys; state int coldStarts = 0; - state Version lastVer = btree->getLastCommittedVersion(); + Version lastVer = btree->getLastCommittedVersion(); printf("Starting from version: %" PRId64 "\n", lastVer); state Version version = lastVer + 1; @@ -9879,9 +10057,16 @@ TEST_CASE("Lredwood/correctness/btree") { wait(closedFuture); printf("Reopening btree from disk.\n"); - IPager2* pager = new DWALPager( - pageSize, extentSize, file, pageCacheBytes, remapCleanupWindowBytes, concurrentExtentReads); - btree = new VersionedBTree(pager, file); + IPager2* pager = new DWALPager(pageSize, + extentSize, + file, + pageCacheBytes, + remapCleanupWindowBytes, + concurrentExtentReads, + false, + keyProvider); + btree = new VersionedBTree(pager, file, encodingType, keyProvider); + wait(btree->init()); Version v = btree->getLastCommittedVersion(); @@ -9916,20 +10101,28 @@ TEST_CASE("Lredwood/correctness/btree") { debug_printf("Waiting for verification to complete.\n"); wait(verifyTask); - // Reopen pager and btree with a remap cleanup window of 0 to reclaim all old pages - state Future closedFuture = btree->onClosed(); - btree->close(); - wait(closedFuture); - // If buggify, test starting with empty remap cleanup window. - btree = new VersionedBTree( - new DWALPager( - pageSize, extentSize, file, pageCacheBytes, (BUGGIFY ? 
0 : remapCleanupWindowBytes), concurrentExtentReads), - file); - wait(btree->init()); + // Sometimes close and reopen before destructive sanity check + if (deterministicRandom()->coinflip()) { + Future<Void> closedFuture = btree->onClosed(); + btree->close(); + wait(closedFuture); + btree = new VersionedBTree(new DWALPager(pageSize, + extentSize, + file, + pageCacheBytes, + (BUGGIFY ? 0 : remapCleanupWindowBytes), + concurrentExtentReads, + pagerMemoryOnly, + keyProvider), + file, + encodingType, + keyProvider); + wait(btree->init()); + } wait(btree->clearAllAndCheckSanity()); - closedFuture = btree->onClosed(); + Future<Void> closedFuture = btree->onClosed(); btree->close(); debug_printf("Closing.\n"); wait(closedFuture); @@ -9940,14 +10133,28 @@ return Void(); } -ACTOR Future<Void> randomSeeks(VersionedBTree* btree, int count, char firstChar, char lastChar) { - state Version readVer = btree->getLastCommittedVersion(); +ACTOR Future<Void> randomSeeks(VersionedBTree* btree, + Optional<Version> v, + bool reInitCursor, + int count, + char firstChar, + char lastChar, + int keyLen) { state int c = 0; state double readStart = timer(); state VersionedBTree::BTreeCursor cur; + state Version readVer = v.orDefault(btree->getLastCommittedVersion()); wait(btree->initBTreeCursor(&cur, readVer, PagerEventReasons::PointRead)); + while (c < count) { - state Key k = randomString(20, firstChar, lastChar); + if (!v.present()) { + readVer = btree->getLastCommittedVersion(); + } + if (reInitCursor) { + wait(btree->initBTreeCursor(&cur, readVer, PagerEventReasons::PointRead)); + } + + state Key k = randomString(keyLen, firstChar, lastChar); wait(cur.seekGTE(k)); ++c; } @@ -10000,37 +10207,6 @@ ACTOR Future<Void> randomScans(VersionedBTree* btree, return Void(); } -TEST_CASE(":/redwood/correctness/pager/cow") { - state std::string pagerFile = "unittest_pageFile.redwood-v1"; - printf("Deleting old test data\n"); - deleteFile(pagerFile); - - int pageSize = 4096; - int extentSize = SERVER_KNOBS->REDWOOD_DEFAULT_EXTENT_SIZE; - state IPager2* pager = - new DWALPager(pageSize, extentSize, pagerFile, 0, 0, SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS); - - wait(success(pager->init())); - state LogicalPageID id = wait(pager->newPageID()); - state VectorRef<LogicalPageID> pageID(&id, 1); - Reference<ArenaPage> p = pager->newPageBuffer(); - memset(p->mutate(), (char)id, p->size()); - pager->updatePage(PagerEventReasons::MetaData, nonBtreeLevel, pageID, p); - pager->setMetaKey(LiteralStringRef("asdfasdf")); - wait(pager->commit(pager->getLastCommittedVersion() + 1)); - Reference<ArenaPage> p2 = - wait(pager->readPage(PagerEventReasons::PointRead, nonBtreeLevel, id, ioMinPriority, true, false)); - printf("%s\n", StringRef(p2->begin(), p2->size()).toHexString().c_str()); - - // TODO: Verify reads, do more writes and reads to make this a real pager validator - - Future<Void> onClosed = pager->onClosed(); - pager->close(); - wait(onClosed); - - return Void(); -} - template <int size> struct ExtentQueueEntry { uint8_t entry[size]; @@ -10077,8 +10253,14 @@ TEST_CASE(":/redwood/performance/extentQueue") { // Do random pushes into the queue and commit periodically if (reload) { - pager = new DWALPager( - pageSize, extentSize, fileName, cacheSizeBytes, remapCleanupWindowBytes, concurrentExtentReads); + pager = new DWALPager(pageSize, + extentSize, + fileName, + cacheSizeBytes, + remapCleanupWindowBytes, + concurrentExtentReads, + false, + nullptr); wait(success(pager->init())); @@ -10098,7 +10280,8 @@ cumulativeCommitSize,
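Illustrative drivers (editor addition) for the widened randomSeeks() signature above; counts and key characters are arbitrary:

ACTOR static Future<Void> demoSeekDrivers(VersionedBTree* btree) {
	// Pinned read version, long-lived cursor, 20-byte keys: the pre-patch behavior.
	wait(randomSeeks(btree, btree->getLastCommittedVersion(), false, 1000, 'a', 'z', 20));
	// Empty Optional<Version>: re-read getLastCommittedVersion() every iteration, and
	// reInitCursor=true additionally re-opens the BTreeCursor per seek, so the
	// performance test can also measure cursor construction cost.
	wait(randomSeeks(btree, Optional<Version>(), true, 1000, 'a', 'z', 4));
	return Void();
}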
pager->getPageCacheCount()); wait(m_extentQueue.flush()); - wait(pager->commit(pager->getLastCommittedVersion() + 1)); + wait(pager->commit(pager->getLastCommittedVersion() + 1, + ObjectWriter::toValue(m_extentQueue.getState(), Unversioned()))); cumulativeCommitSize += currentCommitSize; targetCommitSize = deterministicRandom()->randomInt(2e6, 30e6); currentCommitSize = 0; @@ -10118,10 +10301,9 @@ TEST_CASE(":/redwood/performance/extentQueue") { fmt::print( "Final cumulativeCommitSize: {0}, pageCacheCount: {1}\n", cumulativeCommitSize, pager->getPageCacheCount()); wait(m_extentQueue.flush()); - extentQueueState = m_extentQueue.getState(); - printf("Commit ExtentQueue getState(): %s\n", extentQueueState.toString().c_str()); - pager->setMetaKey(extentQueueState.asKeyRef()); - wait(pager->commit(pager->getLastCommittedVersion() + 1)); + printf("Commit ExtentQueue getState(): %s\n", m_extentQueue.getState().toString().c_str()); + wait(pager->commit(pager->getLastCommittedVersion() + 1, + ObjectWriter::toValue(m_extentQueue.getState(), Unversioned()))); Future onClosed = pager->onClosed(); pager->close(); @@ -10129,16 +10311,14 @@ TEST_CASE(":/redwood/performance/extentQueue") { } printf("Reopening pager file from disk.\n"); - pager = - new DWALPager(pageSize, extentSize, fileName, cacheSizeBytes, remapCleanupWindowBytes, concurrentExtentReads); + pager = new DWALPager( + pageSize, extentSize, fileName, cacheSizeBytes, remapCleanupWindowBytes, concurrentExtentReads, false, nullptr); wait(success(pager->init())); printf("Starting ExtentQueue FastPath Recovery from Disk.\n"); // reopen the pager from disk - state Key meta = pager->getMetaKey(); - memcpy(&extentQueueState, meta.begin(), meta.size()); - extentQueueState.fromKeyRef(meta); + extentQueueState = ObjectReader::fromStringRef(pager->getCommitRecord(), Unversioned()); printf("Recovered ExtentQueue getState(): %s\n", extentQueueState.toString().c_str()); m_extentQueue.recover(pager, extentQueueState, "ExtentQueueRecovered"); @@ -10270,9 +10450,15 @@ TEST_CASE(":/redwood/performance/set") { deleteFile(file); } - DWALPager* pager = new DWALPager( - pageSize, extentSize, file, pageCacheBytes, remapCleanupWindowBytes, concurrentExtentReads, pagerMemoryOnly); - state VersionedBTree* btree = new VersionedBTree(pager, file); + DWALPager* pager = new DWALPager(pageSize, + extentSize, + file, + pageCacheBytes, + remapCleanupWindowBytes, + concurrentExtentReads, + pagerMemoryOnly, + nullptr); + state VersionedBTree* btree = new VersionedBTree(pager, file, EncodingType::XXHash64, nullptr); wait(btree->init()); printf("Initialized. 
StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); @@ -10388,7 +10574,8 @@ TEST_CASE(":/redwood/performance/set") { if (seeks > 0) { printf("Parallel seeks, concurrency=%d, seeks=%d ...\n", concurrentSeeks, seeks); for (int x = 0; x < concurrentSeeks; ++x) { - actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); + actors.add( + randomSeeks(btree, Optional<Version>(), true, seeks / concurrentSeeks, firstKeyChar, lastKeyChar, 4)); } wait(actors.signalAndReset()); } diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index 5b12f7e3c6..d648b1d097 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -227,7 +227,8 @@ struct ConfigureDatabaseWorkload : TestWorkload { double testDuration; int additionalDBs; bool allowDescriptorChange; - bool allowTestStorageMigration; + bool allowTestStorageMigration; // allow changing the storage migration and perpetual wiggle configuration + bool storageMigrationCompatibleConf; // only generate configurations compatible with the storage migration test bool waitStoreTypeCheck; bool downgradeTest1; // if true, don't pick a downgrade-incompatible configuration std::vector<Future<Void>> clients; @@ -239,6 +240,7 @@ struct ConfigureDatabaseWorkload : TestWorkload { getOption(options, LiteralStringRef("allowDescriptorChange"), SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT); allowTestStorageMigration = getOption(options, "allowTestStorageMigration"_sr, false) && g_simulator.allowStorageMigrationTypeChange; + storageMigrationCompatibleConf = getOption(options, "storageMigrationCompatibleConf"_sr, false); waitStoreTypeCheck = getOption(options, "waitStoreTypeCheck"_sr, false); downgradeTest1 = getOption(options, "downgradeTest1"_sr, false); g_simulator.usableRegions = 1; @@ -349,7 +351,11 @@ struct ConfigureDatabaseWorkload : TestWorkload { } state int randomChoice; if (self->allowTestStorageMigration) { - randomChoice = deterministicRandom()->randomInt(4, 9); + randomChoice = (deterministicRandom()->random01() < 0.375) ? deterministicRandom()->randomInt(0, 3) + : deterministicRandom()->randomInt(4, 9); + } else if (self->storageMigrationCompatibleConf) { + randomChoice = (deterministicRandom()->random01() < 3.0 / 7) ?
deterministicRandom()->randomInt(0, 3) + : deterministicRandom()->randomInt(5, 9); } else { randomChoice = deterministicRandom()->randomInt(0, 8); } diff --git a/fdbserver/workloads/EncryptionOps.actor.cpp b/fdbserver/workloads/EncryptionOps.actor.cpp index 487491048b..973224e265 100644 --- a/fdbserver/workloads/EncryptionOps.actor.cpp +++ b/fdbserver/workloads/EncryptionOps.actor.cpp @@ -20,6 +20,7 @@ #include "fdbclient/DatabaseContext.h" #include "fdbclient/NativeAPI.actor.h" +#include "flow/EncryptUtils.h" #include "flow/IRandom.h" #include "flow/BlobCipher.h" #include "fdbserver/workloads/workloads.actor.h" @@ -116,9 +117,10 @@ struct EncryptionOpsWorkload : TestWorkload { Arena arena; std::unique_ptr metrics; - BlobCipherDomainId minDomainId; - BlobCipherDomainId maxDomainId; - BlobCipherBaseKeyId minBaseCipherId; + EncryptCipherDomainId minDomainId; + EncryptCipherDomainId maxDomainId; + EncryptCipherBaseKeyId minBaseCipherId; + EncryptCipherBaseKeyId headerBaseCipherId; EncryptionOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { mode = getOption(options, LiteralStringRef("fixedSize"), 1); @@ -131,6 +133,7 @@ struct EncryptionOpsWorkload : TestWorkload { minDomainId = wcx.clientId * 100 + mode * 30 + 1; maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5; minBaseCipherId = 100; + headerBaseCipherId = wcx.clientId * 100 + 1; metrics = std::make_unique(); @@ -167,17 +170,21 @@ struct EncryptionOpsWorkload : TestWorkload { uint8_t buff[AES_256_KEY_LENGTH]; std::vector> cipherKeys; - for (BlobCipherDomainId id = minDomainId; id <= maxDomainId; id++) { - int cipherLen = 0; + int cipherLen = 0; + for (EncryptCipherDomainId id = minDomainId; id <= maxDomainId; id++) { generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen); cipherKeyCache.insertCipherKey(id, minBaseCipherId, buff, cipherLen); ASSERT(cipherLen > 0 && cipherLen <= AES_256_KEY_LENGTH); cipherKeys = cipherKeyCache.getAllCiphers(id); - ASSERT(cipherKeys.size() == 1); + ASSERT_EQ(cipherKeys.size(), 1); } + // insert the Encrypt Header cipherKey + generateRandomBaseCipher(AES_256_KEY_LENGTH, &buff[0], &cipherLen); + cipherKeyCache.insertCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId, buff, cipherLen); + TraceEvent("SetupCipherEssentials_Done").detail("MinDomainId", minDomainId).detail("MaxDomainId", maxDomainId); } @@ -188,10 +195,10 @@ struct EncryptionOpsWorkload : TestWorkload { TraceEvent("ResetCipherEssentials_Done").log(); } - void updateLatestBaseCipher(const BlobCipherDomainId encryptDomainId, + void updateLatestBaseCipher(const EncryptCipherDomainId encryptDomainId, uint8_t* baseCipher, int* baseCipherLen, - BlobCipherBaseKeyId* nextBaseCipherId) { + EncryptCipherBaseKeyId* nextBaseCipherId) { auto& cipherKeyCache = BlobCipherKeyCache::getInstance(); Reference cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId); *nextBaseCipherId = cipherKey->getBaseCipherId() + 1; @@ -202,22 +209,24 @@ struct EncryptionOpsWorkload : TestWorkload { TraceEvent("UpdateBaseCipher").detail("DomainId", encryptDomainId).detail("BaseCipherId", *nextBaseCipherId); } - Reference doEncryption(Reference key, + Reference doEncryption(Reference textCipherKey, + Reference headerCipherKey, uint8_t* payload, int len, + const EncryptAuthTokenMode authMode, BlobCipherEncryptHeader* header) { uint8_t iv[AES_256_IV_LENGTH]; generateRandomData(&iv[0], AES_256_IV_LENGTH); - EncryptBlobCipherAes265Ctr encryptor(key, &iv[0], AES_256_IV_LENGTH); + EncryptBlobCipherAes265Ctr 
encryptor(textCipherKey, headerCipherKey, &iv[0], AES_256_IV_LENGTH, authMode); auto start = std::chrono::high_resolution_clock::now(); Reference encrypted = encryptor.encrypt(payload, len, header, arena); auto end = std::chrono::high_resolution_clock::now(); // validate encrypted buffer size and contents (not matching with plaintext) - ASSERT(encrypted->getLogicalSize() == len); - ASSERT(memcmp(encrypted->begin(), payload, len) != 0); - ASSERT(header->flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(encrypted->getLogicalSize(), len); + ASSERT_NE(memcmp(encrypted->begin(), payload, len), 0); + ASSERT_EQ(header->flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); metrics->updateEncryptionTime(std::chrono::duration(end - start).count()); return encrypted; @@ -228,23 +237,30 @@ struct EncryptionOpsWorkload : TestWorkload { const BlobCipherEncryptHeader& header, uint8_t* originalPayload, Reference orgCipherKey) { - ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); - ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); auto& cipherKeyCache = BlobCipherKeyCache::getInstance(); - Reference cipherKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId); + Reference cipherKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId); + Reference headerCipherKey = cipherKeyCache.getCipherKey( + header.cipherHeaderDetails.encryptDomainId, header.cipherHeaderDetails.baseCipherId); ASSERT(cipherKey.isValid()); ASSERT(cipherKey->isEqual(orgCipherKey)); - DecryptBlobCipherAes256Ctr decryptor(cipherKey, &header.iv[0]); + DecryptBlobCipherAes256Ctr decryptor(cipherKey, headerCipherKey, &header.cipherTextDetails.iv[0]); + const bool validateHeaderAuthToken = deterministicRandom()->randomInt(0, 100) < 65; auto start = std::chrono::high_resolution_clock::now(); + if (validateHeaderAuthToken) { + decryptor.verifyHeaderAuthToken(header, arena); + } Reference decrypted = decryptor.decrypt(encrypted->begin(), len, header, arena); auto end = std::chrono::high_resolution_clock::now(); // validate decrypted buffer size and contents (matching with original plaintext) - ASSERT(decrypted->getLogicalSize() == len); - ASSERT(memcmp(decrypted->begin(), originalPayload, len) == 0); + ASSERT_EQ(decrypted->getLogicalSize(), len); + ASSERT_EQ(memcmp(decrypted->begin(), originalPayload, len), 0); metrics->updateDecryptionTime(std::chrono::duration(end - start).count()); } @@ -256,7 +272,7 @@ struct EncryptionOpsWorkload : TestWorkload { Future start(Database const& cx) override { uint8_t baseCipher[AES_256_KEY_LENGTH]; int baseCipherLen = 0; - BlobCipherBaseKeyId nextBaseCipherId; + EncryptCipherBaseKeyId nextBaseCipherId; // Setup encryptDomainIds and corresponding baseCipher details setupCipherEssentials(); @@ -268,7 +284,7 @@ struct EncryptionOpsWorkload : TestWorkload { auto& cipherKeyCache = BlobCipherKeyCache::getInstance(); // randomly select a domainId - const BlobCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId); + const EncryptCipherDomainId encryptDomainId = deterministicRandom()->randomInt(minDomainId, maxDomainId); ASSERT(encryptDomainId >= minDomainId && encryptDomainId <= maxDomainId); if (updateBaseCipher) { @@ 
-279,14 +295,17 @@ struct EncryptionOpsWorkload : TestWorkload { auto start = std::chrono::high_resolution_clock::now(); Reference cipherKey = cipherKeyCache.getLatestCipherKey(encryptDomainId); + // Each client working with their own version of encryptHeaderCipherKey, avoid using getLatest() + Reference headerCipherKey = + cipherKeyCache.getCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId); auto end = std::chrono::high_resolution_clock::now(); metrics->updateKeyDerivationTime(std::chrono::duration(end - start).count()); // Validate sanity of "getLatestCipher", especially when baseCipher gets updated if (updateBaseCipher) { - ASSERT(cipherKey->getBaseCipherId() == nextBaseCipherId); - ASSERT(cipherKey->getBaseCipherLen() == baseCipherLen); - ASSERT(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen) == 0); + ASSERT_EQ(cipherKey->getBaseCipherId(), nextBaseCipherId); + ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipherLen); + ASSERT_EQ(memcmp(cipherKey->rawBaseCipher(), baseCipher, baseCipherLen), 0); } int dataLen = isFixedSizePayload() ? pageSize : deterministicRandom()->randomInt(100, maxBufSize); @@ -294,8 +313,12 @@ struct EncryptionOpsWorkload : TestWorkload { // Encrypt the payload - generates BlobCipherEncryptHeader to assist decryption later BlobCipherEncryptHeader header; + const EncryptAuthTokenMode authMode = deterministicRandom()->randomInt(0, 100) < 50 + ? ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE + : ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI; try { - Reference encrypted = doEncryption(cipherKey, buff.get(), dataLen, &header); + Reference encrypted = + doEncryption(cipherKey, headerCipherKey, buff.get(), dataLen, authMode, &header); // Decrypt the payload - parses the BlobCipherEncryptHeader, fetch corresponding cipherKey and // decrypt @@ -303,7 +326,8 @@ struct EncryptionOpsWorkload : TestWorkload { } catch (Error& e) { TraceEvent("Failed") .detail("DomainId", encryptDomainId) - .detail("BaseCipherId", cipherKey->getBaseCipherId()); + .detail("BaseCipherId", cipherKey->getBaseCipherId()) + .detail("AuthMode", authMode); throw; } diff --git a/flow/BlobCipher.cpp b/flow/BlobCipher.cpp index a909b783a6..5f82a86612 100644 --- a/flow/BlobCipher.cpp +++ b/flow/BlobCipher.cpp @@ -19,6 +19,7 @@ */ #include "flow/BlobCipher.h" +#include "flow/EncryptUtils.h" #include "flow/Error.h" #include "flow/FastRef.h" #include "flow/IRandom.h" @@ -29,21 +30,23 @@ #include #include +#include #if ENCRYPTION_ENABLED -// BlobCipherEncryptHeader -BlobCipherEncryptHeader::BlobCipherEncryptHeader() { - flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_NONE; +namespace { +bool isEncryptHeaderAuthTokenModeValid(const EncryptAuthTokenMode mode) { + return mode >= ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && mode < ENCRYPT_HEADER_AUTH_TOKEN_LAST; } +} // namespace // BlobCipherKey class methods -BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId, - const BlobCipherBaseKeyId& baseCiphId, +BlobCipherKey::BlobCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCiphId, const uint8_t* baseCiph, int baseCiphLen) { - BlobCipherRandomSalt salt; + EncryptCipherRandomSalt salt; if (g_network->isSimulated()) { salt = deterministicRandom()->randomUInt64(); } else { @@ -58,11 +61,11 @@ BlobCipherKey::BlobCipherKey(const BlobCipherDomainId& domainId, .detail("CreationTime", creationTime);*/ } -void BlobCipherKey::initKey(const BlobCipherDomainId& domainId, +void BlobCipherKey::initKey(const EncryptCipherDomainId& domainId, const uint8_t* baseCiph, int baseCiphLen, - const 
BlobCipherBaseKeyId& baseCiphId, - const BlobCipherRandomSalt& salt) { + const EncryptCipherBaseKeyId& baseCiphId, + const EncryptCipherRandomSalt& salt) { // Set the base encryption key properties baseCipher = std::make_unique(AES_256_KEY_LENGTH); memset(baseCipher.get(), 0, AES_256_KEY_LENGTH); @@ -82,11 +85,11 @@ void BlobCipherKey::initKey(const BlobCipherDomainId& domainId, void BlobCipherKey::applyHmacSha256Derivation() { Arena arena; - uint8_t buf[baseCipherLen + sizeof(BlobCipherRandomSalt)]; + uint8_t buf[baseCipherLen + sizeof(EncryptCipherRandomSalt)]; memcpy(&buf[0], baseCipher.get(), baseCipherLen); - memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(BlobCipherRandomSalt)); + memcpy(&buf[0] + baseCipherLen, &randomSalt, sizeof(EncryptCipherRandomSalt)); HmacSha256DigestGen hmacGen(baseCipher.get(), baseCipherLen); - StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(BlobCipherRandomSalt), arena); + StringRef digest = hmacGen.digest(&buf[0], baseCipherLen + sizeof(EncryptCipherRandomSalt), arena); std::copy(digest.begin(), digest.end(), cipher.get()); if (digest.size() < AES_256_KEY_LENGTH) { memcpy(cipher.get() + digest.size(), buf, AES_256_KEY_LENGTH - digest.size()); @@ -101,10 +104,10 @@ void BlobCipherKey::reset() { // BlobKeyIdCache class methods BlobCipherKeyIdCache::BlobCipherKeyIdCache() - : domainId(INVALID_DOMAIN_ID), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) {} + : domainId(ENCRYPT_INVALID_DOMAIN_ID), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID) {} -BlobCipherKeyIdCache::BlobCipherKeyIdCache(BlobCipherDomainId dId) - : domainId(dId), latestBaseCipherKeyId(INVALID_CIPHER_KEY_ID) { +BlobCipherKeyIdCache::BlobCipherKeyIdCache(EncryptCipherDomainId dId) + : domainId(dId), latestBaseCipherKeyId(ENCRYPT_INVALID_CIPHER_KEY_ID) { TraceEvent("Init_BlobCipherKeyIdCache").detail("DomainId", domainId); } @@ -112,7 +115,7 @@ Reference BlobCipherKeyIdCache::getLatestCipherKey() { return getCipherByBaseCipherId(latestBaseCipherKeyId); } -Reference BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId) { +Reference BlobCipherKeyIdCache::getCipherByBaseCipherId(EncryptCipherBaseKeyId baseCipherKeyId) { BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherKeyId); if (itr == keyIdCache.end()) { throw encrypt_key_not_found(); @@ -120,10 +123,10 @@ Reference BlobCipherKeyIdCache::getCipherByBaseCipherId(BlobCiphe return itr->second; } -void BlobCipherKeyIdCache::insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, +void BlobCipherKeyIdCache::insertBaseCipherKey(EncryptCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen) { - ASSERT(baseCipherId > INVALID_CIPHER_KEY_ID); + ASSERT_GT(baseCipherId, ENCRYPT_INVALID_CIPHER_KEY_ID); // BaseCipherKeys are immutable, ensure that cached value doesn't get updated. 
BlobCipherKeyIdCacheMapCItr itr = keyIdCache.find(baseCipherId); @@ -165,11 +168,11 @@ std::vector> BlobCipherKeyIdCache::getAllCipherKeys() { // BlobCipherKeyCache class methods -void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId, - const BlobCipherBaseKeyId& baseCipherId, +void BlobCipherKeyCache::insertCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId, const uint8_t* baseCipher, int baseCipherLen) { - if (domainId == INVALID_DOMAIN_ID || baseCipherId == INVALID_CIPHER_KEY_ID) { + if (domainId == ENCRYPT_INVALID_DOMAIN_ID || baseCipherId == ENCRYPT_INVALID_CIPHER_KEY_ID) { throw encrypt_invalid_id(); } @@ -193,7 +196,7 @@ void BlobCipherKeyCache::insertCipherKey(const BlobCipherDomainId& domainId, } } -Reference BlobCipherKeyCache::getLatestCipherKey(const BlobCipherDomainId& domainId) { +Reference BlobCipherKeyCache::getLatestCipherKey(const EncryptCipherDomainId& domainId) { auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { TraceEvent("GetLatestCipherKey_DomainNotFound").detail("DomainId", domainId); @@ -212,8 +215,8 @@ Reference BlobCipherKeyCache::getLatestCipherKey(const BlobCipher return cipherKey; } -Reference BlobCipherKeyCache::getCipherKey(const BlobCipherDomainId& domainId, - const BlobCipherBaseKeyId& baseCipherId) { +Reference BlobCipherKeyCache::getCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId) { auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { throw encrypt_key_not_found(); @@ -223,7 +226,7 @@ Reference BlobCipherKeyCache::getCipherKey(const BlobCipherDomain return keyIdCache->getCipherByBaseCipherId(baseCipherId); } -void BlobCipherKeyCache::resetEncyrptDomainId(const BlobCipherDomainId domainId) { +void BlobCipherKeyCache::resetEncyrptDomainId(const EncryptCipherDomainId domainId) { auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { throw encrypt_key_not_found(); @@ -245,7 +248,7 @@ void BlobCipherKeyCache::cleanup() noexcept { instance.domainCacheMap.clear(); } -std::vector> BlobCipherKeyCache::getAllCiphers(const BlobCipherDomainId& domainId) { +std::vector> BlobCipherKeyCache::getAllCiphers(const EncryptCipherDomainId& domainId) { auto domainItr = domainCacheMap.find(domainId); if (domainItr == domainCacheMap.end()) { return {}; @@ -255,13 +258,17 @@ std::vector> BlobCipherKeyCache::getAllCiphers(const Bl return keyIdCache->getAllCipherKeys(); } -// EncryptBlobCipher class methods +// EncryptBlobCipherAes265Ctr class methods -EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference key, +EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, + Reference hCipherKey, const uint8_t* cipherIV, - const int ivLen) - : ctx(EVP_CIPHER_CTX_new()), cipherKey(key) { - ASSERT(ivLen == AES_256_IV_LENGTH); + const int ivLen, + const EncryptAuthTokenMode mode) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode) { + ASSERT(isEncryptHeaderAuthTokenModeValid(mode)); + ASSERT_EQ(ivLen, AES_256_IV_LENGTH); + memcpy(&iv[0], cipherIV, ivLen); if (ctx == nullptr) { @@ -270,7 +277,7 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference if (EVP_EncryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr) != 1) { throw encrypt_ops_error(); } - if (EVP_EncryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), cipherIV) != 1) { + if (EVP_EncryptInit_ex(ctx, 
nullptr, nullptr, textCipherKey.getPtr()->data(), cipherIV) != 1) { throw encrypt_ops_error(); } } @@ -281,21 +288,29 @@ Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte Arena& arena) { TEST(true); // Encrypting data with BlobCipher - Reference encryptBuf = makeReference(plaintextLen + AES_BLOCK_SIZE, arena); + memset(reinterpret_cast(header), 0, sizeof(BlobCipherEncryptHeader)); + + // Alloc buffer computation accounts for 'header authentication' generation scheme. If single-auth-token needs to be + // generated, allocate buffer sufficient to append header to the cipherText optimizing memcpy cost. + + const int allocSize = authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE + ? plaintextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader) + : plaintextLen + AES_BLOCK_SIZE; + Reference encryptBuf = makeReference(allocSize, arena); uint8_t* ciphertext = encryptBuf->begin(); int bytes{ 0 }; if (EVP_EncryptUpdate(ctx, ciphertext, &bytes, plaintext, plaintextLen) != 1) { TraceEvent("Encrypt_UpdateFailed") - .detail("BaseCipherId", cipherKey->getBaseCipherId()) - .detail("EncryptDomainId", cipherKey->getDomainId()); + .detail("BaseCipherId", textCipherKey->getBaseCipherId()) + .detail("EncryptDomainId", textCipherKey->getDomainId()); throw encrypt_ops_error(); } int finalBytes{ 0 }; if (EVP_EncryptFinal_ex(ctx, ciphertext + bytes, &finalBytes) != 1) { TraceEvent("Encrypt_FinalFailed") - .detail("BaseCipherId", cipherKey->getBaseCipherId()) - .detail("EncryptDomainId", cipherKey->getDomainId()); + .detail("BaseCipherId", textCipherKey->getBaseCipherId()) + .detail("EncryptDomainId", textCipherKey->getDomainId()); throw encrypt_ops_error(); } @@ -306,19 +321,57 @@ Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte throw encrypt_ops_error(); } - // populate header details for the encrypted blob. + // Populate encryption header flags details header->flags.size = sizeof(BlobCipherEncryptHeader); header->flags.headerVersion = EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION; - header->flags.encryptMode = BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR; - header->baseCipherId = cipherKey->getBaseCipherId(); - header->encryptDomainId = cipherKey->getDomainId(); - header->salt = cipherKey->getSalt(); - memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH); + header->flags.encryptMode = ENCRYPT_CIPHER_MODE_AES_256_CTR; + header->flags.authTokenMode = authTokenMode; - // Preserve checksum of encrypted bytes in the header; approach protects against disk induced bit-rot/flip - // scenarios. AES CTR mode doesn't generate 'tag' by default as with schemes such as: AES 256 GCM. + // Populate cipherText encryption-key details + header->cipherTextDetails.baseCipherId = textCipherKey->getBaseCipherId(); + header->cipherTextDetails.encryptDomainId = textCipherKey->getDomainId(); + header->cipherTextDetails.salt = textCipherKey->getSalt(); + memcpy(&header->cipherTextDetails.iv[0], &iv[0], AES_256_IV_LENGTH); - header->ciphertextChecksum = computeEncryptChecksum(ciphertext, bytes + finalBytes, cipherKey->getSalt(), arena); + if (authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + // No header 'authToken' generation needed. 
+ } else { + // Populate header encryption-key details + header->cipherHeaderDetails.encryptDomainId = headerCipherKey->getDomainId(); + header->cipherHeaderDetails.baseCipherId = headerCipherKey->getBaseCipherId(); + + // Populate header authToken details + if (header->flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { + ASSERT_GE(allocSize, (bytes + finalBytes + sizeof(BlobCipherEncryptHeader))); + ASSERT_GE(encryptBuf->getLogicalSize(), (bytes + finalBytes + sizeof(BlobCipherEncryptHeader))); + + memcpy(&ciphertext[bytes + finalBytes], + reinterpret_cast(header), + sizeof(BlobCipherEncryptHeader)); + StringRef authToken = computeAuthToken(ciphertext, + bytes + finalBytes + sizeof(BlobCipherEncryptHeader), + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + arena); + memcpy(&header->singleAuthToken.authToken[0], authToken.begin(), AUTH_TOKEN_SIZE); + } else { + ASSERT_EQ(header->flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + + StringRef cipherTextAuthToken = + computeAuthToken(ciphertext, + bytes + finalBytes, + reinterpret_cast(&header->cipherTextDetails.salt), + sizeof(EncryptCipherRandomSalt), + arena); + memcpy(&header->multiAuthTokens.cipherTextAuthToken[0], cipherTextAuthToken.begin(), AUTH_TOKEN_SIZE); + StringRef headerAuthToken = computeAuthToken(reinterpret_cast(header), + sizeof(BlobCipherEncryptHeader), + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + arena); + memcpy(&header->multiAuthTokens.headerAuthToken[0], headerAuthToken.begin(), AUTH_TOKEN_SIZE); + } + } encryptBuf->setLogicalSize(plaintextLen); return encryptBuf; @@ -330,45 +383,137 @@ EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() { } } -// DecryptBlobCipher class methods +// DecryptBlobCipherAes256Ctr class methods -DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference key, const uint8_t* iv) - : ctx(EVP_CIPHER_CTX_new()) { +DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference tCipherKey, + Reference hCipherKey, + const uint8_t* iv) + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), + headerAuthTokenValidationDone(false), authTokensValidationDone(false) { if (ctx == nullptr) { throw encrypt_ops_error(); } if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_ctr(), nullptr, nullptr, nullptr)) { throw encrypt_ops_error(); } - if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, key.getPtr()->data(), iv)) { + if (!EVP_DecryptInit_ex(ctx, nullptr, nullptr, tCipherKey.getPtr()->data(), iv)) { throw encrypt_ops_error(); } } -void DecryptBlobCipherAes256Ctr::verifyEncryptBlobHeader(const uint8_t* ciphertext, - const int ciphertextLen, - const BlobCipherEncryptHeader& header, - Arena& arena) { - // validate header flag sanity - if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION || - header.flags.encryptMode != BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR) { - TraceEvent("VerifyEncryptBlobHeader") - .detail("HeaderVersion", header.flags.headerVersion) - .detail("HeaderMode", header.flags.encryptMode) - .detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION) - .detail("ExpectedMode", BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR); - throw encrypt_header_metadata_mismatch(); +void DecryptBlobCipherAes256Ctr::verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena) { + if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI) { + // NoneAuthToken mode; no authToken is generated; nothing to do + // SingleAuthToken mode; verification will happen as part of decryption. 
+ return; } - // encrypted byte checksum sanity; protection against data bit-rot/flip. - BlobCipherChecksum computed = computeEncryptChecksum(ciphertext, ciphertextLen, header.salt, arena); - if (computed != header.ciphertextChecksum) { - TraceEvent("VerifyEncryptBlobHeader_ChecksumMismatch") + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + + BlobCipherEncryptHeader headerCopy; + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + memset(reinterpret_cast(&headerCopy.multiAuthTokens.headerAuthToken), 0, AUTH_TOKEN_SIZE); + StringRef computedHeaderAuthToken = computeAuthToken(reinterpret_cast(&headerCopy), + sizeof(BlobCipherEncryptHeader), + headerCipherKey->rawCipher(), + AES_256_KEY_LENGTH, + arena); + if (memcmp(&header.multiAuthTokens.headerAuthToken[0], computedHeaderAuthToken.begin(), AUTH_TOKEN_SIZE) != 0) { + TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch") .detail("HeaderVersion", header.flags.headerVersion) .detail("HeaderMode", header.flags.encryptMode) - .detail("CiphertextChecksum", header.ciphertextChecksum) - .detail("ComputedCiphertextChecksum", computed); - throw encrypt_header_checksum_mismatch(); + .detail("MultiAuthHeaderAuthToken", + StringRef(arena, &header.multiAuthTokens.headerAuthToken[0], AUTH_TOKEN_SIZE).toString()) + .detail("ComputedHeaderAuthToken", computedHeaderAuthToken.toString()); + throw encrypt_header_authtoken_mismatch(); + } + + headerAuthTokenValidationDone = true; +} + +void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + uint8_t* buff, + Arena& arena) { + // Header authToken not set for single auth-token mode. + ASSERT(!headerAuthTokenValidationDone); + + // prepare the payload {cipherText + encryptionHeader} + memcpy(&buff[0], ciphertext, ciphertextLen); + memcpy(&buff[ciphertextLen], reinterpret_cast(&header), sizeof(BlobCipherEncryptHeader)); + // ensure the 'authToken' is reset before computing the 'authentication token' + BlobCipherEncryptHeader* eHeader = (BlobCipherEncryptHeader*)(&buff[ciphertextLen]); + memset(reinterpret_cast(&eHeader->singleAuthToken), 0, 2 * AUTH_TOKEN_SIZE); + + StringRef computed = computeAuthToken( + buff, ciphertextLen + sizeof(BlobCipherEncryptHeader), headerCipherKey->rawCipher(), AES_256_KEY_LENGTH, arena); + if (memcmp(&header.singleAuthToken.authToken[0], computed.begin(), AUTH_TOKEN_SIZE) != 0) { + TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderMode", header.flags.encryptMode) + .detail("SingleAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString()) + .detail("ComputedSingleAuthToken", computed.toString()); + throw encrypt_header_authtoken_mismatch(); + } +} + +void DecryptBlobCipherAes256Ctr::verifyHeaderMultiAuthToken(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + uint8_t* buff, + Arena& arena) { + if (!headerAuthTokenValidationDone) { + verifyHeaderAuthToken(header, arena); + } + StringRef computedCipherTextAuthToken = + computeAuthToken(ciphertext, + ciphertextLen, + reinterpret_cast(&header.cipherTextDetails.salt), + sizeof(EncryptCipherRandomSalt), + arena); + if (memcmp(&header.multiAuthTokens.cipherTextAuthToken[0], computedCipherTextAuthToken.begin(), AUTH_TOKEN_SIZE) != + 0) { + TraceEvent("VerifyEncryptBlobHeader_AuthTokenMismatch") + 
.detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderMode", header.flags.encryptMode) + .detail("MultiAuthCipherTextAuthToken", + StringRef(arena, &header.multiAuthTokens.cipherTextAuthToken[0], AUTH_TOKEN_SIZE).toString()) + .detail("ComputedCipherTextAuthToken", computedCipherTextAuthToken.toString()); + throw encrypt_header_authtoken_mismatch(); + } +} + +void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + uint8_t* buff, + Arena& arena) { + if (header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE) { + verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, buff, arena); + } else { + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + verifyHeaderMultiAuthToken(ciphertext, ciphertextLen, header, buff, arena); + } + + authTokensValidationDone = true; +} + +void DecryptBlobCipherAes256Ctr::verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header) { + // validate header flag sanity + if (header.flags.headerVersion != EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION || + header.flags.encryptMode != ENCRYPT_CIPHER_MODE_AES_256_CTR || + !isEncryptHeaderAuthTokenModeValid((EncryptAuthTokenMode)header.flags.authTokenMode)) { + TraceEvent("VerifyEncryptBlobHeader") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("ExpectedVersion", EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION) + .detail("EncryptCipherMode", header.flags.encryptMode) + .detail("ExpectedCipherMode", ENCRYPT_CIPHER_MODE_AES_256_CTR) + .detail("EncryptHeaderAuthTokenMode", header.flags.authTokenMode); + throw encrypt_header_metadata_mismatch(); } } @@ -378,23 +523,37 @@ Reference DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert Arena& arena) { TEST(true); // Decrypting data with BlobCipher - verifyEncryptBlobHeader(ciphertext, ciphertextLen, header, arena); + verifyEncryptHeaderMetadata(header); + + if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && !headerCipherKey.isValid()) { + TraceEvent("Decrypt_InvalidHeaderCipherKey").detail("AuthTokenMode", header.flags.authTokenMode); + throw encrypt_ops_error(); + } + + const int allocSize = header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE + ? 
ciphertextLen + AES_BLOCK_SIZE + sizeof(BlobCipherEncryptHeader) + : ciphertextLen + AES_BLOCK_SIZE; + Reference decrypted = makeReference(allocSize, arena); + + if (header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + verifyAuthTokens(ciphertext, ciphertextLen, header, decrypted->begin(), arena); + ASSERT(authTokensValidationDone); + } - Reference decrypted = makeReference(ciphertextLen + AES_BLOCK_SIZE, arena); uint8_t* plaintext = decrypted->begin(); int bytesDecrypted{ 0 }; if (!EVP_DecryptUpdate(ctx, plaintext, &bytesDecrypted, ciphertext, ciphertextLen)) { TraceEvent("Decrypt_UpdateFailed") - .detail("BaseCipherId", header.baseCipherId) - .detail("EncryptDomainId", header.encryptDomainId); + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId); throw encrypt_ops_error(); } int finalBlobBytes{ 0 }; if (EVP_DecryptFinal_ex(ctx, plaintext + bytesDecrypted, &finalBlobBytes) <= 0) { TraceEvent("Decrypt_FinalFailed") - .detail("BaseCipherId", header.baseCipherId) - .detail("EncryptDomainId", header.encryptDomainId); + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("EncryptDomainId", header.cipherTextDetails.encryptDomainId); throw encrypt_ops_error(); } @@ -443,6 +602,18 @@ StringRef HmacSha256DigestGen::digest(const unsigned char* data, size_t len, Are return StringRef(digest, digestLen); } +StringRef computeAuthToken(const uint8_t* payload, + const int payloadLen, + const uint8_t* key, + const int keyLen, + Arena& arena) { + HmacSha256DigestGen hmacGenerator(key, keyLen); + StringRef digest = hmacGenerator.digest(payload, payloadLen, arena); + + ASSERT_GE(digest.size(), AUTH_TOKEN_SIZE); + return digest; +} + // Only used to link unit tests void forceLinkBlobCipherTests() {} @@ -453,41 +624,42 @@ void forceLinkBlobCipherTests() {} // 4. Inserting of 'non-identical' cipherKey (already cached) more than once works as desired. // 5. Validation encryption ops (correctness): // 5.1. Encyrpt a buffer followed by decryption of the buffer, validate the contents. -// 5.2. Simulate anomolies such as: EncyrptionHeader corruption, checkSum mismatch / encryptionMode mismatch etc. +// 5.2. Simulate anomalies such as: EncyrptionHeader corruption, authToken mismatch / encryptionMode mismatch etc. // 6. Cache cleanup // 6.1 cleanup cipherKeys by given encryptDomainId // 6.2. 
Cleanup all cached cipherKeys TEST_CASE("flow/BlobCipher") { TraceEvent("BlobCipherTest_Start").log(); + // Construct a dummy External Key Manager representation and populate with some keys class BaseCipher : public ReferenceCounted, NonCopyable { public: - BlobCipherDomainId domainId; + EncryptCipherDomainId domainId; int len; - BlobCipherBaseKeyId keyId; + EncryptCipherBaseKeyId keyId; std::unique_ptr key; - BaseCipher(const BlobCipherDomainId& dId, const BlobCipherBaseKeyId& kId) + BaseCipher(const EncryptCipherDomainId& dId, const EncryptCipherBaseKeyId& kId) : domainId(dId), len(deterministicRandom()->randomInt(AES_256_KEY_LENGTH / 2, AES_256_KEY_LENGTH + 1)), keyId(kId), key(std::make_unique(len)) { generateRandomData(key.get(), len); } }; - using BaseKeyMap = std::unordered_map>; - using DomainKeyMap = std::unordered_map; + using BaseKeyMap = std::unordered_map>; + using DomainKeyMap = std::unordered_map; DomainKeyMap domainKeyMap; - const BlobCipherDomainId minDomainId = 1; - const BlobCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5; - const BlobCipherBaseKeyId minBaseCipherKeyId = 100; - const BlobCipherBaseKeyId maxBaseCipherKeyId = + const EncryptCipherDomainId minDomainId = 1; + const EncryptCipherDomainId maxDomainId = deterministicRandom()->randomInt(minDomainId, minDomainId + 10) + 5; + const EncryptCipherBaseKeyId minBaseCipherKeyId = 100; + const EncryptCipherBaseKeyId maxBaseCipherKeyId = deterministicRandom()->randomInt(minBaseCipherKeyId, minBaseCipherKeyId + 50) + 15; for (int dId = minDomainId; dId <= maxDomainId; dId++) { for (int kId = minBaseCipherKeyId; kId <= maxBaseCipherKeyId; kId++) { domainKeyMap[dId].emplace(kId, makeReference(dId, kId)); } } - ASSERT(domainKeyMap.size() == maxDomainId); + ASSERT_EQ(domainKeyMap.size(), maxDomainId); // insert BlobCipher keys into BlobCipherKeyCache map and validate TraceEvent("BlobCipherTest_InsertKeys").log(); @@ -500,6 +672,11 @@ TEST_CASE("flow/BlobCipher") { baseCipher->domainId, baseCipher->keyId, baseCipher->key.get(), baseCipher->len); } } + // insert EncryptHeader BlobCipher key + Reference headerBaseCipher = makeReference(ENCRYPT_HEADER_DOMAIN_ID, 1); + cipherKeyCache.insertCipherKey( + headerBaseCipher->domainId, headerBaseCipher->keyId, headerBaseCipher->key.get(), headerBaseCipher->len); + TraceEvent("BlobCipherTest_InsertKeysDone").log(); // validate the cipherKey lookups work as desired @@ -509,13 +686,13 @@ TEST_CASE("flow/BlobCipher") { Reference cipherKey = cipherKeyCache.getCipherKey(baseCipher->domainId, baseCipher->keyId); ASSERT(cipherKey.isValid()); // validate common cipher properties - domainId, baseCipherId, baseCipherLen, rawBaseCipher - ASSERT(cipherKey->getBaseCipherId() == baseCipher->keyId); - ASSERT(cipherKey->getDomainId() == baseCipher->domainId); - ASSERT(cipherKey->getBaseCipherLen() == baseCipher->len); + ASSERT_EQ(cipherKey->getBaseCipherId(), baseCipher->keyId); + ASSERT_EQ(cipherKey->getDomainId(), baseCipher->domainId); + ASSERT_EQ(cipherKey->getBaseCipherLen(), baseCipher->len); // ensure that baseCipher matches with the cached information - ASSERT(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) == 0); + ASSERT_EQ(std::memcmp(cipherKey->rawBaseCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0); // validate the encryption derivation - ASSERT(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()) != 0); + 
ASSERT_NE(std::memcmp(cipherKey->rawCipher(), baseCipher->key.get(), cipherKey->getBaseCipherLen()), 0); } } TraceEvent("BlobCipherTest_LooksupDone").log(); @@ -548,6 +725,7 @@ TEST_CASE("flow/BlobCipher") { // Validate Encyrption ops Reference cipherKey = cipherKeyCache.getLatestCipherKey(minDomainId); + Reference headerCipherKey = cipherKeyCache.getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID); const int bufLen = deterministicRandom()->randomInt(786, 2127) + 512; uint8_t orgData[bufLen]; generateRandomData(&orgData[0], bufLen); @@ -556,68 +734,317 @@ TEST_CASE("flow/BlobCipher") { uint8_t iv[AES_256_IV_LENGTH]; generateRandomData(&iv[0], AES_256_IV_LENGTH); - // validate basic encrypt followed by decrypt operation - EncryptBlobCipherAes265Ctr encryptor(cipherKey, iv, AES_256_IV_LENGTH); - BlobCipherEncryptHeader header; - Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + BlobCipherEncryptHeader headerCopy; + // validate basic encrypt followed by decrypt operation for AUTH_MODE_NONE + { + TraceEvent("NoneAuthMode_Start").log(); - ASSERT(encrypted->getLogicalSize() == bufLen); - ASSERT(memcmp(&orgData[0], encrypted->begin(), bufLen) != 0); - ASSERT(header.flags.headerVersion == EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); - ASSERT(header.flags.encryptMode == BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR); + EncryptBlobCipherAes265Ctr encryptor( + cipherKey, Reference(), iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); - TraceEvent("BlobCipherTest_EncryptDone") - .detail("HeaderVersion", header.flags.headerVersion) - .detail("HeaderEncryptMode", header.flags.encryptMode) - .detail("DomainId", header.encryptDomainId) - .detail("BaseCipherId", header.baseCipherId) - .detail("HeaderChecksum", header.ciphertextChecksum); + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE); - Reference encyrptKey = cipherKeyCache.getCipherKey(header.encryptDomainId, header.baseCipherId); - ASSERT(encyrptKey->isEqual(cipherKey)); - DecryptBlobCipherAes256Ctr decryptor(encyrptKey, &header.iv[0]); - Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + TraceEvent("BlobCipherTest_EncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId); - ASSERT(decrypted->getLogicalSize() == bufLen); - ASSERT(memcmp(decrypted->begin(), &orgData[0], bufLen) == 0); + Reference tCipherKeyKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId); + ASSERT(tCipherKeyKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), &header.cipherTextDetails.iv[0]); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); - TraceEvent("BlobCipherTest_DecryptDone").log(); + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); - // induce encryption header corruption - headerVersion corrupted 
- header.flags.headerVersion += 1; - try { - decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); - } catch (Error& e) { - if (e.code() != error_code_encrypt_header_metadata_mismatch) { - throw; + TraceEvent("BlobCipherTest_DecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } } - header.flags.headerVersion -= 1; + + // induce encryption header corruption - encryptionMode corrupted + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encrypted buffer payload corruption + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor( + tCipherKeyKey, Reference(), &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + // No authToken, hence, no corruption detection supported + ASSERT(false); + } + + TraceEvent("NoneAuthMode_Done").log(); } - // induce encryption header corruption - encryptionMode corrupted - header.flags.encryptMode += 1; - try { - decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); - } catch (Error& e) { - if (e.code() != error_code_encrypt_header_metadata_mismatch) { - throw; + // validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_SINGLE + { + TraceEvent("SingleAuthMode_Start").log(); + + EncryptBlobCipherAes265Ctr encryptor( + cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + + TraceEvent("BlobCipherTest_EncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], 
AUTH_TOKEN_SIZE).toString()); + + Reference tCipherKeyKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId); + Reference hCipherKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId); + ASSERT(tCipherKeyKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTest_DecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } } - header.flags.encryptMode -= 1; + + // induce encryption header corruption - encryptionMode corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1); + headerCopy.singleAuthToken.authToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encrypted buffer payload corruption + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(encrypted->begin(), &temp[0], bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen - 1); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + TraceEvent("SingleAuthMode_Done").log(); } - // induce encryption header corruption - checksum mismatch - header.ciphertextChecksum += 1; - try { - decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); - } catch (Error& e) { - if (e.code() != error_code_encrypt_header_checksum_mismatch) { - throw; + // validate basic 
encrypt followed by decrypt operation for AUTH_TOKEN_MODE_MULTI + { + TraceEvent("MultiAuthMode_Start").log(); + + EncryptBlobCipherAes265Ctr encryptor( + cipherKey, headerCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + BlobCipherEncryptHeader header; + Reference encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + + ASSERT_EQ(encrypted->getLogicalSize(), bufLen); + ASSERT_NE(memcmp(&orgData[0], encrypted->begin(), bufLen), 0); + ASSERT_EQ(header.flags.headerVersion, EncryptBlobCipherAes265Ctr::ENCRYPT_HEADER_VERSION); + ASSERT_EQ(header.flags.encryptMode, ENCRYPT_CIPHER_MODE_AES_256_CTR); + ASSERT_EQ(header.flags.authTokenMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI); + + TraceEvent("BlobCipherTest_EncryptDone") + .detail("HeaderVersion", header.flags.headerVersion) + .detail("HeaderEncryptMode", header.flags.encryptMode) + .detail("DomainId", header.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_SIZE).toString()); + + Reference tCipherKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId, + header.cipherTextDetails.baseCipherId); + Reference hCipherKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId); + + ASSERT(tCipherKey->isEqual(cipherKey)); + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]); + Reference decrypted = decryptor.decrypt(encrypted->begin(), bufLen, header, arena); + + ASSERT_EQ(decrypted->getLogicalSize(), bufLen); + ASSERT_EQ(memcmp(decrypted->begin(), &orgData[0], bufLen), 0); + + TraceEvent("BlobCipherTest_DecryptDone").log(); + + // induce encryption header corruption - headerVersion corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.headerVersion += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } } - header.ciphertextChecksum -= 1; + + // induce encryption header corruption - encryptionMode corrupted + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + headerCopy.flags.encryptMode += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_metadata_mismatch) { + throw; + } + } + + // induce encryption header corruption - cipherText authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast(&headerCopy), + reinterpret_cast(&header), + sizeof(BlobCipherEncryptHeader)); + int hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE - 1); + headerCopy.multiAuthTokens.cipherTextAuthToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = 
+ } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encryption header corruption - header authToken mismatch + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + memcpy(reinterpret_cast<uint8_t*>(&headerCopy), + reinterpret_cast<const uint8_t*>(&header), + sizeof(BlobCipherEncryptHeader)); + hIdx = deterministicRandom()->randomInt(0, AUTH_TOKEN_SIZE); + headerCopy.multiAuthTokens.headerAuthToken[hIdx] += 1; + try { + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(encrypted->begin(), bufLen, headerCopy, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + // induce encrypted buffer payload corruption + try { + encrypted = encryptor.encrypt(&orgData[0], bufLen, &header, arena); + uint8_t temp[bufLen]; + memcpy(&temp[0], encrypted->begin(), bufLen); + int tIdx = deterministicRandom()->randomInt(0, bufLen); + temp[tIdx] += 1; + DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]); + decrypted = decryptor.decrypt(&temp[0], bufLen, header, arena); + ASSERT(false); // error expected + } catch (Error& e) { + if (e.code() != error_code_encrypt_header_authtoken_mismatch) { + throw; + } + } + + TraceEvent("MultiAuthMode_Done").log(); } // Validate dropping encyrptDomainId cached keys - const BlobCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId); + const EncryptCipherDomainId candidate = deterministicRandom()->randomInt(minDomainId, maxDomainId); cipherKeyCache.resetEncyrptDomainId(candidate); std::vector<Reference<BlobCipherKey>> cachedKeys = cipherKeyCache.getAllCiphers(candidate); ASSERT(cachedKeys.empty()); @@ -633,20 +1060,4 @@ TEST_CASE("flow/BlobCipher") { return Void(); } -BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload, - const int payloadLen, - const BlobCipherRandomSalt& salt, - Arena& arena) { - // FIPS compliance recommendation is to leverage cryptographic digest mechanism to generate checksum - // Leverage HMAC_SHA256 using header.randomSalt as the initialization 'key' for the hmac digest. - - HmacSha256DigestGen hmacGenerator((const uint8_t*)&salt, sizeof(salt)); - StringRef digest = hmacGenerator.digest(payload, payloadLen, arena); - ASSERT(digest.size() >= sizeof(BlobCipherChecksum)); - - BlobCipherChecksum checksum; - memcpy((uint8_t*)&checksum, digest.begin(), sizeof(BlobCipherChecksum)); - return checksum; -} - #endif // ENCRYPTION_ENABLED diff --git a/flow/BlobCipher.h b/flow/BlobCipher.h index 151e60efd0..8fc0242e2b 100644 --- a/flow/BlobCipher.h +++ b/flow/BlobCipher.h @@ -33,6 +33,7 @@ #if ENCRYPTION_ENABLED #include "flow/Arena.h" +#include "flow/EncryptUtils.h" #include "flow/FastRef.h" #include "flow/flow.h" #include "flow/xxhash.h" @@ -45,15 +46,6 @@ #define AES_256_KEY_LENGTH 32 #define AES_256_IV_LENGTH 16 -#define INVALID_DOMAIN_ID 0 -#define INVALID_CIPHER_KEY_ID 0 - -using BlobCipherDomainId = uint64_t; -using BlobCipherRandomSalt = uint64_t; -using BlobCipherBaseKeyId = uint64_t; -using BlobCipherChecksum = uint64_t; - -typedef enum { BLOB_CIPHER_ENCRYPT_MODE_NONE = 0, BLOB_CIPHER_ENCRYPT_MODE_AES_256_CTR = 1 } BlockCipherEncryptMode; // Encryption operations buffer management // Approach limits number of copies needed during encryption or decryption operations.
@@ -89,51 +81,94 @@ private: // This header is persisted along with encrypted buffer, it contains information necessary // to assist decrypting the buffers to serve read requests. // -// The total space overhead is 56 bytes. +// The total space overhead is 96 bytes. #pragma pack(push, 1) // exact fit - no padding typedef struct BlobCipherEncryptHeader { + static constexpr int headerSize = 96; union { struct { uint8_t size; // reading first byte is sufficient to determine header // length. ALWAYS THE FIRST HEADER ELEMENT. uint8_t headerVersion{}; uint8_t encryptMode{}; - uint8_t _reserved[5]{}; + uint8_t authTokenMode{}; + uint8_t _reserved[4]{}; } flags; uint64_t _padding{}; }; - // Encyrption domain boundary identifier. - BlobCipherDomainId encryptDomainId{}; - // BaseCipher encryption key identifier - BlobCipherBaseKeyId baseCipherId{}; - // Random salt - BlobCipherRandomSalt salt{}; - // Checksum of the encrypted buffer. It protects against 'tampering' of ciphertext as well 'bit rots/flips'. - BlobCipherChecksum ciphertextChecksum{}; - // Initialization vector used to encrypt the payload. - uint8_t iv[AES_256_IV_LENGTH]; - BlobCipherEncryptHeader(); + // Cipher text encryption information + struct { + // Encryption domain boundary identifier. + EncryptCipherDomainId encryptDomainId{}; + // BaseCipher encryption key identifier + EncryptCipherBaseKeyId baseCipherId{}; + // Random salt + EncryptCipherRandomSalt salt{}; + // Initialization vector used to encrypt the payload. + uint8_t iv[AES_256_IV_LENGTH]; + } cipherTextDetails; + + struct { + // Encryption domainId for the header + EncryptCipherDomainId encryptDomainId{}; + // BaseCipher encryption key identifier. + EncryptCipherBaseKeyId baseCipherId{}; + } cipherHeaderDetails; + + // Encryption header is stored as plaintext on persistent storage to assist reconstruction of cipher-key(s) for + // reads. The FIPS compliance recommendation is to leverage a cryptographic digest mechanism to generate an + // 'authentication token' (crypto-secure) to protect against malicious tampering and/or bit rot/flip scenarios. + + union { + // The encryption header supports two modes of generating 'authentication tokens': + // 1) SingleAuthTokenMode: the scheme generates a single crypto-secure auth token to protect the {cipherText + + // header} payload. The scheme is geared towards optimizing the cost of crypto-secure auth-token generation; + // however, on decryption the client needs to read 'header' + 'encrypted-buffer' to validate the 'auth-token'. + // The scheme is ideal for use cases where the payload represented by the encryptionHeader is not large and it + // is desirable to minimize the CPU/latency penalty due to crypto-secure ops, such as: CommitProxies encrypted + // inline transactions, StorageServer encrypting pages etc. + // 2) MultiAuthTokenMode: the scheme generates separate authTokens for the 'encrypted buffer' & the + // 'encryption-header'. The scheme is ideal where the payload represented by the encryptionHeader is large + // enough that it is desirable to avoid the upfront cost of reading the full 'encrypted buffer', compared to + // reading only the encryptionHeader and ensuring its sanity; for instance: backup-files. + + struct { + // Cipher text authentication token + uint8_t cipherTextAuthToken[AUTH_TOKEN_SIZE]{}; + uint8_t headerAuthToken[AUTH_TOKEN_SIZE]{}; + } multiAuthTokens; + struct { + uint8_t authToken[AUTH_TOKEN_SIZE]{}; + uint8_t _reserved[AUTH_TOKEN_SIZE]{}; + } singleAuthToken; + }; + + BlobCipherEncryptHeader() {} } BlobCipherEncryptHeader; #pragma pack(pop) +// Ensure no struct-packing issues +static_assert(sizeof(BlobCipherEncryptHeader) == BlobCipherEncryptHeader::headerSize, + "BlobCipherEncryptHeader size mismatch");
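The practical difference between the two modes shows up on the read path. A minimal illustrative sketch (not part of this patch; tCipherKey, hCipherKey, header, ciphertext, ciphertextLen and arena are assumed to be in scope, and verifyHeaderAuthToken is declared further down in this header):

    DecryptBlobCipherAes256Ctr decryptor(tCipherKey, hCipherKey, &header.cipherTextDetails.iv[0]);
    if (header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI) {
        // Validates multiAuthTokens.headerAuthToken using only the header; no payload bytes are
        // needed yet, so a reader of a large backup file can reject a tampered header early.
        decryptor.verifyHeaderAuthToken(header, arena);
    }
    // Fetch the ciphertext, then decrypt; the remaining auth-token validation happens inside decrypt().
    Reference<EncryptBuf> plaintext = decryptor.decrypt(ciphertext, ciphertextLen, header, arena);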
// This interface is in-memory representation of CipherKey used for encryption/decryption information. // It caches base encryption key properties as well as caches the 'derived encryption' key obtained by applying // HMAC-SHA-256 derivation technique. class BlobCipherKey : public ReferenceCounted<BlobCipherKey>, NonCopyable { public: - BlobCipherKey(const BlobCipherDomainId& domainId, - const BlobCipherBaseKeyId& baseCiphId, + BlobCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCiphId, const uint8_t* baseCiph, int baseCiphLen); uint8_t* data() const { return cipher.get(); } uint64_t getCreationTime() const { return creationTime; } - BlobCipherDomainId getDomainId() const { return encryptDomainId; } - BlobCipherRandomSalt getSalt() const { return randomSalt; } - BlobCipherBaseKeyId getBaseCipherId() const { return baseCipherId; } + EncryptCipherDomainId getDomainId() const { return encryptDomainId; } + EncryptCipherRandomSalt getSalt() const { return randomSalt; } + EncryptCipherBaseKeyId getBaseCipherId() const { return baseCipherId; } int getBaseCipherLen() const { return baseCipherLen; } uint8_t* rawCipher() const { return cipher.get(); } uint8_t* rawBaseCipher() const { return baseCipher.get(); } @@ -147,23 +182,23 @@ public: private: // Encryption domain boundary identifier - BlobCipherDomainId encryptDomainId; + EncryptCipherDomainId encryptDomainId; // Base encryption cipher key properties std::unique_ptr<uint8_t[]> baseCipher; int baseCipherLen; - BlobCipherBaseKeyId baseCipherId; + EncryptCipherBaseKeyId baseCipherId; // Random salt used for encryption cipher key derivation - BlobCipherRandomSalt randomSalt; + EncryptCipherRandomSalt randomSalt; // Creation timestamp for the derived encryption cipher key uint64_t creationTime; // Derived encryption cipher key std::unique_ptr<uint8_t[]> cipher; - void initKey(const BlobCipherDomainId& domainId, + void initKey(const EncryptCipherDomainId& domainId, const uint8_t* baseCiph, int baseCiphLen, - const BlobCipherBaseKeyId& baseCiphId, - const BlobCipherRandomSalt& salt); + const EncryptCipherBaseKeyId& baseCiphId, + const EncryptCipherRandomSalt& salt); void applyHmacSha256Derivation(); }; @@ -190,37 +225,45 @@ private: // required encryption key, however, CPs/SSs cache-miss would result in RPC to // EncryptKeyServer to refresh the desired encryption key. -using BlobCipherKeyIdCacheMap = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>; -using BlobCipherKeyIdCacheMapCItr = std::unordered_map<BlobCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator; +using BlobCipherKeyIdCacheMap = std::unordered_map<EncryptCipherBaseKeyId, Reference<BlobCipherKey>>; +using BlobCipherKeyIdCacheMapCItr = + std::unordered_map<EncryptCipherBaseKeyId, Reference<BlobCipherKey>>::const_iterator; struct BlobCipherKeyIdCache : ReferenceCounted<BlobCipherKeyIdCache> { public: BlobCipherKeyIdCache(); - explicit BlobCipherKeyIdCache(BlobCipherDomainId dId); + explicit BlobCipherKeyIdCache(EncryptCipherDomainId dId); // API returns the last inserted cipherKey. // If none exists, 'encrypt_key_not_found' is thrown. + Reference<BlobCipherKey> getLatestCipherKey(); + // API returns cipherKey corresponding to input 'baseCipherKeyId'.
// If none exists, 'encrypt_key_not_found' is thrown. - Reference<BlobCipherKey> getCipherByBaseCipherId(BlobCipherBaseKeyId baseCipherKeyId); + + Reference<BlobCipherKey> getCipherByBaseCipherId(EncryptCipherBaseKeyId baseCipherKeyId); + // API enables inserting base encryption cipher details to the BlobCipherKeyIdCache. // Given cipherKeys are immutable, attempting to re-insert same 'identical' cipherKey // is treated as a NOP (success), however, an attempt to update cipherKey would throw // 'encrypt_update_cipher' exception. - void insertBaseCipherKey(BlobCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen); + + void insertBaseCipherKey(EncryptCipherBaseKeyId baseCipherId, const uint8_t* baseCipher, int baseCipherLen); + // API cleanup the cache by dropping all cached cipherKeys void cleanup(); + // API returns list of all 'cached' cipherKeys std::vector<Reference<BlobCipherKey>> getAllCipherKeys(); private: - BlobCipherDomainId domainId; + EncryptCipherDomainId domainId; BlobCipherKeyIdCacheMap keyIdCache; - BlobCipherBaseKeyId latestBaseCipherKeyId; + EncryptCipherBaseKeyId latestBaseCipherKeyId; }; -using BlobCipherDomainCacheMap = std::unordered_map<BlobCipherDomainId, Reference<BlobCipherKeyIdCache>>; +using BlobCipherDomainCacheMap = std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKeyIdCache>>; class BlobCipherKeyCache : NonCopyable { public: @@ -228,21 +271,28 @@ public: // The cipherKeys are indexed using 'baseCipherId', given cipherKeys are immutable, // attempting to re-insert same 'identical' cipherKey is treated as a NOP (success), // however, an attempt to update cipherKey would throw 'encrypt_update_cipher' exception. - void insertCipherKey(const BlobCipherDomainId& domainId, - const BlobCipherBaseKeyId& baseCipherId, + + void insertCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId, const uint8_t* baseCipher, int baseCipherLen); // API returns the last insert cipherKey for a given encyryption domain Id. // If none exists, it would throw 'encrypt_key_not_found' exception. - Reference<BlobCipherKey> getLatestCipherKey(const BlobCipherDomainId& domainId); + + Reference<BlobCipherKey> getLatestCipherKey(const EncryptCipherDomainId& domainId); + // API returns cipherKey corresponding to {encryptionDomainId, baseCipherId} tuple. // If none exists, it would throw 'encrypt_key_not_found' exception. - Reference<BlobCipherKey> getCipherKey(const BlobCipherDomainId& domainId, const BlobCipherBaseKeyId& baseCipherId); + + Reference<BlobCipherKey> getCipherKey(const EncryptCipherDomainId& domainId, + const EncryptCipherBaseKeyId& baseCipherId); // API returns point in time list of all 'cached' cipherKeys for a given encryption domainId. - std::vector<Reference<BlobCipherKey>> getAllCiphers(const BlobCipherDomainId& domainId); + std::vector<Reference<BlobCipherKey>> getAllCiphers(const EncryptCipherDomainId& domainId); + // API enables dropping all 'cached' cipherKeys for a given encryption domain Id. // Useful to cleanup cache if an encryption domain gets removed/destroyed etc. - void resetEncyrptDomainId(const BlobCipherDomainId domainId); + + void resetEncyrptDomainId(const EncryptCipherDomainId domainId); static BlobCipherKeyCache& getInstance() { static BlobCipherKeyCache instance; @@ -262,14 +312,19 @@ private: // This interface enables data block encryption. An invocation to encrypt() will // do two things: // 1) generate encrypted ciphertext for given plaintext input. -// 2) generate BlobCipherEncryptHeader (including the 'header checksum') and persit for decryption on reads. +// 2) generate BlobCipherEncryptHeader (including the 'header authTokens') and persist for decryption on reads.
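Taken together, a minimal sketch of the intended write/read flow against these interfaces (illustrative only: dataDomainId, baseCipherId, baseCipher, headerBaseCipherId, headerBaseCipher, iv, plaintext, plaintextLen and arena stand in for values a real caller would obtain elsewhere, e.g. from a key-management service):

    BlobCipherKeyCache& cipherKeyCache = BlobCipherKeyCache::getInstance();
    cipherKeyCache.insertCipherKey(dataDomainId, baseCipherId, baseCipher, baseCipherLen);
    cipherKeyCache.insertCipherKey(ENCRYPT_HEADER_DOMAIN_ID, headerBaseCipherId, headerBaseCipher, headerBaseCipherLen);

    // Write path: encrypt with the latest keys; the header records domain ids, cipher ids and the IV.
    Reference<BlobCipherKey> tCipherKey = cipherKeyCache.getLatestCipherKey(dataDomainId);
    Reference<BlobCipherKey> hCipherKey = cipherKeyCache.getLatestCipherKey(ENCRYPT_HEADER_DOMAIN_ID);
    BlobCipherEncryptHeader header;
    EncryptBlobCipherAes265Ctr encryptor(
        tCipherKey, hCipherKey, iv, AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE);
    Reference<EncryptBuf> encrypted = encryptor.encrypt(plaintext, plaintextLen, &header, arena);

    // Read path: the persisted header alone is sufficient to look the cipher keys back up.
    Reference<BlobCipherKey> tKey = cipherKeyCache.getCipherKey(header.cipherTextDetails.encryptDomainId,
                                                                header.cipherTextDetails.baseCipherId);
    Reference<BlobCipherKey> hKey = cipherKeyCache.getCipherKey(header.cipherHeaderDetails.encryptDomainId,
                                                                header.cipherHeaderDetails.baseCipherId);
    DecryptBlobCipherAes256Ctr decryptor(tKey, hKey, &header.cipherTextDetails.iv[0]);
    Reference<EncryptBuf> decrypted = decryptor.decrypt(encrypted->begin(), plaintextLen, header, arena);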
class EncryptBlobCipherAes265Ctr final : NonCopyable, public ReferenceCounted<EncryptBlobCipherAes265Ctr> { public: static constexpr uint8_t ENCRYPT_HEADER_VERSION = 1; - EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> key, const uint8_t* iv, const int ivLen); + EncryptBlobCipherAes265Ctr(Reference<BlobCipherKey> tCipherKey, + Reference<BlobCipherKey> hCipherKey, + const uint8_t* iv, + const int ivLen, + const EncryptAuthTokenMode mode); ~EncryptBlobCipherAes265Ctr(); + Reference<EncryptBuf> encrypt(const uint8_t* plaintext, const int plaintextLen, BlobCipherEncryptHeader* header, @@ -277,7 +332,9 @@ public: private: EVP_CIPHER_CTX* ctx; - Reference<BlobCipherKey> cipherKey; + Reference<BlobCipherKey> textCipherKey; + Reference<BlobCipherKey> headerCipherKey; + EncryptAuthTokenMode authTokenMode; uint8_t iv[AES_256_IV_LENGTH]; }; @@ -286,20 +343,44 @@ private: class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted<DecryptBlobCipherAes256Ctr> { public: - DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> key, const uint8_t* iv); + DecryptBlobCipherAes256Ctr(Reference<BlobCipherKey> tCipherKey, + Reference<BlobCipherKey> hCipherKey, + const uint8_t* iv); ~DecryptBlobCipherAes256Ctr(); + Reference<EncryptBuf> decrypt(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header, Arena&); + // Enable caller to validate encryption header auth-token (if available) without needing to read the full encrypted + // payload. The call is a NOP unless header.flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI. + + void verifyHeaderAuthToken(const BlobCipherEncryptHeader& header, Arena& arena); + private: EVP_CIPHER_CTX* ctx; + Reference<BlobCipherKey> textCipherKey; + Reference<BlobCipherKey> headerCipherKey; + bool headerAuthTokenValidationDone; + bool authTokensValidationDone; - void verifyEncryptBlobHeader(const uint8_t* cipherText, - const int ciphertextLen, - const BlobCipherEncryptHeader& header, - Arena& arena); + void verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header); + void verifyAuthTokens(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + uint8_t* buff, + Arena& arena); + void verifyHeaderSingleAuthToken(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + uint8_t* buff, + Arena& arena); + void verifyHeaderMultiAuthToken(const uint8_t* ciphertext, + const int ciphertextLen, + const BlobCipherEncryptHeader& header, + uint8_t* buff, + Arena& arena); }; class HmacSha256DigestGen final : NonCopyable { @@ -313,9 +394,10 @@ private: HMAC_CTX* ctx; }; -BlobCipherChecksum computeEncryptChecksum(const uint8_t* payload, - const int payloadLen, - const BlobCipherRandomSalt& salt, - Arena& arena); +StringRef computeAuthToken(const uint8_t* payload, + const int payloadLen, + const uint8_t* key, + const int keyLen, + Arena& arena); #endif // ENCRYPTION_ENABLED
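The removed checksum helper is superseded by computeAuthToken above. A brief sketch of the assumed call shape (keying the digest with the derived header cipher key and truncating to AUTH_TOKEN_SIZE are illustrative choices here, not contracts of this header):

    // HMAC-SHA-256 digest over an arbitrary payload, keyed with caller-supplied key material.
    StringRef digest = computeAuthToken(payload, payloadLen, hCipherKey->rawCipher(), AES_256_KEY_LENGTH, arena);
    // A caller persisting into BlobCipherEncryptHeader would copy at most AUTH_TOKEN_SIZE bytes.
    memcpy(&header.singleAuthToken.authToken[0], digest.begin(), AUTH_TOKEN_SIZE);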
diff --git a/flow/EncryptUtils.h b/flow/EncryptUtils.h new file mode 100644 index 0000000000..e386602b5b --- /dev/null +++ b/flow/EncryptUtils.h @@ -0,0 +1,66 @@ +/* + * EncryptUtils.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ENCRYPT_UTILS_H +#define ENCRYPT_UTILS_H +#pragma once + +#include <cstdint> +#include <limits> + +#define ENCRYPT_INVALID_DOMAIN_ID 0 +#define ENCRYPT_INVALID_CIPHER_KEY_ID 0 + +#define AUTH_TOKEN_SIZE 16 + +#define SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID -1 +#define ENCRYPT_HEADER_DOMAIN_ID -2 + +using EncryptCipherDomainId = int64_t; +using EncryptCipherBaseKeyId = uint64_t; +using EncryptCipherRandomSalt = uint64_t; + +typedef enum { + ENCRYPT_CIPHER_MODE_NONE = 0, + ENCRYPT_CIPHER_MODE_AES_256_CTR = 1, + ENCRYPT_CIPHER_MODE_LAST = 2 +} EncryptCipherMode; + +static_assert(EncryptCipherMode::ENCRYPT_CIPHER_MODE_LAST <= std::numeric_limits<uint8_t>::max(), + "EncryptCipherMode value overflow"); + +// EncryptionHeader authentication modes +// 1. NONE - No 'authentication token' generation needed for EncryptionHeader i.e. no protection against header OR +// cipherText 'tampering' and/or bit rot/flip corruptions. +// 2. Single/Multi - Encryption header would generate one or more 'authentication tokens' to protect the header against +// 'tampering' and/or bit rot/flip corruptions. Refer to BlobCipher.h for detailed usage recommendations. +// 3. LAST - Invalid mode, used for static asserts. + +typedef enum { + ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE = 0, + ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE = 1, + ENCRYPT_HEADER_AUTH_TOKEN_MODE_MULTI = 2, + ENCRYPT_HEADER_AUTH_TOKEN_LAST = 3 // Always the last element +} EncryptAuthTokenMode; + +static_assert(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_LAST <= std::numeric_limits<uint8_t>::max(), + "EncryptHeaderAuthToken value overflow"); + +#endif diff --git a/flow/actorcompiler.h b/flow/actorcompiler.h index 1ed8c5391c..3a60aa3588 100644 --- a/flow/actorcompiler.h +++ b/flow/actorcompiler.h @@ -73,3 +73,11 @@ T waitNext(const FutureStream<T>&); #ifdef _MSC_VER #pragma warning(disable : 4355) // 'this' : used in base member initializer list #endif + +// Currently, #ifdef can't be used inside actors, so define no-op versions of these valgrind +// functions if valgrind is not defined +#ifndef VALGRIND +#define VALGRIND_MAKE_MEM_UNDEFINED(x, y) +#define VALGRIND_MAKE_MEM_DEFINED(x, y) +#define VALGRIND_CHECK_MEM_IS_DEFINED(x, y) 0 +#endif
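The actorcompiler.h hunk exists because #ifdef cannot appear inside actor bodies, so actor code needs callable no-op stand-ins when VALGRIND is not defined. An illustrative actor (not part of the patch) relying on that guarantee:

    ACTOR Future<Void> checkDecrypted(Reference<EncryptBuf> buf) {
        // Under a VALGRIND build this expands to the real client request and flags reads of
        // undefined bytes; in normal builds it compiles down to the no-op defined above.
        VALGRIND_CHECK_MEM_IS_DEFINED(buf->begin(), buf->getLogicalSize());
        return Void();
    }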
diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 5e2e8b1758..1a054ce43e 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -88,7 +88,14 @@ ERROR( blob_granule_transaction_too_old, 1064, "Read version is older than blob ERROR( blob_manager_replaced, 1065, "This blob manager has been replaced." ) ERROR( change_feed_popped, 1066, "Tried to read a version older than what has been popped from the change feed" ) ERROR( remote_kvs_cancelled, 1067, "The remote key-value store is cancelled" ) -ERROR( stale_version_vector, 1068, "Client version vector is stale" ) +ERROR( page_header_wrong_page_id, 1068, "Page header does not match location on disk" ) +ERROR( page_header_checksum_failed, 1069, "Page header checksum failed" ) +ERROR( page_header_version_not_supported, 1070, "Page header version is not supported" ) +ERROR( page_encoding_not_supported, 1071, "Page encoding type is not supported or not valid" ) +ERROR( page_decoding_failed, 1072, "Page content decoding failed" ) +ERROR( unexpected_encoding_type, 1073, "Unexpected page encoding type" ) +ERROR( encryption_key_not_found, 1074, "Encryption key not found" ) +ERROR( stale_version_vector, 1075, "Client version vector is stale" ) ERROR( broken_promise, 1100, "Broken promise" ) ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" ) @@ -291,14 +298,14 @@ ERROR( snap_log_anti_quorum_unsupported, 2507, "Unsupported when log anti quorum ERROR( snap_with_recovery_unsupported, 2508, "Cluster recovery during snapshot operation not supported") ERROR( snap_invalid_uid_string, 2509, "The given uid string is not a 32-length hex string") -// 3XXX - Encryption operations errors -ERROR( encrypt_ops_error, 3000, "Encryption operation error") -ERROR( encrypt_header_metadata_mismatch, 3001, "Encryption header metadata mismatch") -ERROR( encrypt_key_not_found, 3002, "Expected encryption key is missing") -ERROR( encrypt_key_ttl_expired, 3003, "Expected encryption key TTL has expired") -ERROR( encrypt_header_checksum_mismatch, 3004, "Encryption header checksum mismatch") -ERROR( encrypt_update_cipher, 3005, "Attempt to update encryption cipher key") -ERROR( encrypt_invalid_id, 3006, "Invalid encryption domainId or encryption cipher key id") +// 27XX - Encryption operations errors +ERROR( encrypt_ops_error, 2700, "Encryption operation error") +ERROR( encrypt_header_metadata_mismatch, 2701, "Encryption header metadata mismatch") +ERROR( encrypt_key_not_found, 2702, "Expected encryption key is missing") +ERROR( encrypt_key_ttl_expired, 2703, "Expected encryption key TTL has expired") +ERROR( encrypt_header_authtoken_mismatch, 2704, "Encryption header authentication token mismatch") +ERROR( encrypt_update_cipher, 2705, "Attempt to update encryption cipher key") +ERROR( encrypt_invalid_id, 2706, "Invalid encryption domainId or encryption cipher key id") // 4xxx Internal errors (those that should be generated only by bugs) are decimal 4xxx ERROR( unknown_error, 4000, "An unknown error occurred" ) // C++ exception not of type Error
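With encryption errors renumbered into the 27XX range, callers can separate tampering from metadata and key-lookup failures by error code. A hedged sketch of the caller-side pattern, mirroring the unit tests above (decryptor, ciphertext, ciphertextLen, header and arena assumed in scope):

    try {
        Reference<EncryptBuf> decrypted = decryptor.decrypt(ciphertext, ciphertextLen, header, arena);
    } catch (Error& e) {
        if (e.code() == error_code_encrypt_header_authtoken_mismatch) {
            // ciphertext or header tampered with, or bit rot; treat as data corruption
        } else if (e.code() == error_code_encrypt_header_metadata_mismatch) {
            // unexpected header version or cipher mode
        } else if (e.code() == error_code_encrypt_key_not_found) {
            // cipher key for {encryptDomainId, baseCipherId} not cached; fetch keys and retry
        } else {
            throw;
        }
    }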
diff --git a/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.txt b/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.txt index 4f80e50ebb..569a05ef64 100644 --- a/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.txt +++ b/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.txt @@ -1,3 +1,5 @@ +storageEngineExcludeTypes=3 + logAntiQuorum = 0 testTitle=SubmitBackup diff --git a/tests/restarting/from_7.0.0/SnapTestAttrition-1.txt b/tests/restarting/from_7.0.0/SnapTestAttrition-1.txt index 401a075c0d..a4f26975c7 100644 --- a/tests/restarting/from_7.0.0/SnapTestAttrition-1.txt +++ b/tests/restarting/from_7.0.0/SnapTestAttrition-1.txt @@ -1,3 +1,5 @@ +storageEngineExcludeTypes=3 + ;write 1000 Keys ending with even numbers testTitle=SnapTestPre clearAfterTest=false diff --git a/tests/restarting/from_7.0.0/SnapTestRestart-1.txt b/tests/restarting/from_7.0.0/SnapTestRestart-1.txt index 319a617628..ae229b0458 100644 --- a/tests/restarting/from_7.0.0/SnapTestRestart-1.txt +++ b/tests/restarting/from_7.0.0/SnapTestRestart-1.txt @@ -1,3 +1,5 @@ +storageEngineExcludeTypes=3 + ;write 1000 Keys ending with even numbers testTitle=SnapTestPre clearAfterTest=false diff --git a/tests/restarting/from_7.0.0/SnapTestSimpleRestart-1.txt b/tests/restarting/from_7.0.0/SnapTestSimpleRestart-1.txt index 2416f27d29..118e00a16b 100644 --- a/tests/restarting/from_7.0.0/SnapTestSimpleRestart-1.txt +++ b/tests/restarting/from_7.0.0/SnapTestSimpleRestart-1.txt @@ -1,3 +1,5 @@ +storageEngineExcludeTypes=3 + ;write 1000 Keys ending with even number testTitle=SnapSimplePre clearAfterTest=false diff --git a/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml b/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml index 7604a8d017..f0f2141778 100644 --- a/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml +++ b/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml @@ -1,3 +1,5 @@ +storageEngineExcludeTypes=3 + [[test]] testTitle = 'SubmitBackup' simBackupAgents= 'BackupToFile' diff --git a/tests/restarting/from_7.1.0/ConfigureStorageMigrationTestRestart-2.toml b/tests/restarting/from_7.1.0/ConfigureStorageMigrationTestRestart-2.toml index 9fa2c4c784..e9ae14f0c3 100644 --- a/tests/restarting/from_7.1.0/ConfigureStorageMigrationTestRestart-2.toml +++ b/tests/restarting/from_7.1.0/ConfigureStorageMigrationTestRestart-2.toml @@ -10,6 +10,7 @@ waitForQuiescenceBegin=false testName = 'ConfigureDatabase' testDuration = 300.0 waitStoreTypeCheck = true + storageMigrationCompatibleConf = true [[test.workload]] testName = 'RandomClogging' diff --git a/tests/restarting/to_7.0.0/CycleTestRestart-1.txt b/tests/restarting/to_7.0.0/CycleTestRestart-1.txt index 78c3c64e04..4b5c917a1d 100644 --- a/tests/restarting/to_7.0.0/CycleTestRestart-1.txt +++ b/tests/restarting/to_7.0.0/CycleTestRestart-1.txt @@ -1,4 +1,4 @@ -storageEngineExcludeTypes=-1,-2 +storageEngineExcludeTypes=-1,-2,3 maxTLogVersion=6 disableTss=true disableHostname=true diff --git a/tests/restarting/to_7.1.0/ConfigureStorageMigrationTestRestart-2.toml b/tests/restarting/to_7.1.0/ConfigureStorageMigrationTestRestart-2.toml index 9fa2c4c784..e9ae14f0c3 100644 --- a/tests/restarting/to_7.1.0/ConfigureStorageMigrationTestRestart-2.toml +++ b/tests/restarting/to_7.1.0/ConfigureStorageMigrationTestRestart-2.toml @@ -10,6 +10,7 @@ waitForQuiescenceBegin=false testName = 'ConfigureDatabase' testDuration = 300.0 waitStoreTypeCheck = true + storageMigrationCompatibleConf = true [[test.workload]] testName = 'RandomClogging'