Split Redwood page by tenant boundary (#7979)

Redwood encrypts data at page granularity. To support per-tenant encryption (i.e. each tenant's data is encrypted with a different set of cipher keys), we need to split Redwood pages at tenant boundaries. Moreover, the page split also needs to handle the different tenant modes:
* tenantMode = disabled: do not split pages by tenant; all data is encrypted using the default encryption domain.
* tenantMode = required: look at the prefix of the keys, split pages by tenant accordingly, and encrypt each page in the corresponding tenant-specific encryption domain.
* tenantMode = optional: some key ranges may not map to a tenant. In addition to looking at the key prefix, the key provider also queries the tenant prefix index. For prefixes not found in the tenant prefix index, the corresponding keys should be encrypted using the default encryption domain.

The change also enforces that the data for each tenant forms a subtree, and that the key of the link to that subtree is exactly the tenant prefix.

This PR builds on top of #8172 and uses the IPageEncryptionKeyProvider interface added there. Changes:
* In `writePages` and `splitPages`, query the key provider to split pages by encryption domain.
* In `commitSubtree`, when doing an in-place update (to either a leaf or an internal page), check whether the entry being inserted belongs to the same encryption domain as the existing data of the page. If not, fall back to a full page rebuild, where `writePages` will handle the page split.
* When updating the root, check whether it is encrypted using a non-default (i.e. tenant-specific) domain. If so, add a redundant root node, which will be encrypted with the default encryption domain.

Tested with 100K runs of the `Lredwood/correctness/btree` unit test, which uses `RandomEncryptionKeyProvider`; that provider is updated to support and generate random encryption domains with 4-byte domain prefixes.
This commit is contained in:
Yi Wu 2022-10-04 12:53:55 -07:00 committed by GitHub
parent 55b880432e
commit 5c549601d2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 608 additions and 152 deletions

View File

@ -18,37 +18,40 @@
* limitations under the License. * limitations under the License.
*/ */
#include "fmt/format.h" #include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h" #include "fdbclient/FDBTypes.h"
#include "fdbclient/Tuple.h"
#include "fdbrpc/ContinuousSample.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/DeltaTree.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/IPager.h"
#include "fdbserver/Knobs.h" #include "fdbserver/Knobs.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
#include "flow/Error.h" #include "flow/Error.h"
#include "flow/FastRef.h"
#include "flow/flow.h"
#include "flow/genericactors.actor.h"
#include "flow/Histogram.h"
#include "flow/IAsyncFile.h"
#include "flow/IRandom.h" #include "flow/IRandom.h"
#include "flow/Knobs.h" #include "flow/Knobs.h"
#include "flow/ObjectSerializer.h" #include "flow/ObjectSerializer.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/Histogram.h"
#include <limits>
#include <random>
#include "fdbrpc/ContinuousSample.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/IPager.h"
#include "fdbclient/Tuple.h"
#include "flow/serialize.h" #include "flow/serialize.h"
#include "flow/genericactors.actor.h" #include "flow/Trace.h"
#include "flow/UnitTest.h" #include "flow/UnitTest.h"
#include "flow/IAsyncFile.h" #include "fmt/format.h"
#include "flow/ActorCollection.h"
#include <boost/intrusive/list.hpp>
#include <cinttypes>
#include <limits>
#include <map> #include <map>
#include <random>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "fdbclient/CommitTransaction.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/DeltaTree.h"
#include <string.h>
#include <cinttypes>
#include <boost/intrusive/list.hpp>
#include "flow/actorcompiler.h" // must be last include #include "flow/actorcompiler.h" // must be last include
#define REDWOOD_DEBUG 0 #define REDWOOD_DEBUG 0
@ -4768,6 +4771,8 @@ struct DecodeBoundaryVerifier {
struct DecodeBoundaries { struct DecodeBoundaries {
Key lower; Key lower;
Key upper; Key upper;
unsigned int height;
Optional<int64_t> domainId;
bool empty() const { return lower.empty() && upper.empty(); } bool empty() const { return lower.empty() && upper.empty(); }
}; };
@ -4776,6 +4781,11 @@ struct DecodeBoundaryVerifier {
std::vector<Key> boundarySamples; std::vector<Key> boundarySamples;
int boundarySampleSize = 1000; int boundarySampleSize = 1000;
int boundaryPopulation = 0; int boundaryPopulation = 0;
Reference<IPageEncryptionKeyProvider> keyProvider;
// Sample rate of pages to be scanned to verify if all entries in the page meet domain prefix requirement.
double domainPrefixScanProbability = 0.01;
uint64_t domainPrefixScanCount = 0;
static DecodeBoundaryVerifier* getVerifier(std::string name) { static DecodeBoundaryVerifier* getVerifier(std::string name) {
static std::map<std::string, DecodeBoundaryVerifier> verifiers; static std::map<std::string, DecodeBoundaryVerifier> verifiers;
@ -4786,6 +4796,8 @@ struct DecodeBoundaryVerifier {
return nullptr; return nullptr;
} }
void setKeyProvider(Reference<IPageEncryptionKeyProvider> kp) { keyProvider = kp; }
void sampleBoundary(Key b) { void sampleBoundary(Key b) {
if (boundaryPopulation <= boundarySampleSize) { if (boundaryPopulation <= boundarySampleSize) {
boundarySamples.push_back(b); boundarySamples.push_back(b);
@ -4802,21 +4814,53 @@ struct DecodeBoundaryVerifier {
return boundarySamples[deterministicRandom()->randomInt(0, boundarySamples.size())]; return boundarySamples[deterministicRandom()->randomInt(0, boundarySamples.size())];
} }
void update(BTreeNodeLinkRef id, Version v, Key lowerBound, Key upperBound) { bool update(BTreeNodeLinkRef id,
Version v,
Key lowerBound,
Key upperBound,
unsigned int height,
Optional<int64_t> domainId) {
sampleBoundary(lowerBound); sampleBoundary(lowerBound);
sampleBoundary(upperBound); sampleBoundary(upperBound);
debug_printf("decodeBoundariesUpdate %s %s '%s' to '%s'\n", debug_printf("decodeBoundariesUpdate %s %s '%s' to '%s', %u, %s\n",
::toString(id).c_str(), ::toString(id).c_str(),
::toString(v).c_str(), ::toString(v).c_str(),
lowerBound.printable().c_str(), lowerBound.printable().c_str(),
upperBound.printable().c_str()); upperBound.printable().c_str(),
height,
Traceable<decltype(domainId)>::toString(domainId).c_str());
if (domainId.present()) {
ASSERT(keyProvider && keyProvider->enableEncryptionDomain());
// Temporarily disabling the check, since if a tenant is removed, where the key provider
// would not find the domain, the data for the tenant may still be in Redwood and being read.
// TODO(yiwu): re-enable the check.
/*
if (domainId.get() != keyProvider->getDefaultEncryptionDomainId() &&
!keyProvider->keyFitsInDomain(domainId.get(), lowerBound, false)) {
fprintf(stderr,
"Page lower bound not in domain: %s %s, domain id %s, lower bound '%s'\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
lowerBound.printable().c_str());
return false;
}
*/
}
auto& b = boundariesByPageID[id.front()][v]; auto& b = boundariesByPageID[id.front()][v];
ASSERT(b.empty()); ASSERT(b.empty());
b = { lowerBound, upperBound }; b = { lowerBound, upperBound, height, domainId };
return true;
} }
bool verify(LogicalPageID id, Version v, Key lowerBound, Key upperBound) { bool verify(LogicalPageID id,
Version v,
Key lowerBound,
Key upperBound,
Optional<int64_t> domainId,
BTreePage::BinaryTree::Cursor& cursor) {
auto i = boundariesByPageID.find(id); auto i = boundariesByPageID.find(id);
ASSERT(i != boundariesByPageID.end()); ASSERT(i != boundariesByPageID.end());
ASSERT(!i->second.empty()); ASSERT(!i->second.empty());
@ -4835,10 +4879,66 @@ struct DecodeBoundaryVerifier {
b->second.upper.printable().c_str()); b->second.upper.printable().c_str());
return false; return false;
} }
if (!b->second.domainId.present()) {
ASSERT(!keyProvider || !keyProvider->enableEncryptionDomain());
ASSERT(!domainId.present());
} else {
ASSERT(keyProvider->enableEncryptionDomain());
if (b->second.domainId != domainId) {
fprintf(stderr,
"Page encrypted with incorrect domain: %s %s, using %s, written %s\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
::toString(b->second.domainId).c_str());
return false;
}
// Temporarily disabling the check, since if a tenant is removed, where the key provider
// would not find the domain, the data for the tenant may still be in Redwood and being read.
// TODO(yiwu): re-enable the check.
/*
ASSERT(domainId.present());
auto checkKeyFitsInDomain = [&]() -> bool {
if (!keyProvider->keyFitsInDomain(domainId.get(), cursor.get().key, b->second.height > 1)) {
fprintf(stderr,
"Encryption domain mismatch on %s, %s, domain: %s, key %s\n",
::toString(id).c_str(),
::toString(v).c_str(),
::toString(domainId).c_str(),
cursor.get().key.printable().c_str());
return false;
}
return true;
};
if (domainId.get() != keyProvider->getDefaultEncryptionDomainId()) {
cursor.moveFirst();
if (cursor.valid() && !checkKeyFitsInDomain()) {
return false;
}
cursor.moveLast();
if (cursor.valid() && !checkKeyFitsInDomain()) {
return false;
}
} else {
if (deterministicRandom()->random01() < domainPrefixScanProbability) {
cursor.moveFirst();
while (cursor.valid()) {
if (!checkKeyFitsInDomain()) {
return false;
}
cursor.moveNext();
}
domainPrefixScanCount++;
}
}
*/
}
return true; return true;
} }
void update(Version v, LogicalPageID oldID, LogicalPageID newID) { void updatePageId(Version v, LogicalPageID oldID, LogicalPageID newID) {
auto& old = boundariesByPageID[oldID]; auto& old = boundariesByPageID[oldID];
ASSERT(!old.empty()); ASSERT(!old.empty());
auto i = old.end(); auto i = old.end();
@ -5053,6 +5153,10 @@ public:
} }
m_pBoundaryVerifier = DecodeBoundaryVerifier::getVerifier(name); m_pBoundaryVerifier = DecodeBoundaryVerifier::getVerifier(name);
if (m_pBoundaryVerifier != nullptr) {
m_pBoundaryVerifier->setKeyProvider(m_keyProvider);
}
m_pDecodeCacheMemory = m_pager->getPageCachePenaltySource(); m_pDecodeCacheMemory = m_pager->getPageCachePenaltySource();
m_lazyClearActor = 0; m_lazyClearActor = 0;
m_init = init_impl(this); m_init = init_impl(this);
@ -5574,16 +5678,27 @@ private:
// Describes a range of a vector of records that should be built into a single BTreePage // Describes a range of a vector of records that should be built into a single BTreePage
struct PageToBuild { struct PageToBuild {
PageToBuild(int index, int blockSize, EncodingType t) PageToBuild(int index,
int blockSize,
EncodingType encodingType,
unsigned int height,
bool useEncryptionDomain,
bool splitByDomain,
IPageEncryptionKeyProvider* keyProvider)
: startIndex(index), count(0), pageSize(blockSize), : startIndex(index), count(0), pageSize(blockSize),
largeDeltaTree(pageSize > BTreePage::BinaryTree::SmallSizeLimit), blockSize(blockSize), blockCount(1), largeDeltaTree(pageSize > BTreePage::BinaryTree::SmallSizeLimit), blockSize(blockSize), blockCount(1),
kvBytes(0) { kvBytes(0), encodingType(encodingType), height(height), useEncryptionDomain(useEncryptionDomain),
splitByDomain(splitByDomain), keyProvider(keyProvider) {
// Subtrace Page header overhead, BTreePage overhead, and DeltaTree (BTreePage::BinaryTree) overhead. // Subtrace Page header overhead, BTreePage overhead, and DeltaTree (BTreePage::BinaryTree) overhead.
bytesLeft = ArenaPage::getUsableSize(blockSize, t) - sizeof(BTreePage) - sizeof(BTreePage::BinaryTree); bytesLeft =
ArenaPage::getUsableSize(blockSize, encodingType) - sizeof(BTreePage) - sizeof(BTreePage::BinaryTree);
} }
PageToBuild next(EncodingType t) { return PageToBuild(endIndex(), blockSize, t); } PageToBuild next() {
return PageToBuild(
endIndex(), blockSize, encodingType, height, useEncryptionDomain, splitByDomain, keyProvider);
}
int startIndex; // Index of the first record int startIndex; // Index of the first record
int count; // Number of records added to the page int count; // Number of records added to the page
@ -5595,6 +5710,16 @@ private:
int blockCount; // The number of blocks in pageSize int blockCount; // The number of blocks in pageSize
int kvBytes; // The amount of user key/value bytes added to the page int kvBytes; // The amount of user key/value bytes added to the page
EncodingType encodingType;
unsigned int height;
bool useEncryptionDomain;
bool splitByDomain;
IPageEncryptionKeyProvider* keyProvider;
Optional<int64_t> domainId = Optional<int64_t>();
size_t domainPrefixLength = 0;
bool canUseDefaultDomain = false;
// Number of bytes used by the generated/serialized BTreePage, including all headers // Number of bytes used by the generated/serialized BTreePage, including all headers
int usedBytes() const { return pageSize - bytesLeft; } int usedBytes() const { return pageSize - bytesLeft; }
@ -5614,17 +5739,18 @@ private:
int endIndex() const { return startIndex + count; } int endIndex() const { return startIndex + count; }
std::string toString() const { std::string toString() const {
return format( return format("{start=%d count=%d used %d/%d bytes (%.2f%% slack) kvBytes=%d blocks=%d blockSize=%d "
"{start=%d count=%d used %d/%d bytes (%.2f%% slack) kvBytes=%d blocks=%d blockSize=%d large=%d}", "large=%d, domain=%s}",
startIndex, startIndex,
count, count,
usedBytes(), usedBytes(),
pageSize, pageSize,
slackFraction() * 100, slackFraction() * 100,
kvBytes, kvBytes,
blockCount, blockCount,
blockSize, blockSize,
largeDeltaTree); largeDeltaTree,
::toString(domainId).c_str());
} }
// Move an item from a to b if a has 2 or more items and the item fits in b // Move an item from a to b if a has 2 or more items and the item fits in b
@ -5656,7 +5782,8 @@ private:
// Try to add a record of the given delta size to the page. // Try to add a record of the given delta size to the page.
// If force is true, the page will be expanded to make the record fit if needed. // If force is true, the page will be expanded to make the record fit if needed.
// Return value is whether or not the record was added to the page. // Return value is whether or not the record was added to the page.
bool addRecord(const RedwoodRecordRef& rec, int deltaSize, bool force) { bool addRecord(const RedwoodRecordRef& rec, const RedwoodRecordRef& nextRecord, int deltaSize, bool force) {
int nodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(largeDeltaTree); int nodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(largeDeltaTree);
// If the record doesn't fit and the page can't be expanded then return false // If the record doesn't fit and the page can't be expanded then return false
@ -5664,6 +5791,53 @@ private:
return false; return false;
} }
if (useEncryptionDomain) {
int64_t defaultDomainId = keyProvider->getDefaultEncryptionDomainId();
int64_t currentDomainId;
size_t prefixLength;
if (count == 0 || (splitByDomain && count > 0)) {
std::tie(currentDomainId, prefixLength) = keyProvider->getEncryptionDomain(rec.key, domainId);
}
if (count == 0) {
domainId = currentDomainId;
domainPrefixLength = prefixLength;
canUseDefaultDomain =
(height > 1 && (currentDomainId == defaultDomainId || prefixLength == rec.key.size()));
} else if (splitByDomain) {
ASSERT(domainId.present());
if (domainId == currentDomainId) {
// The new record falls in the same domain as the rest of the page.
// Since this is not the first record, the key must contain a non-prefix portion,
// so we cannot use the default domain the encrypt the page (unless domainId is the default
// domain).
if (domainId != defaultDomainId) {
ASSERT(prefixLength < rec.key.size());
canUseDefaultDomain = false;
}
} else if (canUseDefaultDomain &&
(currentDomainId == defaultDomainId ||
(prefixLength == rec.key.size() &&
(nextRecord.key.empty() ||
!nextRecord.key.startsWith(rec.key.substr(0, prefixLength)))))) {
// The new record meets one of the following conditions:
// 1. it falls in the default domain, or
// 2. its key contain only the domain prefix, and
// 2a. the following record doesn't fall in the same domain.
// In this case switch to use the default domain to encrypt the page.
// Condition 2a is needed, because if there are multiple records from the same domain,
// they need to form their own page(s).
domainId = defaultDomainId;
domainPrefixLength = 0;
} else {
// The new record doesn't fit in the same domain as the existing page.
return false;
}
} else {
ASSERT(domainPrefixLength < rec.key.size());
canUseDefaultDomain = false;
}
}
++count; ++count;
bytesLeft -= nodeSize; bytesLeft -= nodeSize;
kvBytes += rec.kvBytes(); kvBytes += rec.kvBytes();
@ -5686,6 +5860,12 @@ private:
} }
return true; return true;
} }
void finish() {
if (useEncryptionDomain && canUseDefaultDomain) {
domainId = keyProvider->getDefaultEncryptionDomainId();
}
}
}; };
// Scans a vector of records and decides on page split points, returning a vector of 1+ pages to build // Scans a vector of records and decides on page split points, returning a vector of 1+ pages to build
@ -5705,8 +5885,25 @@ private:
// Leaves can have just one record if it's large, but internal pages should have at least 4 // Leaves can have just one record if it's large, but internal pages should have at least 4
int minRecords = height == 1 ? 1 : 4; int minRecords = height == 1 ? 1 : 4;
double maxSlack = SERVER_KNOBS->REDWOOD_PAGE_REBUILD_MAX_SLACK; double maxSlack = SERVER_KNOBS->REDWOOD_PAGE_REBUILD_MAX_SLACK;
RedwoodRecordRef emptyRecord;
std::vector<PageToBuild> pages; std::vector<PageToBuild> pages;
// Whether encryption is used and we need to set encryption domain for a page.
bool useEncryptionDomain =
ArenaPage::isEncodingTypeEncrypted(m_encodingType) && m_keyProvider->enableEncryptionDomain();
// Whether we may need to split by encryption domain. It is mean to be an optimization to avoid
// unnecessary domain check and may not be exhaust all cases.
bool splitByDomain = false;
if (useEncryptionDomain && records.size() > 1) {
int64_t firstDomain = std::get<0>(m_keyProvider->getEncryptionDomain(records[0].key));
int64_t lastDomain = std::get<0>(m_keyProvider->getEncryptionDomain(records[records.size() - 1].key));
// If the two record falls in the same non-default domain, we know all the records fall in the
// same domain. Otherwise we may need to split pages by domain.
if (firstDomain != lastDomain || firstDomain == m_keyProvider->getDefaultEncryptionDomainId()) {
splitByDomain = true;
}
}
// deltaSizes contains pair-wise delta sizes for [lowerBound, records..., upperBound] // deltaSizes contains pair-wise delta sizes for [lowerBound, records..., upperBound]
std::vector<int> deltaSizes(records.size() + 1); std::vector<int> deltaSizes(records.size() + 1);
deltaSizes.front() = records.front().deltaSize(*lowerBound, prefixLen, true); deltaSizes.front() = records.front().deltaSize(*lowerBound, prefixLen, true);
@ -5715,28 +5912,34 @@ private:
deltaSizes[i] = records[i].deltaSize(records[i - 1], prefixLen, true); deltaSizes[i] = records[i].deltaSize(records[i - 1], prefixLen, true);
} }
PageToBuild p(0, m_blockSize, m_encodingType); PageToBuild p(
0, m_blockSize, m_encodingType, height, useEncryptionDomain, splitByDomain, m_keyProvider.getPtr());
for (int i = 0; i < records.size(); ++i) { for (int i = 0; i < records.size();) {
bool force = p.count < minRecords || p.slackFraction() > maxSlack; bool force = p.count < minRecords || p.slackFraction() > maxSlack;
debug_printf( if (i == 0 || p.count > 0) {
" before addRecord i=%d records=%d deltaSize=%d kvSize=%d force=%d pageToBuild=%s record=%s", debug_printf(" before addRecord i=%d records=%d deltaSize=%d kvSize=%d force=%d pageToBuild=%s "
i, "record=%s",
records.size(), i,
deltaSizes[i], records.size(),
records[i].kvBytes(), deltaSizes[i],
force, records[i].kvBytes(),
p.toString().c_str(), force,
records[i].toString(height == 1).c_str()); p.toString().c_str(),
records[i].toString(height == 1).c_str());
}
if (!p.addRecord(records[i], deltaSizes[i], force)) { if (!p.addRecord(records[i], i + 1 < records.size() ? records[i + 1] : emptyRecord, deltaSizes[i], force)) {
p.finish();
pages.push_back(p); pages.push_back(p);
p = p.next(m_encodingType); p = p.next();
p.addRecord(records[i], deltaSizes[i], true); } else {
i++;
} }
} }
if (p.count > 0) { if (p.count > 0) {
p.finish();
pages.push_back(p); pages.push_back(p);
} }
@ -5749,15 +5952,20 @@ private:
PageToBuild& a = pages[pages.size() - 2]; PageToBuild& a = pages[pages.size() - 2];
PageToBuild& b = pages.back(); PageToBuild& b = pages.back();
// While the last page page has too much slack and the second to last page // We can rebalance the two pages only if they are in the same encryption domain.
// has more than the minimum record count, shift a record from the second ASSERT(!useEncryptionDomain || (a.domainId.present() && b.domainId.present()));
// to last page to the last page. if (!useEncryptionDomain || a.domainId.get() == b.domainId.get()) {
while (b.slackFraction() > maxSlack && a.count > minRecords) {
int i = a.lastIndex(); // While the last page page has too much slack and the second to last page
if (!PageToBuild::shiftItem(a, b, deltaSizes[i], records[i].kvBytes())) { // has more than the minimum record count, shift a record from the second
break; // to last page to the last page.
while (b.slackFraction() > maxSlack && a.count > minRecords) {
int i = a.lastIndex();
if (!PageToBuild::shiftItem(a, b, deltaSizes[i], records[i].kvBytes())) {
break;
}
debug_printf(" After shifting i=%d: a=%s b=%s\n", i, a.toString().c_str(), b.toString().c_str());
} }
debug_printf(" After shifting i=%d: a=%s b=%s\n", i, a.toString().c_str(), b.toString().c_str());
} }
} }
@ -5780,8 +5988,13 @@ private:
// All records share the prefix shared by the lower and upper boundaries // All records share the prefix shared by the lower and upper boundaries
state int prefixLen = lowerBound->getCommonPrefixLen(*upperBound); state int prefixLen = lowerBound->getCommonPrefixLen(*upperBound);
// Whether encryption is used and we need to set encryption domain for a page.
state bool useEncryptionDomain =
ArenaPage::isEncodingTypeEncrypted(self->m_encodingType) && self->m_keyProvider->enableEncryptionDomain();
state std::vector<PageToBuild> pagesToBuild = state std::vector<PageToBuild> pagesToBuild =
self->splitPages(lowerBound, upperBound, prefixLen, entries, height); self->splitPages(lowerBound, upperBound, prefixLen, entries, height);
ASSERT(pagesToBuild.size() > 0);
debug_printf("splitPages returning %s\n", toString(pagesToBuild).c_str()); debug_printf("splitPages returning %s\n", toString(pagesToBuild).c_str());
// Lower bound of the page being added to // Lower bound of the page being added to
@ -5791,6 +6004,18 @@ private:
state int pageIndex; state int pageIndex;
if (useEncryptionDomain) {
ASSERT(pagesToBuild[0].domainId.present());
int64_t domainId = pagesToBuild[0].domainId.get();
// We need to make sure we use the domain prefix as the page lower bound, for the first page
// of a non-default domain on a level. That way we ensure that pages for a domain form a full subtree
// (i.e. have a single root) in the B-tree.
if (domainId != self->m_keyProvider->getDefaultEncryptionDomainId() &&
!self->m_keyProvider->keyFitsInDomain(domainId, pageLowerBound.key, false)) {
pageLowerBound = RedwoodRecordRef(entries[0].key.substr(0, pagesToBuild[0].domainPrefixLength));
}
}
for (pageIndex = 0; pageIndex < pagesToBuild.size(); ++pageIndex) { for (pageIndex = 0; pageIndex < pagesToBuild.size(); ++pageIndex) {
debug_printf("building page %d of %zu %s\n", debug_printf("building page %d of %zu %s\n",
pageIndex + 1, pageIndex + 1,
@ -5798,6 +6023,30 @@ private:
pagesToBuild[pageIndex].toString().c_str()); pagesToBuild[pageIndex].toString().c_str());
ASSERT(pagesToBuild[pageIndex].count != 0); ASSERT(pagesToBuild[pageIndex].count != 0);
// Use the next entry as the upper bound, or upperBound if there are no more entries beyond this page
int endIndex = pagesToBuild[pageIndex].endIndex();
bool lastPage = endIndex == entries.size();
pageUpperBound = lastPage ? upperBound->withoutValue() : entries[endIndex].withoutValue();
if (!lastPage) {
PageToBuild& p = pagesToBuild[pageIndex];
PageToBuild& nextPage = pagesToBuild[pageIndex + 1];
if (height == 1) {
// If this is a leaf page, and not the last one to be written, shorten the upper boundary)
int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[endIndex - 1], prefixLen);
pageUpperBound.truncate(commonPrefix + 1);
}
if (useEncryptionDomain) {
ASSERT(p.domainId.present());
ASSERT(nextPage.domainId.present());
if (p.domainId.get() != nextPage.domainId.get() &&
nextPage.domainId.get() != self->m_keyProvider->getDefaultEncryptionDomainId()) {
pageUpperBound =
RedwoodRecordRef(entries[nextPage.startIndex].key.substr(0, nextPage.domainPrefixLength));
}
}
}
// For internal pages, skip first entry if child link is null. Such links only exist // For internal pages, skip first entry if child link is null. Such links only exist
// to maintain a borrow-able prefix for the previous subtree after a subtree deletion. // to maintain a borrow-able prefix for the previous subtree after a subtree deletion.
// If the null link falls on a new page post-split, then the pageLowerBound of the page // If the null link falls on a new page post-split, then the pageLowerBound of the page
@ -5811,38 +6060,34 @@ private:
--p.count; --p.count;
debug_printf("Skipping first null record, new count=%d\n", p.count); debug_printf("Skipping first null record, new count=%d\n", p.count);
// If the page is now empty then it must be the last page in pagesToBuild, otherwise there would // In case encryption or encryption domain is not enabled, if the page is now empty then it must be the
// be more than 1 item since internal pages need to have multiple children. While there is no page // last page in pagesToBuild, otherwise there would be more than 1 item since internal pages need to
// to be built here, a record must be added to the output set because the upper boundary of the last // have multiple children. In case encryption and encryption domain is enabled, however, because of the
// page split by encryption domain, it may not be the last page.
//
// Either way, a record must be added to the output set because the upper boundary of the last
// page built does not match the upper boundary of the original page that this call to writePages() is // page built does not match the upper boundary of the original page that this call to writePages() is
// replacing. Put another way, the upper boundary of the rightmost page of the page set that was just // replacing. Put another way, the upper boundary of the rightmost page of the page set that was just
// built does not match the upper boundary of the original page that the page set is replacing, so // built does not match the upper boundary of the original page that the page set is replacing, so
// adding the extra null link fixes this. // adding the extra null link fixes this.
if (p.count == 0) { if (p.count == 0) {
ASSERT(pageIndex == pagesToBuild.size() - 1); ASSERT(useEncryptionDomain || lastPage);
records.push_back_deep(records.arena(), pageUpperBound); records.push_back_deep(records.arena(), pageLowerBound);
break; pageLowerBound = pageUpperBound;
continue;
} }
} }
// Use the next entry as the upper bound, or upperBound if there are no more entries beyond this page
int endIndex = pagesToBuild[pageIndex].endIndex();
bool lastPage = endIndex == entries.size();
pageUpperBound = lastPage ? upperBound->withoutValue() : entries[endIndex].withoutValue();
// If this is a leaf page, and not the last one to be written, shorten the upper boundary
if (!lastPage && height == 1) {
int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[endIndex - 1], prefixLen);
pageUpperBound.truncate(commonPrefix + 1);
}
// Create and init page here otherwise many variables must become state vars // Create and init page here otherwise many variables must become state vars
state Reference<ArenaPage> page = self->m_pager->newPageBuffer(pagesToBuild[pageIndex].blockCount); state Reference<ArenaPage> page = self->m_pager->newPageBuffer(pagesToBuild[pageIndex].blockCount);
page->init(self->m_encodingType, page->init(self->m_encodingType,
(pagesToBuild[pageIndex].blockCount == 1) ? PageType::BTreeNode : PageType::BTreeSuperNode, (pagesToBuild[pageIndex].blockCount == 1) ? PageType::BTreeNode : PageType::BTreeSuperNode,
height); height);
if (page->isEncrypted()) { if (page->isEncrypted()) {
ArenaPage::EncryptionKey k = wait(self->m_keyProvider->getLatestDefaultEncryptionKey()); ArenaPage::EncryptionKey k =
wait(useEncryptionDomain
? self->m_keyProvider->getLatestEncryptionKey(pagesToBuild[pageIndex].domainId.get())
: self->m_keyProvider->getLatestDefaultEncryptionKey());
page->encryptionKey = k; page->encryptionKey = k;
} }
@ -5935,7 +6180,8 @@ private:
} }
if (self->m_pBoundaryVerifier != nullptr) { if (self->m_pBoundaryVerifier != nullptr) {
self->m_pBoundaryVerifier->update(childPageID, v, pageLowerBound.key, pageUpperBound.key); ASSERT(self->m_pBoundaryVerifier->update(
childPageID, v, pageLowerBound.key, pageUpperBound.key, height, pagesToBuild[pageIndex].domainId));
} }
if (++sinceYield > 100) { if (++sinceYield > 100) {
@ -5984,9 +6230,26 @@ private:
// commit record, build a new root page and update records to be a link to that new page. // commit record, build a new root page and update records to be a link to that new page.
// Root pointer size is limited because the pager commit header is limited to smallestPhysicalBlock in // Root pointer size is limited because the pager commit header is limited to smallestPhysicalBlock in
// size. // size.
//
// There's another case. When encryption domain is enabled, we want to make sure the root node is encrypted
// using the default encryption domain. An indication that's not true is when the first record is not using
// dbBegin as key.
while (records.size() > 1 || while (records.size() > 1 ||
records.front().getChildPage().size() > (BUGGIFY ? 1 : BTreeCommitHeader::maxRootPointerSize)) { records.front().getChildPage().size() > (BUGGIFY ? 1 : BTreeCommitHeader::maxRootPointerSize) ||
records[0].key != dbBegin.key) {
CODE_PROBE(records.size() == 1, "Writing a new root because the current root pointer would be too large"); CODE_PROBE(records.size() == 1, "Writing a new root because the current root pointer would be too large");
if (records[0].key != dbBegin.key) {
ASSERT(self->m_keyProvider.isValid() && self->m_keyProvider->enableEncryption() &&
self->m_keyProvider->enableEncryptionDomain());
int64_t domainId;
size_t prefixLength;
std::tie(domainId, prefixLength) = self->m_keyProvider->getEncryptionDomain(records[0].key);
ASSERT(domainId != self->m_keyProvider->getDefaultEncryptionDomainId());
ASSERT(records[0].key.size() == prefixLength);
CODE_PROBE(true,
"Writing a new root because the current root is encrypted with non-default encryption "
"domain cipher key");
}
self->m_header.height = ++height; self->m_header.height = ++height;
ASSERT(height < std::numeric_limits<int8_t>::max()); ASSERT(height < std::numeric_limits<int8_t>::max());
Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait( Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait(
@ -6165,7 +6428,7 @@ private:
self->m_pager->updatePage(PagerEventReasons::Commit, height, newID, page); self->m_pager->updatePage(PagerEventReasons::Commit, height, newID, page);
if (self->m_pBoundaryVerifier != nullptr) { if (self->m_pBoundaryVerifier != nullptr) {
self->m_pBoundaryVerifier->update(writeVersion, oldID.front(), newID.front()); self->m_pBoundaryVerifier->updatePageId(writeVersion, oldID.front(), newID.front());
} }
self->freeBTreePage(height, oldID, writeVersion); self->freeBTreePage(height, oldID, writeVersion);
@ -6327,8 +6590,14 @@ private:
struct InternalPageModifier { struct InternalPageModifier {
InternalPageModifier() {} InternalPageModifier() {}
InternalPageModifier(Reference<const ArenaPage> p, bool alreadyCloned, bool updating, ParentInfo* parentInfo) InternalPageModifier(Reference<const ArenaPage> p,
: updating(updating), page(p), clonedPage(alreadyCloned), changesMade(false), parentInfo(parentInfo) {} bool alreadyCloned,
bool updating,
ParentInfo* parentInfo,
Reference<IPageEncryptionKeyProvider> keyProvider,
Optional<int64_t> pageDomainId)
: updating(updating), page(p), clonedPage(alreadyCloned), changesMade(false), parentInfo(parentInfo),
keyProvider(keyProvider), pageDomainId(pageDomainId) {}
// Whether updating the existing page is allowed // Whether updating the existing page is allowed
bool updating; bool updating;
@ -6343,6 +6612,9 @@ private:
bool changesMade; bool changesMade;
ParentInfo* parentInfo; ParentInfo* parentInfo;
Reference<IPageEncryptionKeyProvider> keyProvider;
Optional<int64_t> pageDomainId;
BTreePage* btPage() const { return (BTreePage*)page->mutateData(); } BTreePage* btPage() const { return (BTreePage*)page->mutateData(); }
bool empty() const { bool empty() const {
@ -6365,6 +6637,7 @@ private:
void insert(BTreePage::BinaryTree::Cursor end, const VectorRef<RedwoodRecordRef>& recs) { void insert(BTreePage::BinaryTree::Cursor end, const VectorRef<RedwoodRecordRef>& recs) {
int i = 0; int i = 0;
if (updating) { if (updating) {
cloneForUpdate();
// Update must be done in the new tree, not the original tree where the end cursor will be from // Update must be done in the new tree, not the original tree where the end cursor will be from
end.switchTree(btPage()->tree()); end.switchTree(btPage()->tree());
@ -6373,7 +6646,18 @@ private:
const RedwoodRecordRef& rec = recs[i]; const RedwoodRecordRef& rec = recs[i];
debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str()); debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str());
if (!end.insert(rec)) { // Fail if the inserted record does not belong to the same encryption domain as the existing page
// data.
bool canInsert = true;
if (page->isEncrypted() && keyProvider->enableEncryptionDomain()) {
ASSERT(keyProvider && pageDomainId.present());
canInsert = keyProvider->keyFitsInDomain(pageDomainId.get(), rec.key, true);
}
if (canInsert) {
canInsert = end.insert(rec);
}
if (!canInsert) {
debug_printf("internal page: failed to insert %s, switching to rebuild\n", debug_printf("internal page: failed to insert %s, switching to rebuild\n",
rec.toString(false).c_str()); rec.toString(false).c_str());
@ -6432,14 +6716,11 @@ private:
// If the children changed, replace [cBegin, cEnd) with newLinks // If the children changed, replace [cBegin, cEnd) with newLinks
if (u.childrenChanged) { if (u.childrenChanged) {
cloneForUpdate();
if (updating) { if (updating) {
auto c = u.cBegin; auto c = u.cBegin;
// must point c to the tree to erase from
if (c != u.cEnd) { c.switchTree(btPage()->tree());
cloneForUpdate();
// must point c to the tree to erase from
c.switchTree(btPage()->tree());
}
while (c != u.cEnd) { while (c != u.cEnd) {
debug_printf("applyUpdate (updating) erasing: %s\n", c.get().toString(false).c_str()); debug_printf("applyUpdate (updating) erasing: %s\n", c.get().toString(false).c_str());
@ -6489,6 +6770,55 @@ private:
} }
}; };
// Builds a new subtree from the mutations in [mBegin, mEnd) and reports the outcome through `update`.
//
// Mutations whose key sorts before update->subtreeLowerBound.key are skipped. Every remaining mutation
// that has its boundary value set becomes one record. If no record is produced, update->cleared() is
// called. Otherwise writePages() is invoked once per level for levels 1 .. height-1, each call consuming
// the records returned by the previous level, and the records returned by the last call are handed to
// update->rebuilt().
//
// self      - the tree the pages are written to
// version   - version passed to writePages() for every level
// parentID  - page id passed to writePages() as the parent for every level (see comment in the loop)
// height    - height of the page that will link to the new subtree; must be > 1
// mBegin    - first mutation to consider
// mEnd      - end of the mutation range (exclusive)
// update    - slice update supplying the subtree bounds and receiving the result
ACTOR static Future<Void> buildNewSubtree(VersionedBTree* self,
                                          Version version,
                                          LogicalPageID parentID,
                                          unsigned int height,
                                          MutationBuffer::const_iterator mBegin,
                                          MutationBuffer::const_iterator mEnd,
                                          InternalPageSliceUpdate* update) {
	ASSERT(height > 1);
	debug_printf(
	    "buildNewSubtree start version %" PRId64 ", height %u, %s\n", version, height, update->toString().c_str());

	state Standalone<VectorRef<RedwoodRecordRef>> records;

	// Skip mutations that sort before the lower bound of the subtree being built.
	while (mBegin != mEnd && mBegin.key() < update->subtreeLowerBound.key) {
		++mBegin;
	}

	// Collect one record for every remaining mutation that has a boundary value set.
	while (mBegin != mEnd) {
		if (mBegin.mutation().boundarySet()) {
			RedwoodRecordRef rec(mBegin.key(), mBegin.mutation().boundaryValue.get());
			records.push_back_deep(records.arena(), rec);
			if (REDWOOD_DEBUG) {
				debug_printf(" Added %s\n", rec.toString().c_str());
			}
		}
		++mBegin;
	}

	if (records.empty()) {
		// Nothing to store in this key range.
		update->cleared();
	} else {
		state unsigned int h = 1;
		while (h < height) {
			debug_printf("buildNewSubtree at level %u\n", h);
			// Only the parentID at the root is known as we are building the subtree bottom-up.
			// We use the parentID for all levels, since the parentID is currently used for
			// debug use only.
			Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait(writePages(self,
			                                                                     &update->subtreeLowerBound,
			                                                                     &update->subtreeUpperBound,
			                                                                     records,
			                                                                     h,
			                                                                     version,
			                                                                     BTreeNodeLinkRef(),
			                                                                     parentID));
			// The records returned for this level are the input for the next level up.
			records = newRecords;
			h++;
		}
		update->rebuilt(records);
	}
	return Void();
}
ACTOR static Future<Void> commitSubtree( ACTOR static Future<Void> commitSubtree(
VersionedBTree* self, VersionedBTree* self,
CommitBatch* batch, CommitBatch* batch,
@ -6545,6 +6875,12 @@ private:
// TryToUpdate indicates insert and erase operations should be tried on the existing page first // TryToUpdate indicates insert and erase operations should be tried on the existing page first
state bool tryToUpdate = btPage->tree()->numItems > 0 && update->boundariesNormal(); state bool tryToUpdate = btPage->tree()->numItems > 0 && update->boundariesNormal();
state bool useEncryptionDomain = page->isEncrypted() && self->m_keyProvider->enableEncryptionDomain();
state Optional<int64_t> pageDomainId;
if (useEncryptionDomain) {
pageDomainId = page->getEncryptionDomainId();
}
debug_printf("%s tryToUpdate=%d\n", context.c_str(), tryToUpdate); debug_printf("%s tryToUpdate=%d\n", context.c_str(), tryToUpdate);
debug_print(addPrefix(context, debug_print(addPrefix(context,
btPage->toString("commitSubtreeStart", btPage->toString("commitSubtreeStart",
@ -6562,7 +6898,9 @@ private:
ASSERT(self->m_pBoundaryVerifier->verify(rootID.front(), ASSERT(self->m_pBoundaryVerifier->verify(rootID.front(),
batch->snapshot->getVersion(), batch->snapshot->getVersion(),
update->cBegin.get().key, update->cBegin.get().key,
update->cBegin.next().getOrUpperBound().key)); update->cBegin.next().getOrUpperBound().key,
pageDomainId,
cursor));
} }
} }
@ -6661,8 +6999,16 @@ private:
// If updating, first try to add the record to the page // If updating, first try to add the record to the page
if (updatingDeltaTree) { if (updatingDeltaTree) {
copyForUpdate(); bool canInsert = true;
if (cursor.insert(rec, update->skipLen, maxHeightAllowed)) { if (useEncryptionDomain) {
ASSERT(pageDomainId.present());
canInsert = self->m_keyProvider->keyFitsInDomain(pageDomainId.get(), rec.key, false);
}
if (canInsert) {
copyForUpdate();
canInsert = cursor.insert(rec, update->skipLen, maxHeightAllowed);
}
if (canInsert) {
btPage->kvBytes += rec.kvBytes(); btPage->kvBytes += rec.kvBytes();
debug_printf("%s Inserted %s [mutation, boundary start]\n", debug_printf("%s Inserted %s [mutation, boundary start]\n",
context.c_str(), context.c_str(),
@ -6881,6 +7227,25 @@ private:
bool first = true; bool first = true;
if (useEncryptionDomain && cursor.valid() && update->subtreeLowerBound.key < cursor.get().key) {
mEnd = batch->mutations->lower_bound(cursor.get().key);
first = false;
if (mBegin != mEnd) {
slices.emplace_back(new InternalPageSliceUpdate());
InternalPageSliceUpdate& u = *slices.back();
u.cBegin = cursor;
u.cEnd = cursor;
u.subtreeLowerBound = update->subtreeLowerBound;
u.decodeLowerBound = u.subtreeLowerBound;
u.subtreeUpperBound = cursor.get();
u.decodeUpperBound = u.subtreeUpperBound;
u.expectedUpperBound = u.subtreeUpperBound;
u.skipLen = 0;
recursions.push_back(
self->buildNewSubtree(self, batch->writeVersion, parentID, height, mBegin, mEnd, &u));
}
}
while (cursor.valid()) { while (cursor.valid()) {
slices.emplace_back(new InternalPageSliceUpdate()); slices.emplace_back(new InternalPageSliceUpdate());
InternalPageSliceUpdate& u = *slices.back(); InternalPageSliceUpdate& u = *slices.back();
@ -7063,7 +7428,8 @@ private:
// which to build new page(s) if modification is not possible or not allowed. // which to build new page(s) if modification is not possible or not allowed.
// If pageCopy is already set it was initialized to page above so the modifier doesn't need // If pageCopy is already set it was initialized to page above so the modifier doesn't need
// to copy it // to copy it
state InternalPageModifier modifier(page, pageCopy.isValid(), tryToUpdate, parentInfo); state InternalPageModifier modifier(
page, pageCopy.isValid(), tryToUpdate, parentInfo, self->m_keyProvider, pageDomainId);
// Apply the possible changes for each subtree range recursed to, except the last one. // Apply the possible changes for each subtree range recursed to, except the last one.
// For each range, the expected next record, if any, is checked against the first boundary // For each range, the expected next record, if any, is checked against the first boundary
@ -7082,8 +7448,11 @@ private:
modifier.changesMade); modifier.changesMade);
debug_print(addPrefix(context, update->toString())); debug_print(addPrefix(context, update->toString()));
// TODO(yiwu): check whether we can pass decodeUpperBound as nextBoundary when the last slice
// has childrenChanged=true.
modifier.applyUpdate(*slices.back(), modifier.applyUpdate(*slices.back(),
modifier.changesMade ? &update->subtreeUpperBound : &update->decodeUpperBound); modifier.changesMade || slices.back()->childrenChanged ? &update->subtreeUpperBound
: &update->decodeUpperBound);
state bool detachChildren = (parentInfo->count > 2); state bool detachChildren = (parentInfo->count > 2);
state bool forceUpdate = false; state bool forceUpdate = false;
@ -7146,7 +7515,8 @@ private:
if (newID != invalidPhysicalPageID) { if (newID != invalidPhysicalPageID) {
debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID); debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
if (self->m_pBoundaryVerifier != nullptr) { if (self->m_pBoundaryVerifier != nullptr) {
self->m_pBoundaryVerifier->update(batch->writeVersion, p, newID); self->m_pBoundaryVerifier->updatePageId(
batch->writeVersion, p, newID);
} }
p = newID; p = newID;
++stats.metrics.detachChild; ++stats.metrics.detachChild;
@ -7212,7 +7582,8 @@ private:
rec.setChildPage(newPages); rec.setChildPage(newPages);
debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID); debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
if (self->m_pBoundaryVerifier != nullptr) { if (self->m_pBoundaryVerifier != nullptr) {
self->m_pBoundaryVerifier->update(batch->writeVersion, p, newID); self->m_pBoundaryVerifier->updatePageId(
batch->writeVersion, p, newID);
} }
++stats.metrics.detachChild; ++stats.metrics.detachChild;
} }
@ -7222,7 +7593,6 @@ private:
} }
parentInfo->clear(); parentInfo->clear();
} }
Standalone<VectorRef<RedwoodRecordRef>> newChildEntries = Standalone<VectorRef<RedwoodRecordRef>> newChildEntries =
wait(writePages(self, wait(writePages(self,
&update->subtreeLowerBound, &update->subtreeLowerBound,
@ -7430,17 +7800,24 @@ public:
false, false,
!options.present() || options.get().cacheResult || path.back().btPage()->height != 2), !options.present() || options.get().cacheResult || path.back().btPage()->height != 2),
[=](Reference<const ArenaPage> p) { [=](Reference<const ArenaPage> p) {
BTreePage::BinaryTree::Cursor cursor = btree->getCursor(p.getPtr(), link);
#if REDWOOD_DEBUG #if REDWOOD_DEBUG
path.push_back({ p, btree->getCursor(p.getPtr(), link), link.get().getChildPage() }); path.push_back({ p, cursor, link.get().getChildPage() });
#else #else
path.push_back({ p, btree->getCursor(p.getPtr(), link) }); path.push_back({ p, cursor });
#endif #endif
if (btree->m_pBoundaryVerifier != nullptr) { if (btree->m_pBoundaryVerifier != nullptr) {
Optional<int64_t> domainId;
if (p->isEncrypted() && btree->m_keyProvider->enableEncryptionDomain()) {
domainId = p->getEncryptionDomainId();
}
ASSERT(btree->m_pBoundaryVerifier->verify(link.get().getChildPage().front(), ASSERT(btree->m_pBoundaryVerifier->verify(link.get().getChildPage().front(),
pager->getVersion(), pager->getVersion(),
link.get().key, link.get().key,
link.next().getOrUpperBound().key)); link.next().getOrUpperBound().key,
domainId,
cursor));
} }
return Void(); return Void();
}); });
@ -7723,8 +8100,13 @@ public:
// TODO(yiwu): When the cluster encryption config is available later, fail if the cluster is configured to // TODO(yiwu): When the cluster encryption config is available later, fail if the cluster is configured to
// enable encryption, but the Redwood instance is unencrypted. // enable encryption, but the Redwood instance is unencrypted.
if (encryptionKeyProvider && encryptionKeyProvider->enableEncryption()) { if (encryptionKeyProvider && encryptionKeyProvider->enableEncryption()) {
ASSERT(encryptionKeyProvider->expectedEncodingType() == EncodingType::AESEncryptionV1);
encodingType = EncodingType::AESEncryptionV1; encodingType = EncodingType::AESEncryptionV1;
m_keyProvider = encryptionKeyProvider; m_keyProvider = encryptionKeyProvider;
} else if (g_network->isSimulated() && logID.hash() % 2 == 0) {
// Simulation only. Deterministically enable encryption based on uid
encodingType = EncodingType::XOREncryption_TestOnly;
m_keyProvider = makeReference<XOREncryptionKeyProvider_TestOnly>(filename);
} }
IPager2* pager = new DWALPager(pageSize, IPager2* pager = new DWALPager(pageSize,
@ -9752,6 +10134,11 @@ TEST_CASE("Lredwood/correctness/btree") {
state bool serialTest = params.getInt("serialTest").orDefault(deterministicRandom()->random01() < 0.25); state bool serialTest = params.getInt("serialTest").orDefault(deterministicRandom()->random01() < 0.25);
state bool shortTest = params.getInt("shortTest").orDefault(deterministicRandom()->random01() < 0.25); state bool shortTest = params.getInt("shortTest").orDefault(deterministicRandom()->random01() < 0.25);
state int encoding =
params.getInt("encodingType").orDefault(deterministicRandom()->randomInt(0, EncodingType::MAX_ENCODING_TYPE));
state unsigned int encryptionDomainMode =
params.getInt("domainMode")
.orDefault(deterministicRandom()->randomInt(0, RandomEncryptionKeyProvider::EncryptionDomainMode::MAX));
state int pageSize = state int pageSize =
shortTest ? 250 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(250, 400)); shortTest ? 250 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(250, 400));
state int extentSize = state int extentSize =
@ -9802,24 +10189,25 @@ TEST_CASE("Lredwood/correctness/btree") {
// Max number of records in the BTree or the versioned written map to visit // Max number of records in the BTree or the versioned written map to visit
state int64_t maxRecordsRead = params.getInt("maxRecordsRead").orDefault(300e6); state int64_t maxRecordsRead = params.getInt("maxRecordsRead").orDefault(300e6);
state EncodingType encodingType = state EncodingType encodingType = static_cast<EncodingType>(encoding);
static_cast<EncodingType>(deterministicRandom()->randomInt(0, EncodingType::MAX_ENCODING_TYPE));
state Reference<IPageEncryptionKeyProvider> keyProvider; state Reference<IPageEncryptionKeyProvider> keyProvider;
if (encodingType == EncodingType::AESEncryptionV1) { if (encodingType == EncodingType::AESEncryptionV1) {
keyProvider = makeReference<RandomEncryptionKeyProvider>(); keyProvider = makeReference<RandomEncryptionKeyProvider>(
RandomEncryptionKeyProvider::EncryptionDomainMode(encryptionDomainMode));
} else if (encodingType == EncodingType::XOREncryption_TestOnly) { } else if (encodingType == EncodingType::XOREncryption_TestOnly) {
keyProvider = makeReference<XOREncryptionKeyProvider_TestOnly>(file); keyProvider = makeReference<XOREncryptionKeyProvider_TestOnly>(file);
} }
printf("\n"); printf("\n");
printf("file: %s\n", file.c_str()); printf("file: %s\n", file.c_str());
printf("encodingType: %d\n", encodingType);
printf("maxPageOps: %" PRId64 "\n", maxPageOps); printf("maxPageOps: %" PRId64 "\n", maxPageOps);
printf("maxVerificationMapEntries: %d\n", maxVerificationMapEntries); printf("maxVerificationMapEntries: %d\n", maxVerificationMapEntries);
printf("maxRecordsRead: %" PRId64 "\n", maxRecordsRead); printf("maxRecordsRead: %" PRId64 "\n", maxRecordsRead);
printf("pagerMemoryOnly: %d\n", pagerMemoryOnly); printf("pagerMemoryOnly: %d\n", pagerMemoryOnly);
printf("serialTest: %d\n", serialTest); printf("serialTest: %d\n", serialTest);
printf("shortTest: %d\n", shortTest); printf("shortTest: %d\n", shortTest);
printf("encodingType: %d\n", encodingType);
printf("domainMode: %d\n", encryptionDomainMode);
printf("pageSize: %d\n", pageSize); printf("pageSize: %d\n", pageSize);
printf("extentSize: %d\n", extentSize); printf("extentSize: %d\n", extentSize);
printf("maxKeySize: %d\n", maxKeySize); printf("maxKeySize: %d\n", maxKeySize);

View File

@ -37,6 +37,7 @@
#define XXH_INLINE_ALL #define XXH_INLINE_ALL
#include "flow/xxhash.h" #include "flow/xxhash.h"
#include <functional>
#include <tuple> #include <tuple>
#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/actorcompiler.h" // This must be the last #include.
@ -72,10 +73,12 @@ public:
virtual bool enableEncryptionDomain() const { return false; } virtual bool enableEncryptionDomain() const { return false; }
// Get an encryption key from given encoding header. // Get an encryption key from given encoding header.
virtual Future<EncryptionKey> getEncryptionKey(void* encodingHeader) { throw not_implemented(); } virtual Future<EncryptionKey> getEncryptionKey(const void* encodingHeader) { throw not_implemented(); }
// Get latest encryption key. If encryption domain is enabled, get encryption key for the default domain. // Get latest encryption key. If encryption domain is enabled, get encryption key for the default domain.
virtual Future<EncryptionKey> getLatestDefaultEncryptionKey() { throw not_implemented(); } virtual Future<EncryptionKey> getLatestDefaultEncryptionKey() {
return getLatestEncryptionKey(getDefaultEncryptionDomainId());
}
// Get latest encryption key for data in given encryption domain. // Get latest encryption key for data in given encryption domain.
virtual Future<EncryptionKey> getLatestEncryptionKey(int64_t domainId) { throw not_implemented(); } virtual Future<EncryptionKey> getLatestEncryptionKey(int64_t domainId) { throw not_implemented(); }
@ -94,10 +97,22 @@ public:
} }
// Get encryption domain of a page given encoding header. // Get encryption domain of a page given encoding header.
virtual int64_t getEncryptionDomain(void* encodingHeader) { throw not_implemented(); } virtual int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) { throw not_implemented(); }
// Setting tenant prefix to tenant name map. Used by TenantAwareEncryptionKeyProvider. // Setting tenant prefix to tenant name map. Used by TenantAwareEncryptionKeyProvider.
virtual void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) {} virtual void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) {}
// Helper methods.
// Check if a key fits in an encryption domain.
//
// A key fits when the domain reported for it by getEncryptionDomain() equals `domainId`.
// When `canUseDefaultDomain` is true, a key also fits if `domainId` is the default encryption domain
// and the key's length equals the domain prefix length reported for it.
// NOTE(review): presumably the second case covers a key that is exactly a domain prefix -- confirm
// against callers.
//
// Requires enableEncryptionDomain() to be true.
bool keyFitsInDomain(int64_t domainId, const KeyRef& key, bool canUseDefaultDomain) {
	ASSERT(enableEncryptionDomain());

	int64_t domainOfKey;
	size_t domainPrefixLen;
	std::tie(domainOfKey, domainPrefixLen) = getEncryptionDomain(key);

	// Exact domain match.
	if (domainOfKey == domainId) {
		return true;
	}
	// Without the default-domain allowance only an exact match is accepted.
	if (!canUseDefaultDomain) {
		return false;
	}
	if (domainId != getDefaultEncryptionDomainId()) {
		return false;
	}
	return key.size() == domainPrefixLen;
}
}; };
// The null key provider is useful to simplify page decoding. // The null key provider is useful to simplify page decoding.
@ -133,39 +148,20 @@ public:
bool enableEncryption() const override { return true; } bool enableEncryption() const override { return true; }
bool enableEncryptionDomain() const override { return true; } Future<EncryptionKey> getEncryptionKey(const void* encodingHeader) override {
Future<EncryptionKey> getEncryptionKey(void* encodingHeader) override { const EncodingHeader* h = reinterpret_cast<const EncodingHeader*>(encodingHeader);
EncodingHeader* h = reinterpret_cast<EncodingHeader*>(encodingHeader);
EncryptionKey s; EncryptionKey s;
s.xorKey = h->xorKey; s.xorKey = h->xorKey;
return s; return s;
} }
Future<EncryptionKey> getLatestDefaultEncryptionKey() override { return getLatestEncryptionKey(0); } Future<EncryptionKey> getLatestDefaultEncryptionKey() override {
Future<EncryptionKey> getLatestEncryptionKey(int64_t domainId) override {
EncryptionKey s; EncryptionKey s;
s.xorKey = ~(uint8_t)domainId ^ xorWith; s.xorKey = xorWith;
return s; return s;
} }
int64_t getDefaultEncryptionDomainId() const override { return 0; }
std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key,
Optional<int64_t> /*possibleDomainId*/) override {
if (key.size() > 0) {
return { *key.begin(), 1 };
}
return { 0, 0 };
}
int64_t getEncryptionDomain(void* encodingHeader) override {
uint8_t xorKey = reinterpret_cast<EncodingHeader*>(encodingHeader)->xorKey;
return (int64_t)(~xorKey ^ xorWith);
}
uint8_t xorWith; uint8_t xorWith;
}; };
@ -173,11 +169,19 @@ public:
// Use for testing. // Use for testing.
class RandomEncryptionKeyProvider : public IPageEncryptionKeyProvider { class RandomEncryptionKeyProvider : public IPageEncryptionKeyProvider {
public: public:
RandomEncryptionKeyProvider() { enum EncryptionDomainMode : unsigned int {
DISABLED = 0, // disable encryption domain
RANDOM, // for each key prefix, deterministic randomly decide if there's an encryption domain for it.
ALL, // all key prefixes has an encryption domain assigned to it.
MAX,
};
explicit RandomEncryptionKeyProvider(EncryptionDomainMode mode) : mode(mode) {
ASSERT(mode < EncryptionDomainMode::MAX);
for (unsigned i = 0; i < NUM_CIPHER; i++) { for (unsigned i = 0; i < NUM_CIPHER; i++) {
BlobCipherDetails cipherDetails; BlobCipherDetails cipherDetails;
cipherDetails.encryptDomainId = i; cipherDetails.encryptDomainId = 0;
cipherDetails.baseCipherId = deterministicRandom()->randomUInt64(); cipherDetails.baseCipherId = i;
cipherDetails.salt = deterministicRandom()->randomUInt64(); cipherDetails.salt = deterministicRandom()->randomUInt64();
cipherKeys[i] = generateCipherKey(cipherDetails); cipherKeys[i] = generateCipherKey(cipherDetails);
} }
@ -188,22 +192,47 @@ public:
bool enableEncryption() const override { return true; } bool enableEncryption() const override { return true; }
Future<EncryptionKey> getEncryptionKey(void* encodingHeader) override { bool enableEncryptionDomain() const override { return mode > 1; }
Future<EncryptionKey> getEncryptionKey(const void* encodingHeader) override {
using Header = ArenaPage::AESEncryptionV1Encoder::Header; using Header = ArenaPage::AESEncryptionV1Encoder::Header;
Header* h = reinterpret_cast<Header*>(encodingHeader); const Header* h = reinterpret_cast<const Header*>(encodingHeader);
EncryptionKey s; EncryptionKey s;
s.aesKey.cipherTextKey = cipherKeys[h->cipherTextDetails.encryptDomainId]; s.aesKey.cipherTextKey = getCipherKey(h->cipherTextDetails.encryptDomainId, h->cipherTextDetails.baseCipherId);
s.aesKey.cipherHeaderKey = cipherKeys[h->cipherHeaderDetails.encryptDomainId]; s.aesKey.cipherHeaderKey =
getCipherKey(h->cipherHeaderDetails.encryptDomainId, h->cipherHeaderDetails.baseCipherId);
return s; return s;
} }
Future<EncryptionKey> getLatestDefaultEncryptionKey() override { Future<EncryptionKey> getLatestEncryptionKey(int64_t domainId) override {
domainId = checkDomainId(domainId);
EncryptionKey s; EncryptionKey s;
s.aesKey.cipherTextKey = cipherKeys[deterministicRandom()->randomInt(0, NUM_CIPHER)]; s.aesKey.cipherTextKey = getCipherKey(domainId, deterministicRandom()->randomInt(0, NUM_CIPHER));
s.aesKey.cipherHeaderKey = cipherKeys[deterministicRandom()->randomInt(0, NUM_CIPHER)]; s.aesKey.cipherHeaderKey =
getCipherKey(ENCRYPT_HEADER_DOMAIN_ID, deterministicRandom()->randomInt(0, NUM_CIPHER));
return s; return s;
} }
// The default encryption domain id of this provider is FDB_DEFAULT_ENCRYPT_DOMAIN_ID.
int64_t getDefaultEncryptionDomainId() const override { return FDB_DEFAULT_ENCRYPT_DOMAIN_ID; }
// Maps a key to { encryption domain id, domain prefix length }.
//
// Keys shorter than PREFIX_LENGTH map to the default domain. For longer keys the first 4 bytes are
// read as a 32-bit int and passed through checkDomainId(), which may itself return the default domain.
// The reported prefix length is 0 for the default domain and PREFIX_LENGTH otherwise.
// The second parameter is unused.
std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key, Optional<int64_t>) override {
	const int64_t defaultDomainId = getDefaultEncryptionDomainId();
	int64_t domainId = defaultDomainId;
	if (key.size() >= PREFIX_LENGTH) {
		// Use first 4 bytes as a 32-bit int for the domain id.
		// NOTE(review): this reads an int32_t straight out of the key bytes; confirm that unaligned
		// access and the platform's byte order are acceptable for this test-only provider.
		const int32_t rawPrefix = *reinterpret_cast<const int32_t*>(key.begin());
		domainId = checkDomainId(static_cast<int64_t>(rawPrefix));
	}
	const size_t prefixLength = (domainId == defaultDomainId) ? 0 : PREFIX_LENGTH;
	return { domainId, prefixLength };
}
int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) override {
ASSERT(encodingHeader != nullptr);
using Header = ArenaPage::AESEncryptionV1Encoder::Header;
const Header* h = reinterpret_cast<const Header*>(encodingHeader);
return h->cipherTextDetails.encryptDomainId;
}
private: private:
Reference<BlobCipherKey> generateCipherKey(const BlobCipherDetails& cipherDetails) { Reference<BlobCipherKey> generateCipherKey(const BlobCipherDetails& cipherDetails) {
static unsigned char SHA_KEY[] = "3ab9570b44b8315fdb261da6b1b6c13b"; static unsigned char SHA_KEY[] = "3ab9570b44b8315fdb261da6b1b6c13b";
@ -226,7 +255,28 @@ private:
std::numeric_limits<int64_t>::max() /* expireAt */); std::numeric_limits<int64_t>::max() /* expireAt */);
} }
// Maps a candidate domain id to the domain id this provider actually uses for it.
//   mode == DISABLED : always the default domain.
//   mode == RANDOM   : the default domain when std::hash of the id is even, otherwise the id itself.
//   any other mode   : the id itself.
int64_t checkDomainId(int64_t domainId) {
	if (mode == DISABLED) {
		return getDefaultEncryptionDomainId();
	}
	if (mode == RANDOM) {
		const bool mapsToDefault = (std::hash<int64_t>{}(domainId) % 2 == 0);
		if (mapsToDefault) {
			return getDefaultEncryptionDomainId();
		}
	}
	return domainId;
}
// Returns a new cipher key for `domainId` built from the pre-generated key material stored at
// cipherKeys[cipherId]: the raw base cipher and salt are reused, only the domain id is replaced.
// `cipherId` must be a valid index into cipherKeys, i.e. in [0, NUM_CIPHER).
Reference<BlobCipherKey> getCipherKey(EncryptCipherDomainId domainId, EncryptCipherBaseKeyId cipherId) {
	// cipherId can come from a page's encoding header; refuse to index outside the key table.
	ASSERT(static_cast<uint64_t>(cipherId) < static_cast<uint64_t>(NUM_CIPHER));
	const Reference<BlobCipherKey>& baseKey = cipherKeys[cipherId];
	// Create a new cipher key by replacing the domain id.
	return makeReference<BlobCipherKey>(domainId,
	                                    cipherId,
	                                    baseKey->rawBaseCipher(),
	                                    AES_256_KEY_LENGTH,
	                                    baseKey->getSalt(),
	                                    std::numeric_limits<int64_t>::max() /* refreshAt */,
	                                    std::numeric_limits<int64_t>::max() /* expireAt */);
}
static constexpr int NUM_CIPHER = 1000; static constexpr int NUM_CIPHER = 1000;
static constexpr size_t PREFIX_LENGTH = 4;
EncryptionDomainMode mode;
Reference<BlobCipherKey> cipherKeys[NUM_CIPHER]; Reference<BlobCipherKey> cipherKeys[NUM_CIPHER];
}; };
@ -248,8 +298,9 @@ public:
bool enableEncryptionDomain() const override { return true; } bool enableEncryptionDomain() const override { return true; }
ACTOR static Future<EncryptionKey> getEncryptionKey(TenantAwareEncryptionKeyProvider* self, void* encodingHeader) { ACTOR static Future<EncryptionKey> getEncryptionKey(TenantAwareEncryptionKeyProvider* self,
BlobCipherEncryptHeader* header = reinterpret_cast<EncodingHeader*>(encodingHeader); const void* encodingHeader) {
const BlobCipherEncryptHeader* header = reinterpret_cast<const EncodingHeader*>(encodingHeader);
TextAndHeaderCipherKeys cipherKeys = TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(self->db, *header, BlobCipherMetrics::KV_REDWOOD)); wait(getEncryptCipherKeys(self->db, *header, BlobCipherMetrics::KV_REDWOOD));
EncryptionKey encryptionKey; EncryptionKey encryptionKey;
@ -257,7 +308,7 @@ public:
return encryptionKey; return encryptionKey;
} }
Future<EncryptionKey> getEncryptionKey(void* encodingHeader) override { Future<EncryptionKey> getEncryptionKey(const void* encodingHeader) override {
return getEncryptionKey(this, encodingHeader); return getEncryptionKey(this, encodingHeader);
} }
@ -292,7 +343,7 @@ public:
return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 }; return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 };
} }
StringRef prefix = key.substr(0, TENANT_PREFIX_SIZE); StringRef prefix = key.substr(0, TENANT_PREFIX_SIZE);
int64_t tenantId = TenantMapEntry::prefixToId(prefix); int64_t tenantId = TenantMapEntry::prefixToId(prefix, EnforceValidTenantId::False);
// Tenant id must be non-negative. // Tenant id must be non-negative.
if (tenantId < 0) { if (tenantId < 0) {
return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 }; return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 };
@ -314,8 +365,9 @@ public:
return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 }; return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 };
} }
int64_t getEncryptionDomain(void* encodingHeader) override { int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) override {
BlobCipherEncryptHeader* header = reinterpret_cast<EncodingHeader*>(encodingHeader); ASSERT(encodingHeader != nullptr);
const BlobCipherEncryptHeader* header = reinterpret_cast<const EncodingHeader*>(encodingHeader);
return header->cipherTextDetails.encryptDomainId; return header->cipherTextDetails.encryptDomainId;
} }

View File

@ -498,7 +498,7 @@ public:
// Secret is set if needed // Secret is set if needed
// Post: Main and Encoding subheaders are updated // Post: Main and Encoding subheaders are updated
// Payload is possibly encrypted // Payload is possibly encrypted
void preWrite(PhysicalPageID pageID) const { void preWrite(PhysicalPageID pageID) {
// Explicitly check payload definedness to make the source of valgrind errors more clear. // Explicitly check payload definedness to make the source of valgrind errors more clear.
// Without this check, calculating a checksum on a payload with undefined bytes does not // Without this check, calculating a checksum on a payload with undefined bytes does not
// cause a valgrind error but the resulting checksum is undefined which causes errors later. // cause a valgrind error but the resulting checksum is undefined which causes errors later.
@ -519,6 +519,7 @@ public:
} else { } else {
throw page_header_version_not_supported(); throw page_header_version_not_supported();
} }
encodingHeaderAvailable = true;
} }
// Must be called after reading from disk to verify all non-payload bytes // Must be called after reading from disk to verify all non-payload bytes
@ -531,6 +532,7 @@ public:
void postReadHeader(PhysicalPageID pageID, bool verify = true) { void postReadHeader(PhysicalPageID pageID, bool verify = true) {
pPayload = page->getPayload(); pPayload = page->getPayload();
payloadSize = logicalSize - (pPayload - buffer); payloadSize = logicalSize - (pPayload - buffer);
encodingHeaderAvailable = true;
if (page->headerVersion == 1) { if (page->headerVersion == 1) {
if (verify) { if (verify) {
@ -568,7 +570,18 @@ public:
// Returns true if the page's encoding type employs encryption // Returns true if the page's encoding type employs encryption
bool isEncrypted() const { return isEncodingTypeEncrypted(getEncodingType()); } bool isEncrypted() const { return isEncodingTypeEncrypted(getEncodingType()); }
void* getEncodingHeader() { return page->getEncodingHeader(); } // Return encryption domain id used. This method only use information from the encryptionKey.
// Caller should make sure encryption domain is in use.
int64_t getEncryptionDomainId() const {
// encryption domain is only supported by AESEncryptionV1.
ASSERT(getEncodingType() == EncodingType::AESEncryptionV1);
const Reference<BlobCipherKey>& cipherKey = encryptionKey.aesKey.cipherTextKey;
ASSERT(cipherKey.isValid());
return cipherKey->getDomainId();
}
// Return pointer to the page's encoding header, or nullptr while encodingHeaderAvailable is false.
const void* getEncodingHeader() const {
	if (!encodingHeaderAvailable) {
		return nullptr;
	}
	return page->getEncodingHeader();
}
private: private:
Arena arena; Arena arena;
@ -608,6 +621,9 @@ public:
// Used by encodings that do encryption // Used by encodings that do encryption
EncryptionKey encryptionKey; EncryptionKey encryptionKey;
// Whether encoding header is set
bool encodingHeaderAvailable = false;
mutable ArbitraryObject extra; mutable ArbitraryObject extra;
}; };