Split Redwood page by tenant boundary (#7979)
Redwood encrypt with page granularity. To do per-tenant encryption (i.e. each tenant are encrypted with different set of cipher keys), we need to split Redwood pages by tenant boundary. Moreover, it also needs to handle different tenant modes: * tenantMode = disabled: do not split page by tenant, all data encrypt using default encryption domain. * tenantMode = required: look at the prefix of the keys and split by tenant accordingly, and encrypt in tenant specific encryption domain accordingly. * tenantMode = optional: some key ranges may not map to a tenant. In additional to looking at the key prefix, the key provider also query the tenant prefix index. For prefixes not found in the tenant prefix index, corresponding key should be encrypted using the default encryption domain. The change also enforce data for each tenant forms a subtree, and key of the link to the subtree is exactly the tenant prefix. This PR is building on top of #8172 and use the IPageEncryptionKeyProvider interface added there. Changes: * In `writePages` and `splitPages`, query the key provider to split page accordingly. * In `commitSubtree`, when doing in-place update (to both of leaf or internal page), check if the entry being insert belong to the same encryption domain as existing data of the page. If not, fallback to full page rebuild, where `writePages` will handle the page split. * When updating the root, check if it is encrypted using non-default (i.e. tenant specific) domain. If so, add a redundant root node which will be encrypted with default encryption domain. Tested with 100K run of `Lredwood/correctness/btree` unit test, where it uses `RandomEncryptionKeyProvider`, which is updated to support and generate random encryption domain with 4 byte domain prefixes.
This commit is contained in:
parent
55b880432e
commit
5c549601d2
|
@ -18,37 +18,40 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "fmt/format.h"
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbclient/Tuple.h"
|
||||
#include "fdbrpc/ContinuousSample.h"
|
||||
#include "fdbrpc/simulator.h"
|
||||
#include "fdbserver/DeltaTree.h"
|
||||
#include "fdbserver/IKeyValueStore.h"
|
||||
#include "fdbserver/IPager.h"
|
||||
#include "fdbserver/Knobs.h"
|
||||
#include "fdbserver/WorkerInterface.actor.h"
|
||||
#include "flow/ActorCollection.h"
|
||||
#include "flow/Error.h"
|
||||
#include "flow/FastRef.h"
|
||||
#include "flow/flow.h"
|
||||
#include "flow/genericactors.actor.h"
|
||||
#include "flow/Histogram.h"
|
||||
#include "flow/IAsyncFile.h"
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/Knobs.h"
|
||||
#include "flow/ObjectSerializer.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/flow.h"
|
||||
#include "flow/Histogram.h"
|
||||
#include <limits>
|
||||
#include <random>
|
||||
#include "fdbrpc/ContinuousSample.h"
|
||||
#include "fdbrpc/simulator.h"
|
||||
#include "fdbserver/IPager.h"
|
||||
#include "fdbclient/Tuple.h"
|
||||
#include "flow/serialize.h"
|
||||
#include "flow/genericactors.actor.h"
|
||||
#include "flow/Trace.h"
|
||||
#include "flow/UnitTest.h"
|
||||
#include "flow/IAsyncFile.h"
|
||||
#include "flow/ActorCollection.h"
|
||||
#include "fmt/format.h"
|
||||
|
||||
#include <boost/intrusive/list.hpp>
|
||||
#include <cinttypes>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include "fdbclient/CommitTransaction.h"
|
||||
#include "fdbserver/IKeyValueStore.h"
|
||||
#include "fdbserver/DeltaTree.h"
|
||||
#include <string.h>
|
||||
#include <cinttypes>
|
||||
#include <boost/intrusive/list.hpp>
|
||||
|
||||
#include "flow/actorcompiler.h" // must be last include
|
||||
|
||||
#define REDWOOD_DEBUG 0
|
||||
|
@ -4768,6 +4771,8 @@ struct DecodeBoundaryVerifier {
|
|||
struct DecodeBoundaries {
|
||||
Key lower;
|
||||
Key upper;
|
||||
unsigned int height;
|
||||
Optional<int64_t> domainId;
|
||||
bool empty() const { return lower.empty() && upper.empty(); }
|
||||
};
|
||||
|
||||
|
@ -4776,6 +4781,11 @@ struct DecodeBoundaryVerifier {
|
|||
std::vector<Key> boundarySamples;
|
||||
int boundarySampleSize = 1000;
|
||||
int boundaryPopulation = 0;
|
||||
Reference<IPageEncryptionKeyProvider> keyProvider;
|
||||
|
||||
// Sample rate of pages to be scanned to verify if all entries in the page meet domain prefix requirement.
|
||||
double domainPrefixScanProbability = 0.01;
|
||||
uint64_t domainPrefixScanCount = 0;
|
||||
|
||||
static DecodeBoundaryVerifier* getVerifier(std::string name) {
|
||||
static std::map<std::string, DecodeBoundaryVerifier> verifiers;
|
||||
|
@ -4786,6 +4796,8 @@ struct DecodeBoundaryVerifier {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
void setKeyProvider(Reference<IPageEncryptionKeyProvider> kp) { keyProvider = kp; }
|
||||
|
||||
void sampleBoundary(Key b) {
|
||||
if (boundaryPopulation <= boundarySampleSize) {
|
||||
boundarySamples.push_back(b);
|
||||
|
@ -4802,21 +4814,53 @@ struct DecodeBoundaryVerifier {
|
|||
return boundarySamples[deterministicRandom()->randomInt(0, boundarySamples.size())];
|
||||
}
|
||||
|
||||
void update(BTreeNodeLinkRef id, Version v, Key lowerBound, Key upperBound) {
|
||||
bool update(BTreeNodeLinkRef id,
|
||||
Version v,
|
||||
Key lowerBound,
|
||||
Key upperBound,
|
||||
unsigned int height,
|
||||
Optional<int64_t> domainId) {
|
||||
sampleBoundary(lowerBound);
|
||||
sampleBoundary(upperBound);
|
||||
debug_printf("decodeBoundariesUpdate %s %s '%s' to '%s'\n",
|
||||
debug_printf("decodeBoundariesUpdate %s %s '%s' to '%s', %u, %s\n",
|
||||
::toString(id).c_str(),
|
||||
::toString(v).c_str(),
|
||||
lowerBound.printable().c_str(),
|
||||
upperBound.printable().c_str());
|
||||
upperBound.printable().c_str(),
|
||||
height,
|
||||
Traceable<decltype(domainId)>::toString(domainId).c_str());
|
||||
|
||||
if (domainId.present()) {
|
||||
ASSERT(keyProvider && keyProvider->enableEncryptionDomain());
|
||||
// Temporarily disabling the check, since if a tenant is removed, where the key provider
|
||||
// would not find the domain, the data for the tenant may still be in Redwood and being read.
|
||||
// TODO(yiwu): re-enable the check.
|
||||
/*
|
||||
if (domainId.get() != keyProvider->getDefaultEncryptionDomainId() &&
|
||||
!keyProvider->keyFitsInDomain(domainId.get(), lowerBound, false)) {
|
||||
fprintf(stderr,
|
||||
"Page lower bound not in domain: %s %s, domain id %s, lower bound '%s'\n",
|
||||
::toString(id).c_str(),
|
||||
::toString(v).c_str(),
|
||||
::toString(domainId).c_str(),
|
||||
lowerBound.printable().c_str());
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
auto& b = boundariesByPageID[id.front()][v];
|
||||
ASSERT(b.empty());
|
||||
b = { lowerBound, upperBound };
|
||||
b = { lowerBound, upperBound, height, domainId };
|
||||
return true;
|
||||
}
|
||||
|
||||
bool verify(LogicalPageID id, Version v, Key lowerBound, Key upperBound) {
|
||||
bool verify(LogicalPageID id,
|
||||
Version v,
|
||||
Key lowerBound,
|
||||
Key upperBound,
|
||||
Optional<int64_t> domainId,
|
||||
BTreePage::BinaryTree::Cursor& cursor) {
|
||||
auto i = boundariesByPageID.find(id);
|
||||
ASSERT(i != boundariesByPageID.end());
|
||||
ASSERT(!i->second.empty());
|
||||
|
@ -4835,10 +4879,66 @@ struct DecodeBoundaryVerifier {
|
|||
b->second.upper.printable().c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!b->second.domainId.present()) {
|
||||
ASSERT(!keyProvider || !keyProvider->enableEncryptionDomain());
|
||||
ASSERT(!domainId.present());
|
||||
} else {
|
||||
ASSERT(keyProvider->enableEncryptionDomain());
|
||||
if (b->second.domainId != domainId) {
|
||||
fprintf(stderr,
|
||||
"Page encrypted with incorrect domain: %s %s, using %s, written %s\n",
|
||||
::toString(id).c_str(),
|
||||
::toString(v).c_str(),
|
||||
::toString(domainId).c_str(),
|
||||
::toString(b->second.domainId).c_str());
|
||||
return false;
|
||||
}
|
||||
// Temporarily disabling the check, since if a tenant is removed, where the key provider
|
||||
// would not find the domain, the data for the tenant may still be in Redwood and being read.
|
||||
// TODO(yiwu): re-enable the check.
|
||||
/*
|
||||
ASSERT(domainId.present());
|
||||
auto checkKeyFitsInDomain = [&]() -> bool {
|
||||
if (!keyProvider->keyFitsInDomain(domainId.get(), cursor.get().key, b->second.height > 1)) {
|
||||
fprintf(stderr,
|
||||
"Encryption domain mismatch on %s, %s, domain: %s, key %s\n",
|
||||
::toString(id).c_str(),
|
||||
::toString(v).c_str(),
|
||||
::toString(domainId).c_str(),
|
||||
cursor.get().key.printable().c_str());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
if (domainId.get() != keyProvider->getDefaultEncryptionDomainId()) {
|
||||
cursor.moveFirst();
|
||||
if (cursor.valid() && !checkKeyFitsInDomain()) {
|
||||
return false;
|
||||
}
|
||||
cursor.moveLast();
|
||||
if (cursor.valid() && !checkKeyFitsInDomain()) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (deterministicRandom()->random01() < domainPrefixScanProbability) {
|
||||
cursor.moveFirst();
|
||||
while (cursor.valid()) {
|
||||
if (!checkKeyFitsInDomain()) {
|
||||
return false;
|
||||
}
|
||||
cursor.moveNext();
|
||||
}
|
||||
domainPrefixScanCount++;
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void update(Version v, LogicalPageID oldID, LogicalPageID newID) {
|
||||
void updatePageId(Version v, LogicalPageID oldID, LogicalPageID newID) {
|
||||
auto& old = boundariesByPageID[oldID];
|
||||
ASSERT(!old.empty());
|
||||
auto i = old.end();
|
||||
|
@ -5053,6 +5153,10 @@ public:
|
|||
}
|
||||
|
||||
m_pBoundaryVerifier = DecodeBoundaryVerifier::getVerifier(name);
|
||||
if (m_pBoundaryVerifier != nullptr) {
|
||||
m_pBoundaryVerifier->setKeyProvider(m_keyProvider);
|
||||
}
|
||||
|
||||
m_pDecodeCacheMemory = m_pager->getPageCachePenaltySource();
|
||||
m_lazyClearActor = 0;
|
||||
m_init = init_impl(this);
|
||||
|
@ -5574,16 +5678,27 @@ private:
|
|||
|
||||
// Describes a range of a vector of records that should be built into a single BTreePage
|
||||
struct PageToBuild {
|
||||
PageToBuild(int index, int blockSize, EncodingType t)
|
||||
PageToBuild(int index,
|
||||
int blockSize,
|
||||
EncodingType encodingType,
|
||||
unsigned int height,
|
||||
bool useEncryptionDomain,
|
||||
bool splitByDomain,
|
||||
IPageEncryptionKeyProvider* keyProvider)
|
||||
: startIndex(index), count(0), pageSize(blockSize),
|
||||
largeDeltaTree(pageSize > BTreePage::BinaryTree::SmallSizeLimit), blockSize(blockSize), blockCount(1),
|
||||
kvBytes(0) {
|
||||
kvBytes(0), encodingType(encodingType), height(height), useEncryptionDomain(useEncryptionDomain),
|
||||
splitByDomain(splitByDomain), keyProvider(keyProvider) {
|
||||
|
||||
// Subtrace Page header overhead, BTreePage overhead, and DeltaTree (BTreePage::BinaryTree) overhead.
|
||||
bytesLeft = ArenaPage::getUsableSize(blockSize, t) - sizeof(BTreePage) - sizeof(BTreePage::BinaryTree);
|
||||
bytesLeft =
|
||||
ArenaPage::getUsableSize(blockSize, encodingType) - sizeof(BTreePage) - sizeof(BTreePage::BinaryTree);
|
||||
}
|
||||
|
||||
PageToBuild next(EncodingType t) { return PageToBuild(endIndex(), blockSize, t); }
|
||||
PageToBuild next() {
|
||||
return PageToBuild(
|
||||
endIndex(), blockSize, encodingType, height, useEncryptionDomain, splitByDomain, keyProvider);
|
||||
}
|
||||
|
||||
int startIndex; // Index of the first record
|
||||
int count; // Number of records added to the page
|
||||
|
@ -5595,6 +5710,16 @@ private:
|
|||
int blockCount; // The number of blocks in pageSize
|
||||
int kvBytes; // The amount of user key/value bytes added to the page
|
||||
|
||||
EncodingType encodingType;
|
||||
unsigned int height;
|
||||
bool useEncryptionDomain;
|
||||
bool splitByDomain;
|
||||
IPageEncryptionKeyProvider* keyProvider;
|
||||
|
||||
Optional<int64_t> domainId = Optional<int64_t>();
|
||||
size_t domainPrefixLength = 0;
|
||||
bool canUseDefaultDomain = false;
|
||||
|
||||
// Number of bytes used by the generated/serialized BTreePage, including all headers
|
||||
int usedBytes() const { return pageSize - bytesLeft; }
|
||||
|
||||
|
@ -5614,17 +5739,18 @@ private:
|
|||
int endIndex() const { return startIndex + count; }
|
||||
|
||||
std::string toString() const {
|
||||
return format(
|
||||
"{start=%d count=%d used %d/%d bytes (%.2f%% slack) kvBytes=%d blocks=%d blockSize=%d large=%d}",
|
||||
startIndex,
|
||||
count,
|
||||
usedBytes(),
|
||||
pageSize,
|
||||
slackFraction() * 100,
|
||||
kvBytes,
|
||||
blockCount,
|
||||
blockSize,
|
||||
largeDeltaTree);
|
||||
return format("{start=%d count=%d used %d/%d bytes (%.2f%% slack) kvBytes=%d blocks=%d blockSize=%d "
|
||||
"large=%d, domain=%s}",
|
||||
startIndex,
|
||||
count,
|
||||
usedBytes(),
|
||||
pageSize,
|
||||
slackFraction() * 100,
|
||||
kvBytes,
|
||||
blockCount,
|
||||
blockSize,
|
||||
largeDeltaTree,
|
||||
::toString(domainId).c_str());
|
||||
}
|
||||
|
||||
// Move an item from a to b if a has 2 or more items and the item fits in b
|
||||
|
@ -5656,7 +5782,8 @@ private:
|
|||
// Try to add a record of the given delta size to the page.
|
||||
// If force is true, the page will be expanded to make the record fit if needed.
|
||||
// Return value is whether or not the record was added to the page.
|
||||
bool addRecord(const RedwoodRecordRef& rec, int deltaSize, bool force) {
|
||||
bool addRecord(const RedwoodRecordRef& rec, const RedwoodRecordRef& nextRecord, int deltaSize, bool force) {
|
||||
|
||||
int nodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(largeDeltaTree);
|
||||
|
||||
// If the record doesn't fit and the page can't be expanded then return false
|
||||
|
@ -5664,6 +5791,53 @@ private:
|
|||
return false;
|
||||
}
|
||||
|
||||
if (useEncryptionDomain) {
|
||||
int64_t defaultDomainId = keyProvider->getDefaultEncryptionDomainId();
|
||||
int64_t currentDomainId;
|
||||
size_t prefixLength;
|
||||
if (count == 0 || (splitByDomain && count > 0)) {
|
||||
std::tie(currentDomainId, prefixLength) = keyProvider->getEncryptionDomain(rec.key, domainId);
|
||||
}
|
||||
if (count == 0) {
|
||||
domainId = currentDomainId;
|
||||
domainPrefixLength = prefixLength;
|
||||
canUseDefaultDomain =
|
||||
(height > 1 && (currentDomainId == defaultDomainId || prefixLength == rec.key.size()));
|
||||
} else if (splitByDomain) {
|
||||
ASSERT(domainId.present());
|
||||
if (domainId == currentDomainId) {
|
||||
// The new record falls in the same domain as the rest of the page.
|
||||
// Since this is not the first record, the key must contain a non-prefix portion,
|
||||
// so we cannot use the default domain the encrypt the page (unless domainId is the default
|
||||
// domain).
|
||||
if (domainId != defaultDomainId) {
|
||||
ASSERT(prefixLength < rec.key.size());
|
||||
canUseDefaultDomain = false;
|
||||
}
|
||||
} else if (canUseDefaultDomain &&
|
||||
(currentDomainId == defaultDomainId ||
|
||||
(prefixLength == rec.key.size() &&
|
||||
(nextRecord.key.empty() ||
|
||||
!nextRecord.key.startsWith(rec.key.substr(0, prefixLength)))))) {
|
||||
// The new record meets one of the following conditions:
|
||||
// 1. it falls in the default domain, or
|
||||
// 2. its key contain only the domain prefix, and
|
||||
// 2a. the following record doesn't fall in the same domain.
|
||||
// In this case switch to use the default domain to encrypt the page.
|
||||
// Condition 2a is needed, because if there are multiple records from the same domain,
|
||||
// they need to form their own page(s).
|
||||
domainId = defaultDomainId;
|
||||
domainPrefixLength = 0;
|
||||
} else {
|
||||
// The new record doesn't fit in the same domain as the existing page.
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
ASSERT(domainPrefixLength < rec.key.size());
|
||||
canUseDefaultDomain = false;
|
||||
}
|
||||
}
|
||||
|
||||
++count;
|
||||
bytesLeft -= nodeSize;
|
||||
kvBytes += rec.kvBytes();
|
||||
|
@ -5686,6 +5860,12 @@ private:
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void finish() {
|
||||
if (useEncryptionDomain && canUseDefaultDomain) {
|
||||
domainId = keyProvider->getDefaultEncryptionDomainId();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Scans a vector of records and decides on page split points, returning a vector of 1+ pages to build
|
||||
|
@ -5705,8 +5885,25 @@ private:
|
|||
// Leaves can have just one record if it's large, but internal pages should have at least 4
|
||||
int minRecords = height == 1 ? 1 : 4;
|
||||
double maxSlack = SERVER_KNOBS->REDWOOD_PAGE_REBUILD_MAX_SLACK;
|
||||
RedwoodRecordRef emptyRecord;
|
||||
std::vector<PageToBuild> pages;
|
||||
|
||||
// Whether encryption is used and we need to set encryption domain for a page.
|
||||
bool useEncryptionDomain =
|
||||
ArenaPage::isEncodingTypeEncrypted(m_encodingType) && m_keyProvider->enableEncryptionDomain();
|
||||
// Whether we may need to split by encryption domain. It is mean to be an optimization to avoid
|
||||
// unnecessary domain check and may not be exhaust all cases.
|
||||
bool splitByDomain = false;
|
||||
if (useEncryptionDomain && records.size() > 1) {
|
||||
int64_t firstDomain = std::get<0>(m_keyProvider->getEncryptionDomain(records[0].key));
|
||||
int64_t lastDomain = std::get<0>(m_keyProvider->getEncryptionDomain(records[records.size() - 1].key));
|
||||
// If the two record falls in the same non-default domain, we know all the records fall in the
|
||||
// same domain. Otherwise we may need to split pages by domain.
|
||||
if (firstDomain != lastDomain || firstDomain == m_keyProvider->getDefaultEncryptionDomainId()) {
|
||||
splitByDomain = true;
|
||||
}
|
||||
}
|
||||
|
||||
// deltaSizes contains pair-wise delta sizes for [lowerBound, records..., upperBound]
|
||||
std::vector<int> deltaSizes(records.size() + 1);
|
||||
deltaSizes.front() = records.front().deltaSize(*lowerBound, prefixLen, true);
|
||||
|
@ -5715,28 +5912,34 @@ private:
|
|||
deltaSizes[i] = records[i].deltaSize(records[i - 1], prefixLen, true);
|
||||
}
|
||||
|
||||
PageToBuild p(0, m_blockSize, m_encodingType);
|
||||
PageToBuild p(
|
||||
0, m_blockSize, m_encodingType, height, useEncryptionDomain, splitByDomain, m_keyProvider.getPtr());
|
||||
|
||||
for (int i = 0; i < records.size(); ++i) {
|
||||
for (int i = 0; i < records.size();) {
|
||||
bool force = p.count < minRecords || p.slackFraction() > maxSlack;
|
||||
debug_printf(
|
||||
" before addRecord i=%d records=%d deltaSize=%d kvSize=%d force=%d pageToBuild=%s record=%s",
|
||||
i,
|
||||
records.size(),
|
||||
deltaSizes[i],
|
||||
records[i].kvBytes(),
|
||||
force,
|
||||
p.toString().c_str(),
|
||||
records[i].toString(height == 1).c_str());
|
||||
if (i == 0 || p.count > 0) {
|
||||
debug_printf(" before addRecord i=%d records=%d deltaSize=%d kvSize=%d force=%d pageToBuild=%s "
|
||||
"record=%s",
|
||||
i,
|
||||
records.size(),
|
||||
deltaSizes[i],
|
||||
records[i].kvBytes(),
|
||||
force,
|
||||
p.toString().c_str(),
|
||||
records[i].toString(height == 1).c_str());
|
||||
}
|
||||
|
||||
if (!p.addRecord(records[i], deltaSizes[i], force)) {
|
||||
if (!p.addRecord(records[i], i + 1 < records.size() ? records[i + 1] : emptyRecord, deltaSizes[i], force)) {
|
||||
p.finish();
|
||||
pages.push_back(p);
|
||||
p = p.next(m_encodingType);
|
||||
p.addRecord(records[i], deltaSizes[i], true);
|
||||
p = p.next();
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
if (p.count > 0) {
|
||||
p.finish();
|
||||
pages.push_back(p);
|
||||
}
|
||||
|
||||
|
@ -5749,15 +5952,20 @@ private:
|
|||
PageToBuild& a = pages[pages.size() - 2];
|
||||
PageToBuild& b = pages.back();
|
||||
|
||||
// While the last page page has too much slack and the second to last page
|
||||
// has more than the minimum record count, shift a record from the second
|
||||
// to last page to the last page.
|
||||
while (b.slackFraction() > maxSlack && a.count > minRecords) {
|
||||
int i = a.lastIndex();
|
||||
if (!PageToBuild::shiftItem(a, b, deltaSizes[i], records[i].kvBytes())) {
|
||||
break;
|
||||
// We can rebalance the two pages only if they are in the same encryption domain.
|
||||
ASSERT(!useEncryptionDomain || (a.domainId.present() && b.domainId.present()));
|
||||
if (!useEncryptionDomain || a.domainId.get() == b.domainId.get()) {
|
||||
|
||||
// While the last page page has too much slack and the second to last page
|
||||
// has more than the minimum record count, shift a record from the second
|
||||
// to last page to the last page.
|
||||
while (b.slackFraction() > maxSlack && a.count > minRecords) {
|
||||
int i = a.lastIndex();
|
||||
if (!PageToBuild::shiftItem(a, b, deltaSizes[i], records[i].kvBytes())) {
|
||||
break;
|
||||
}
|
||||
debug_printf(" After shifting i=%d: a=%s b=%s\n", i, a.toString().c_str(), b.toString().c_str());
|
||||
}
|
||||
debug_printf(" After shifting i=%d: a=%s b=%s\n", i, a.toString().c_str(), b.toString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5780,8 +5988,13 @@ private:
|
|||
// All records share the prefix shared by the lower and upper boundaries
|
||||
state int prefixLen = lowerBound->getCommonPrefixLen(*upperBound);
|
||||
|
||||
// Whether encryption is used and we need to set encryption domain for a page.
|
||||
state bool useEncryptionDomain =
|
||||
ArenaPage::isEncodingTypeEncrypted(self->m_encodingType) && self->m_keyProvider->enableEncryptionDomain();
|
||||
|
||||
state std::vector<PageToBuild> pagesToBuild =
|
||||
self->splitPages(lowerBound, upperBound, prefixLen, entries, height);
|
||||
ASSERT(pagesToBuild.size() > 0);
|
||||
debug_printf("splitPages returning %s\n", toString(pagesToBuild).c_str());
|
||||
|
||||
// Lower bound of the page being added to
|
||||
|
@ -5791,6 +6004,18 @@ private:
|
|||
|
||||
state int pageIndex;
|
||||
|
||||
if (useEncryptionDomain) {
|
||||
ASSERT(pagesToBuild[0].domainId.present());
|
||||
int64_t domainId = pagesToBuild[0].domainId.get();
|
||||
// We need to make sure we use the domain prefix as the page lower bound, for the first page
|
||||
// of a non-default domain on a level. That way we ensure that pages for a domain form a full subtree
|
||||
// (i.e. have a single root) in the B-tree.
|
||||
if (domainId != self->m_keyProvider->getDefaultEncryptionDomainId() &&
|
||||
!self->m_keyProvider->keyFitsInDomain(domainId, pageLowerBound.key, false)) {
|
||||
pageLowerBound = RedwoodRecordRef(entries[0].key.substr(0, pagesToBuild[0].domainPrefixLength));
|
||||
}
|
||||
}
|
||||
|
||||
for (pageIndex = 0; pageIndex < pagesToBuild.size(); ++pageIndex) {
|
||||
debug_printf("building page %d of %zu %s\n",
|
||||
pageIndex + 1,
|
||||
|
@ -5798,6 +6023,30 @@ private:
|
|||
pagesToBuild[pageIndex].toString().c_str());
|
||||
ASSERT(pagesToBuild[pageIndex].count != 0);
|
||||
|
||||
// Use the next entry as the upper bound, or upperBound if there are no more entries beyond this page
|
||||
int endIndex = pagesToBuild[pageIndex].endIndex();
|
||||
bool lastPage = endIndex == entries.size();
|
||||
pageUpperBound = lastPage ? upperBound->withoutValue() : entries[endIndex].withoutValue();
|
||||
|
||||
if (!lastPage) {
|
||||
PageToBuild& p = pagesToBuild[pageIndex];
|
||||
PageToBuild& nextPage = pagesToBuild[pageIndex + 1];
|
||||
if (height == 1) {
|
||||
// If this is a leaf page, and not the last one to be written, shorten the upper boundary)
|
||||
int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[endIndex - 1], prefixLen);
|
||||
pageUpperBound.truncate(commonPrefix + 1);
|
||||
}
|
||||
if (useEncryptionDomain) {
|
||||
ASSERT(p.domainId.present());
|
||||
ASSERT(nextPage.domainId.present());
|
||||
if (p.domainId.get() != nextPage.domainId.get() &&
|
||||
nextPage.domainId.get() != self->m_keyProvider->getDefaultEncryptionDomainId()) {
|
||||
pageUpperBound =
|
||||
RedwoodRecordRef(entries[nextPage.startIndex].key.substr(0, nextPage.domainPrefixLength));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// For internal pages, skip first entry if child link is null. Such links only exist
|
||||
// to maintain a borrow-able prefix for the previous subtree after a subtree deletion.
|
||||
// If the null link falls on a new page post-split, then the pageLowerBound of the page
|
||||
|
@ -5811,38 +6060,34 @@ private:
|
|||
--p.count;
|
||||
debug_printf("Skipping first null record, new count=%d\n", p.count);
|
||||
|
||||
// If the page is now empty then it must be the last page in pagesToBuild, otherwise there would
|
||||
// be more than 1 item since internal pages need to have multiple children. While there is no page
|
||||
// to be built here, a record must be added to the output set because the upper boundary of the last
|
||||
// In case encryption or encryption domain is not enabled, if the page is now empty then it must be the
|
||||
// last page in pagesToBuild, otherwise there would be more than 1 item since internal pages need to
|
||||
// have multiple children. In case encryption and encryption domain is enabled, however, because of the
|
||||
// page split by encryption domain, it may not be the last page.
|
||||
//
|
||||
// Either way, a record must be added to the output set because the upper boundary of the last
|
||||
// page built does not match the upper boundary of the original page that this call to writePages() is
|
||||
// replacing. Put another way, the upper boundary of the rightmost page of the page set that was just
|
||||
// built does not match the upper boundary of the original page that the page set is replacing, so
|
||||
// adding the extra null link fixes this.
|
||||
if (p.count == 0) {
|
||||
ASSERT(pageIndex == pagesToBuild.size() - 1);
|
||||
records.push_back_deep(records.arena(), pageUpperBound);
|
||||
break;
|
||||
ASSERT(useEncryptionDomain || lastPage);
|
||||
records.push_back_deep(records.arena(), pageLowerBound);
|
||||
pageLowerBound = pageUpperBound;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Use the next entry as the upper bound, or upperBound if there are no more entries beyond this page
|
||||
int endIndex = pagesToBuild[pageIndex].endIndex();
|
||||
bool lastPage = endIndex == entries.size();
|
||||
pageUpperBound = lastPage ? upperBound->withoutValue() : entries[endIndex].withoutValue();
|
||||
|
||||
// If this is a leaf page, and not the last one to be written, shorten the upper boundary
|
||||
if (!lastPage && height == 1) {
|
||||
int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[endIndex - 1], prefixLen);
|
||||
pageUpperBound.truncate(commonPrefix + 1);
|
||||
}
|
||||
|
||||
// Create and init page here otherwise many variables must become state vars
|
||||
state Reference<ArenaPage> page = self->m_pager->newPageBuffer(pagesToBuild[pageIndex].blockCount);
|
||||
page->init(self->m_encodingType,
|
||||
(pagesToBuild[pageIndex].blockCount == 1) ? PageType::BTreeNode : PageType::BTreeSuperNode,
|
||||
height);
|
||||
if (page->isEncrypted()) {
|
||||
ArenaPage::EncryptionKey k = wait(self->m_keyProvider->getLatestDefaultEncryptionKey());
|
||||
ArenaPage::EncryptionKey k =
|
||||
wait(useEncryptionDomain
|
||||
? self->m_keyProvider->getLatestEncryptionKey(pagesToBuild[pageIndex].domainId.get())
|
||||
: self->m_keyProvider->getLatestDefaultEncryptionKey());
|
||||
page->encryptionKey = k;
|
||||
}
|
||||
|
||||
|
@ -5935,7 +6180,8 @@ private:
|
|||
}
|
||||
|
||||
if (self->m_pBoundaryVerifier != nullptr) {
|
||||
self->m_pBoundaryVerifier->update(childPageID, v, pageLowerBound.key, pageUpperBound.key);
|
||||
ASSERT(self->m_pBoundaryVerifier->update(
|
||||
childPageID, v, pageLowerBound.key, pageUpperBound.key, height, pagesToBuild[pageIndex].domainId));
|
||||
}
|
||||
|
||||
if (++sinceYield > 100) {
|
||||
|
@ -5984,9 +6230,26 @@ private:
|
|||
// commit record, build a new root page and update records to be a link to that new page.
|
||||
// Root pointer size is limited because the pager commit header is limited to smallestPhysicalBlock in
|
||||
// size.
|
||||
//
|
||||
// There's another case. When encryption domain is enabled, we want to make sure the root node is encrypted
|
||||
// using the default encryption domain. An indication that's not true is when the first record is not using
|
||||
// dbBegin as key.
|
||||
while (records.size() > 1 ||
|
||||
records.front().getChildPage().size() > (BUGGIFY ? 1 : BTreeCommitHeader::maxRootPointerSize)) {
|
||||
records.front().getChildPage().size() > (BUGGIFY ? 1 : BTreeCommitHeader::maxRootPointerSize) ||
|
||||
records[0].key != dbBegin.key) {
|
||||
CODE_PROBE(records.size() == 1, "Writing a new root because the current root pointer would be too large");
|
||||
if (records[0].key != dbBegin.key) {
|
||||
ASSERT(self->m_keyProvider.isValid() && self->m_keyProvider->enableEncryption() &&
|
||||
self->m_keyProvider->enableEncryptionDomain());
|
||||
int64_t domainId;
|
||||
size_t prefixLength;
|
||||
std::tie(domainId, prefixLength) = self->m_keyProvider->getEncryptionDomain(records[0].key);
|
||||
ASSERT(domainId != self->m_keyProvider->getDefaultEncryptionDomainId());
|
||||
ASSERT(records[0].key.size() == prefixLength);
|
||||
CODE_PROBE(true,
|
||||
"Writing a new root because the current root is encrypted with non-default encryption "
|
||||
"domain cipher key");
|
||||
}
|
||||
self->m_header.height = ++height;
|
||||
ASSERT(height < std::numeric_limits<int8_t>::max());
|
||||
Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait(
|
||||
|
@ -6165,7 +6428,7 @@ private:
|
|||
self->m_pager->updatePage(PagerEventReasons::Commit, height, newID, page);
|
||||
|
||||
if (self->m_pBoundaryVerifier != nullptr) {
|
||||
self->m_pBoundaryVerifier->update(writeVersion, oldID.front(), newID.front());
|
||||
self->m_pBoundaryVerifier->updatePageId(writeVersion, oldID.front(), newID.front());
|
||||
}
|
||||
|
||||
self->freeBTreePage(height, oldID, writeVersion);
|
||||
|
@ -6327,8 +6590,14 @@ private:
|
|||
|
||||
struct InternalPageModifier {
|
||||
InternalPageModifier() {}
|
||||
InternalPageModifier(Reference<const ArenaPage> p, bool alreadyCloned, bool updating, ParentInfo* parentInfo)
|
||||
: updating(updating), page(p), clonedPage(alreadyCloned), changesMade(false), parentInfo(parentInfo) {}
|
||||
InternalPageModifier(Reference<const ArenaPage> p,
|
||||
bool alreadyCloned,
|
||||
bool updating,
|
||||
ParentInfo* parentInfo,
|
||||
Reference<IPageEncryptionKeyProvider> keyProvider,
|
||||
Optional<int64_t> pageDomainId)
|
||||
: updating(updating), page(p), clonedPage(alreadyCloned), changesMade(false), parentInfo(parentInfo),
|
||||
keyProvider(keyProvider), pageDomainId(pageDomainId) {}
|
||||
|
||||
// Whether updating the existing page is allowed
|
||||
bool updating;
|
||||
|
@ -6343,6 +6612,9 @@ private:
|
|||
bool changesMade;
|
||||
ParentInfo* parentInfo;
|
||||
|
||||
Reference<IPageEncryptionKeyProvider> keyProvider;
|
||||
Optional<int64_t> pageDomainId;
|
||||
|
||||
BTreePage* btPage() const { return (BTreePage*)page->mutateData(); }
|
||||
|
||||
bool empty() const {
|
||||
|
@ -6365,6 +6637,7 @@ private:
|
|||
void insert(BTreePage::BinaryTree::Cursor end, const VectorRef<RedwoodRecordRef>& recs) {
|
||||
int i = 0;
|
||||
if (updating) {
|
||||
cloneForUpdate();
|
||||
// Update must be done in the new tree, not the original tree where the end cursor will be from
|
||||
end.switchTree(btPage()->tree());
|
||||
|
||||
|
@ -6373,7 +6646,18 @@ private:
|
|||
const RedwoodRecordRef& rec = recs[i];
|
||||
debug_printf("internal page (updating) insert: %s\n", rec.toString(false).c_str());
|
||||
|
||||
if (!end.insert(rec)) {
|
||||
// Fail if the inserted record does not belong to the same encryption domain as the existing page
|
||||
// data.
|
||||
bool canInsert = true;
|
||||
if (page->isEncrypted() && keyProvider->enableEncryptionDomain()) {
|
||||
ASSERT(keyProvider && pageDomainId.present());
|
||||
canInsert = keyProvider->keyFitsInDomain(pageDomainId.get(), rec.key, true);
|
||||
}
|
||||
if (canInsert) {
|
||||
canInsert = end.insert(rec);
|
||||
}
|
||||
|
||||
if (!canInsert) {
|
||||
debug_printf("internal page: failed to insert %s, switching to rebuild\n",
|
||||
rec.toString(false).c_str());
|
||||
|
||||
|
@ -6432,14 +6716,11 @@ private:
|
|||
|
||||
// If the children changed, replace [cBegin, cEnd) with newLinks
|
||||
if (u.childrenChanged) {
|
||||
cloneForUpdate();
|
||||
if (updating) {
|
||||
auto c = u.cBegin;
|
||||
|
||||
if (c != u.cEnd) {
|
||||
cloneForUpdate();
|
||||
// must point c to the tree to erase from
|
||||
c.switchTree(btPage()->tree());
|
||||
}
|
||||
// must point c to the tree to erase from
|
||||
c.switchTree(btPage()->tree());
|
||||
|
||||
while (c != u.cEnd) {
|
||||
debug_printf("applyUpdate (updating) erasing: %s\n", c.get().toString(false).c_str());
|
||||
|
@ -6489,6 +6770,55 @@ private:
|
|||
}
|
||||
};
|
||||
|
||||
ACTOR static Future<Void> buildNewSubtree(VersionedBTree* self,
|
||||
Version version,
|
||||
LogicalPageID parentID,
|
||||
unsigned int height,
|
||||
MutationBuffer::const_iterator mBegin,
|
||||
MutationBuffer::const_iterator mEnd,
|
||||
InternalPageSliceUpdate* update) {
|
||||
ASSERT(height > 1);
|
||||
debug_printf(
|
||||
"buildNewSubtree start version %" PRId64 ", height %u, %s\n'", version, height, update->toString().c_str());
|
||||
state Standalone<VectorRef<RedwoodRecordRef>> records;
|
||||
while (mBegin != mEnd && mBegin.key() < update->subtreeLowerBound.key) {
|
||||
++mBegin;
|
||||
}
|
||||
while (mBegin != mEnd) {
|
||||
if (mBegin.mutation().boundarySet()) {
|
||||
RedwoodRecordRef rec(mBegin.key(), mBegin.mutation().boundaryValue.get());
|
||||
records.push_back_deep(records.arena(), rec);
|
||||
if (REDWOOD_DEBUG) {
|
||||
debug_printf(" Added %s", rec.toString().c_str());
|
||||
}
|
||||
}
|
||||
++mBegin;
|
||||
}
|
||||
if (records.empty()) {
|
||||
update->cleared();
|
||||
} else {
|
||||
state unsigned int h = 1;
|
||||
debug_printf("buildNewSubtree at level %u\n", h);
|
||||
while (h < height) {
|
||||
// Only the parentID at the root is known as we are building the subtree bottom-up.
|
||||
// We use the parentID for all levels, since the parentID is currently used for
|
||||
// debug use only.
|
||||
Standalone<VectorRef<RedwoodRecordRef>> newRecords = wait(writePages(self,
|
||||
&update->subtreeLowerBound,
|
||||
&update->subtreeUpperBound,
|
||||
records,
|
||||
h,
|
||||
version,
|
||||
BTreeNodeLinkRef(),
|
||||
parentID));
|
||||
records = newRecords;
|
||||
h++;
|
||||
}
|
||||
update->rebuilt(records);
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR static Future<Void> commitSubtree(
|
||||
VersionedBTree* self,
|
||||
CommitBatch* batch,
|
||||
|
@ -6545,6 +6875,12 @@ private:
|
|||
// TryToUpdate indicates insert and erase operations should be tried on the existing page first
|
||||
state bool tryToUpdate = btPage->tree()->numItems > 0 && update->boundariesNormal();
|
||||
|
||||
state bool useEncryptionDomain = page->isEncrypted() && self->m_keyProvider->enableEncryptionDomain();
|
||||
state Optional<int64_t> pageDomainId;
|
||||
if (useEncryptionDomain) {
|
||||
pageDomainId = page->getEncryptionDomainId();
|
||||
}
|
||||
|
||||
debug_printf("%s tryToUpdate=%d\n", context.c_str(), tryToUpdate);
|
||||
debug_print(addPrefix(context,
|
||||
btPage->toString("commitSubtreeStart",
|
||||
|
@ -6562,7 +6898,9 @@ private:
|
|||
ASSERT(self->m_pBoundaryVerifier->verify(rootID.front(),
|
||||
batch->snapshot->getVersion(),
|
||||
update->cBegin.get().key,
|
||||
update->cBegin.next().getOrUpperBound().key));
|
||||
update->cBegin.next().getOrUpperBound().key,
|
||||
pageDomainId,
|
||||
cursor));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6661,8 +6999,16 @@ private:
|
|||
|
||||
// If updating, first try to add the record to the page
|
||||
if (updatingDeltaTree) {
|
||||
copyForUpdate();
|
||||
if (cursor.insert(rec, update->skipLen, maxHeightAllowed)) {
|
||||
bool canInsert = true;
|
||||
if (useEncryptionDomain) {
|
||||
ASSERT(pageDomainId.present());
|
||||
canInsert = self->m_keyProvider->keyFitsInDomain(pageDomainId.get(), rec.key, false);
|
||||
}
|
||||
if (canInsert) {
|
||||
copyForUpdate();
|
||||
canInsert = cursor.insert(rec, update->skipLen, maxHeightAllowed);
|
||||
}
|
||||
if (canInsert) {
|
||||
btPage->kvBytes += rec.kvBytes();
|
||||
debug_printf("%s Inserted %s [mutation, boundary start]\n",
|
||||
context.c_str(),
|
||||
|
@ -6881,6 +7227,25 @@ private:
|
|||
|
||||
bool first = true;
|
||||
|
||||
if (useEncryptionDomain && cursor.valid() && update->subtreeLowerBound.key < cursor.get().key) {
|
||||
mEnd = batch->mutations->lower_bound(cursor.get().key);
|
||||
first = false;
|
||||
if (mBegin != mEnd) {
|
||||
slices.emplace_back(new InternalPageSliceUpdate());
|
||||
InternalPageSliceUpdate& u = *slices.back();
|
||||
u.cBegin = cursor;
|
||||
u.cEnd = cursor;
|
||||
u.subtreeLowerBound = update->subtreeLowerBound;
|
||||
u.decodeLowerBound = u.subtreeLowerBound;
|
||||
u.subtreeUpperBound = cursor.get();
|
||||
u.decodeUpperBound = u.subtreeUpperBound;
|
||||
u.expectedUpperBound = u.subtreeUpperBound;
|
||||
u.skipLen = 0;
|
||||
recursions.push_back(
|
||||
self->buildNewSubtree(self, batch->writeVersion, parentID, height, mBegin, mEnd, &u));
|
||||
}
|
||||
}
|
||||
|
||||
while (cursor.valid()) {
|
||||
slices.emplace_back(new InternalPageSliceUpdate());
|
||||
InternalPageSliceUpdate& u = *slices.back();
|
||||
|
@ -7063,7 +7428,8 @@ private:
|
|||
// which to build new page(s) if modification is not possible or not allowed.
|
||||
// If pageCopy is already set it was initialized to page above so the modifier doesn't need
|
||||
// to copy it
|
||||
state InternalPageModifier modifier(page, pageCopy.isValid(), tryToUpdate, parentInfo);
|
||||
state InternalPageModifier modifier(
|
||||
page, pageCopy.isValid(), tryToUpdate, parentInfo, self->m_keyProvider, pageDomainId);
|
||||
|
||||
// Apply the possible changes for each subtree range recursed to, except the last one.
|
||||
// For each range, the expected next record, if any, is checked against the first boundary
|
||||
|
@ -7082,8 +7448,11 @@ private:
|
|||
modifier.changesMade);
|
||||
debug_print(addPrefix(context, update->toString()));
|
||||
|
||||
// TODO(yiwu): check whether we can pass decodeUpperBound as nextBoundary when the last slice
|
||||
// have childenChanged=true.
|
||||
modifier.applyUpdate(*slices.back(),
|
||||
modifier.changesMade ? &update->subtreeUpperBound : &update->decodeUpperBound);
|
||||
modifier.changesMade || slices.back()->childrenChanged ? &update->subtreeUpperBound
|
||||
: &update->decodeUpperBound);
|
||||
|
||||
state bool detachChildren = (parentInfo->count > 2);
|
||||
state bool forceUpdate = false;
|
||||
|
@ -7146,7 +7515,8 @@ private:
|
|||
if (newID != invalidPhysicalPageID) {
|
||||
debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
|
||||
if (self->m_pBoundaryVerifier != nullptr) {
|
||||
self->m_pBoundaryVerifier->update(batch->writeVersion, p, newID);
|
||||
self->m_pBoundaryVerifier->updatePageId(
|
||||
batch->writeVersion, p, newID);
|
||||
}
|
||||
p = newID;
|
||||
++stats.metrics.detachChild;
|
||||
|
@ -7212,7 +7582,8 @@ private:
|
|||
rec.setChildPage(newPages);
|
||||
debug_printf("%s Detach updated %u -> %u\n", context.c_str(), p, newID);
|
||||
if (self->m_pBoundaryVerifier != nullptr) {
|
||||
self->m_pBoundaryVerifier->update(batch->writeVersion, p, newID);
|
||||
self->m_pBoundaryVerifier->updatePageId(
|
||||
batch->writeVersion, p, newID);
|
||||
}
|
||||
++stats.metrics.detachChild;
|
||||
}
|
||||
|
@ -7222,7 +7593,6 @@ private:
|
|||
}
|
||||
parentInfo->clear();
|
||||
}
|
||||
|
||||
Standalone<VectorRef<RedwoodRecordRef>> newChildEntries =
|
||||
wait(writePages(self,
|
||||
&update->subtreeLowerBound,
|
||||
|
@ -7430,17 +7800,24 @@ public:
|
|||
false,
|
||||
!options.present() || options.get().cacheResult || path.back().btPage()->height != 2),
|
||||
[=](Reference<const ArenaPage> p) {
|
||||
BTreePage::BinaryTree::Cursor cursor = btree->getCursor(p.getPtr(), link);
|
||||
#if REDWOOD_DEBUG
|
||||
path.push_back({ p, btree->getCursor(p.getPtr(), link), link.get().getChildPage() });
|
||||
path.push_back({ p, cursor, link.get().getChildPage() });
|
||||
#else
|
||||
path.push_back({ p, btree->getCursor(p.getPtr(), link) });
|
||||
path.push_back({ p, cursor });
|
||||
#endif
|
||||
|
||||
if (btree->m_pBoundaryVerifier != nullptr) {
|
||||
Optional<int64_t> domainId;
|
||||
if (p->isEncrypted() && btree->m_keyProvider->enableEncryptionDomain()) {
|
||||
domainId = p->getEncryptionDomainId();
|
||||
}
|
||||
ASSERT(btree->m_pBoundaryVerifier->verify(link.get().getChildPage().front(),
|
||||
pager->getVersion(),
|
||||
link.get().key,
|
||||
link.next().getOrUpperBound().key));
|
||||
link.next().getOrUpperBound().key,
|
||||
domainId,
|
||||
cursor));
|
||||
}
|
||||
return Void();
|
||||
});
|
||||
|
@ -7723,8 +8100,13 @@ public:
|
|||
// TODO(yiwu): When the cluster encryption config is available later, fail if the cluster is configured to
|
||||
// enable encryption, but the Redwood instance is unencrypted.
|
||||
if (encryptionKeyProvider && encryptionKeyProvider->enableEncryption()) {
|
||||
ASSERT(encryptionKeyProvider->expectedEncodingType() == EncodingType::AESEncryptionV1);
|
||||
encodingType = EncodingType::AESEncryptionV1;
|
||||
m_keyProvider = encryptionKeyProvider;
|
||||
} else if (g_network->isSimulated() && logID.hash() % 2 == 0) {
|
||||
// Simulation only. Deterministically enable encryption based on uid
|
||||
encodingType = EncodingType::XOREncryption_TestOnly;
|
||||
m_keyProvider = makeReference<XOREncryptionKeyProvider_TestOnly>(filename);
|
||||
}
|
||||
|
||||
IPager2* pager = new DWALPager(pageSize,
|
||||
|
@ -9752,6 +10134,11 @@ TEST_CASE("Lredwood/correctness/btree") {
|
|||
state bool serialTest = params.getInt("serialTest").orDefault(deterministicRandom()->random01() < 0.25);
|
||||
state bool shortTest = params.getInt("shortTest").orDefault(deterministicRandom()->random01() < 0.25);
|
||||
|
||||
state int encoding =
|
||||
params.getInt("encodingType").orDefault(deterministicRandom()->randomInt(0, EncodingType::MAX_ENCODING_TYPE));
|
||||
state unsigned int encryptionDomainMode =
|
||||
params.getInt("domainMode")
|
||||
.orDefault(deterministicRandom()->randomInt(0, RandomEncryptionKeyProvider::EncryptionDomainMode::MAX));
|
||||
state int pageSize =
|
||||
shortTest ? 250 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(250, 400));
|
||||
state int extentSize =
|
||||
|
@ -9802,24 +10189,25 @@ TEST_CASE("Lredwood/correctness/btree") {
|
|||
// Max number of records in the BTree or the versioned written map to visit
|
||||
state int64_t maxRecordsRead = params.getInt("maxRecordsRead").orDefault(300e6);
|
||||
|
||||
state EncodingType encodingType =
|
||||
static_cast<EncodingType>(deterministicRandom()->randomInt(0, EncodingType::MAX_ENCODING_TYPE));
|
||||
state EncodingType encodingType = static_cast<EncodingType>(encoding);
|
||||
state Reference<IPageEncryptionKeyProvider> keyProvider;
|
||||
if (encodingType == EncodingType::AESEncryptionV1) {
|
||||
keyProvider = makeReference<RandomEncryptionKeyProvider>();
|
||||
keyProvider = makeReference<RandomEncryptionKeyProvider>(
|
||||
RandomEncryptionKeyProvider::EncryptionDomainMode(encryptionDomainMode));
|
||||
} else if (encodingType == EncodingType::XOREncryption_TestOnly) {
|
||||
keyProvider = makeReference<XOREncryptionKeyProvider_TestOnly>(file);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
printf("file: %s\n", file.c_str());
|
||||
printf("encodingType: %d\n", encodingType);
|
||||
printf("maxPageOps: %" PRId64 "\n", maxPageOps);
|
||||
printf("maxVerificationMapEntries: %d\n", maxVerificationMapEntries);
|
||||
printf("maxRecordsRead: %" PRId64 "\n", maxRecordsRead);
|
||||
printf("pagerMemoryOnly: %d\n", pagerMemoryOnly);
|
||||
printf("serialTest: %d\n", serialTest);
|
||||
printf("shortTest: %d\n", shortTest);
|
||||
printf("encodingType: %d\n", encodingType);
|
||||
printf("domainMode: %d\n", encryptionDomainMode);
|
||||
printf("pageSize: %d\n", pageSize);
|
||||
printf("extentSize: %d\n", extentSize);
|
||||
printf("maxKeySize: %d\n", maxKeySize);
|
||||
|
|
|
@ -37,6 +37,7 @@
|
|||
#define XXH_INLINE_ALL
|
||||
#include "flow/xxhash.h"
|
||||
|
||||
#include <functional>
|
||||
#include <tuple>
|
||||
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
@ -72,10 +73,12 @@ public:
|
|||
virtual bool enableEncryptionDomain() const { return false; }
|
||||
|
||||
// Get an encryption key from given encoding header.
|
||||
virtual Future<EncryptionKey> getEncryptionKey(void* encodingHeader) { throw not_implemented(); }
|
||||
virtual Future<EncryptionKey> getEncryptionKey(const void* encodingHeader) { throw not_implemented(); }
|
||||
|
||||
// Get latest encryption key. If encryption domain is enabled, get encryption key for the default domain.
|
||||
virtual Future<EncryptionKey> getLatestDefaultEncryptionKey() { throw not_implemented(); }
|
||||
virtual Future<EncryptionKey> getLatestDefaultEncryptionKey() {
|
||||
return getLatestEncryptionKey(getDefaultEncryptionDomainId());
|
||||
}
|
||||
|
||||
// Get latest encryption key for data in given encryption domain.
|
||||
virtual Future<EncryptionKey> getLatestEncryptionKey(int64_t domainId) { throw not_implemented(); }
|
||||
|
@ -94,10 +97,22 @@ public:
|
|||
}
|
||||
|
||||
// Get encryption domain of a page given encoding header.
|
||||
virtual int64_t getEncryptionDomain(void* encodingHeader) { throw not_implemented(); }
|
||||
virtual int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) { throw not_implemented(); }
|
||||
|
||||
// Setting tenant prefix to tenant name map. Used by TenantAwareEncryptionKeyProvider.
|
||||
virtual void setTenantPrefixIndex(Reference<TenantPrefixIndex> tenantPrefixIndex) {}
|
||||
|
||||
// Helper methods.
|
||||
|
||||
// Check if a key fits in an encryption domain.
|
||||
bool keyFitsInDomain(int64_t domainId, const KeyRef& key, bool canUseDefaultDomain) {
|
||||
ASSERT(enableEncryptionDomain());
|
||||
int64_t keyDomainId;
|
||||
size_t prefixLength;
|
||||
std::tie(keyDomainId, prefixLength) = getEncryptionDomain(key);
|
||||
return keyDomainId == domainId ||
|
||||
(canUseDefaultDomain && (domainId == getDefaultEncryptionDomainId() && key.size() == prefixLength));
|
||||
}
|
||||
};
|
||||
|
||||
// The null key provider is useful to simplify page decoding.
|
||||
|
@ -133,39 +148,20 @@ public:
|
|||
|
||||
bool enableEncryption() const override { return true; }
|
||||
|
||||
bool enableEncryptionDomain() const override { return true; }
|
||||
Future<EncryptionKey> getEncryptionKey(const void* encodingHeader) override {
|
||||
|
||||
Future<EncryptionKey> getEncryptionKey(void* encodingHeader) override {
|
||||
|
||||
EncodingHeader* h = reinterpret_cast<EncodingHeader*>(encodingHeader);
|
||||
const EncodingHeader* h = reinterpret_cast<const EncodingHeader*>(encodingHeader);
|
||||
EncryptionKey s;
|
||||
s.xorKey = h->xorKey;
|
||||
return s;
|
||||
}
|
||||
|
||||
Future<EncryptionKey> getLatestDefaultEncryptionKey() override { return getLatestEncryptionKey(0); }
|
||||
|
||||
Future<EncryptionKey> getLatestEncryptionKey(int64_t domainId) override {
|
||||
Future<EncryptionKey> getLatestDefaultEncryptionKey() override {
|
||||
EncryptionKey s;
|
||||
s.xorKey = ~(uint8_t)domainId ^ xorWith;
|
||||
s.xorKey = xorWith;
|
||||
return s;
|
||||
}
|
||||
|
||||
int64_t getDefaultEncryptionDomainId() const override { return 0; }
|
||||
|
||||
std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key,
|
||||
Optional<int64_t> /*possibleDomainId*/) override {
|
||||
if (key.size() > 0) {
|
||||
return { *key.begin(), 1 };
|
||||
}
|
||||
return { 0, 0 };
|
||||
}
|
||||
|
||||
int64_t getEncryptionDomain(void* encodingHeader) override {
|
||||
uint8_t xorKey = reinterpret_cast<EncodingHeader*>(encodingHeader)->xorKey;
|
||||
return (int64_t)(~xorKey ^ xorWith);
|
||||
}
|
||||
|
||||
uint8_t xorWith;
|
||||
};
|
||||
|
||||
|
@ -173,11 +169,19 @@ public:
|
|||
// Use for testing.
|
||||
class RandomEncryptionKeyProvider : public IPageEncryptionKeyProvider {
|
||||
public:
|
||||
RandomEncryptionKeyProvider() {
|
||||
enum EncryptionDomainMode : unsigned int {
|
||||
DISABLED = 0, // disable encryption domain
|
||||
RANDOM, // for each key prefix, deterministic randomly decide if there's an encryption domain for it.
|
||||
ALL, // all key prefixes has an encryption domain assigned to it.
|
||||
MAX,
|
||||
};
|
||||
|
||||
explicit RandomEncryptionKeyProvider(EncryptionDomainMode mode) : mode(mode) {
|
||||
ASSERT(mode < EncryptionDomainMode::MAX);
|
||||
for (unsigned i = 0; i < NUM_CIPHER; i++) {
|
||||
BlobCipherDetails cipherDetails;
|
||||
cipherDetails.encryptDomainId = i;
|
||||
cipherDetails.baseCipherId = deterministicRandom()->randomUInt64();
|
||||
cipherDetails.encryptDomainId = 0;
|
||||
cipherDetails.baseCipherId = i;
|
||||
cipherDetails.salt = deterministicRandom()->randomUInt64();
|
||||
cipherKeys[i] = generateCipherKey(cipherDetails);
|
||||
}
|
||||
|
@ -188,22 +192,47 @@ public:
|
|||
|
||||
bool enableEncryption() const override { return true; }
|
||||
|
||||
Future<EncryptionKey> getEncryptionKey(void* encodingHeader) override {
|
||||
bool enableEncryptionDomain() const override { return mode > 1; }
|
||||
|
||||
Future<EncryptionKey> getEncryptionKey(const void* encodingHeader) override {
|
||||
using Header = ArenaPage::AESEncryptionV1Encoder::Header;
|
||||
Header* h = reinterpret_cast<Header*>(encodingHeader);
|
||||
const Header* h = reinterpret_cast<const Header*>(encodingHeader);
|
||||
EncryptionKey s;
|
||||
s.aesKey.cipherTextKey = cipherKeys[h->cipherTextDetails.encryptDomainId];
|
||||
s.aesKey.cipherHeaderKey = cipherKeys[h->cipherHeaderDetails.encryptDomainId];
|
||||
s.aesKey.cipherTextKey = getCipherKey(h->cipherTextDetails.encryptDomainId, h->cipherTextDetails.baseCipherId);
|
||||
s.aesKey.cipherHeaderKey =
|
||||
getCipherKey(h->cipherHeaderDetails.encryptDomainId, h->cipherHeaderDetails.baseCipherId);
|
||||
return s;
|
||||
}
|
||||
|
||||
Future<EncryptionKey> getLatestDefaultEncryptionKey() override {
|
||||
Future<EncryptionKey> getLatestEncryptionKey(int64_t domainId) override {
|
||||
domainId = checkDomainId(domainId);
|
||||
EncryptionKey s;
|
||||
s.aesKey.cipherTextKey = cipherKeys[deterministicRandom()->randomInt(0, NUM_CIPHER)];
|
||||
s.aesKey.cipherHeaderKey = cipherKeys[deterministicRandom()->randomInt(0, NUM_CIPHER)];
|
||||
s.aesKey.cipherTextKey = getCipherKey(domainId, deterministicRandom()->randomInt(0, NUM_CIPHER));
|
||||
s.aesKey.cipherHeaderKey =
|
||||
getCipherKey(ENCRYPT_HEADER_DOMAIN_ID, deterministicRandom()->randomInt(0, NUM_CIPHER));
|
||||
return s;
|
||||
}
|
||||
|
||||
int64_t getDefaultEncryptionDomainId() const override { return FDB_DEFAULT_ENCRYPT_DOMAIN_ID; }
|
||||
|
||||
std::tuple<int64_t, size_t> getEncryptionDomain(const KeyRef& key, Optional<int64_t>) override {
|
||||
int64_t domainId;
|
||||
if (key.size() < PREFIX_LENGTH) {
|
||||
domainId = getDefaultEncryptionDomainId();
|
||||
} else {
|
||||
// Use first 4 bytes as a 32-bit int for the domain id.
|
||||
domainId = checkDomainId(static_cast<int64_t>(*reinterpret_cast<const int32_t*>(key.begin())));
|
||||
}
|
||||
return { domainId, (domainId == getDefaultEncryptionDomainId() ? 0 : PREFIX_LENGTH) };
|
||||
}
|
||||
|
||||
int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) override {
|
||||
ASSERT(encodingHeader != nullptr);
|
||||
using Header = ArenaPage::AESEncryptionV1Encoder::Header;
|
||||
const Header* h = reinterpret_cast<const Header*>(encodingHeader);
|
||||
return h->cipherTextDetails.encryptDomainId;
|
||||
}
|
||||
|
||||
private:
|
||||
Reference<BlobCipherKey> generateCipherKey(const BlobCipherDetails& cipherDetails) {
|
||||
static unsigned char SHA_KEY[] = "3ab9570b44b8315fdb261da6b1b6c13b";
|
||||
|
@ -226,7 +255,28 @@ private:
|
|||
std::numeric_limits<int64_t>::max() /* expireAt */);
|
||||
}
|
||||
|
||||
int64_t checkDomainId(int64_t domainId) {
|
||||
std::hash<int64_t> hasher;
|
||||
if (mode == DISABLED || (mode == RANDOM && hasher(domainId) % 2 == 0)) {
|
||||
return getDefaultEncryptionDomainId();
|
||||
}
|
||||
return domainId;
|
||||
}
|
||||
|
||||
Reference<BlobCipherKey> getCipherKey(EncryptCipherDomainId domainId, EncryptCipherBaseKeyId cipherId) {
|
||||
// Create a new cipher key by replacing the domain id.
|
||||
return makeReference<BlobCipherKey>(domainId,
|
||||
cipherId,
|
||||
cipherKeys[cipherId]->rawBaseCipher(),
|
||||
AES_256_KEY_LENGTH,
|
||||
cipherKeys[cipherId]->getSalt(),
|
||||
std::numeric_limits<int64_t>::max() /* refreshAt */,
|
||||
std::numeric_limits<int64_t>::max() /* expireAt */);
|
||||
}
|
||||
|
||||
static constexpr int NUM_CIPHER = 1000;
|
||||
static constexpr size_t PREFIX_LENGTH = 4;
|
||||
EncryptionDomainMode mode;
|
||||
Reference<BlobCipherKey> cipherKeys[NUM_CIPHER];
|
||||
};
|
||||
|
||||
|
@ -248,8 +298,9 @@ public:
|
|||
|
||||
bool enableEncryptionDomain() const override { return true; }
|
||||
|
||||
ACTOR static Future<EncryptionKey> getEncryptionKey(TenantAwareEncryptionKeyProvider* self, void* encodingHeader) {
|
||||
BlobCipherEncryptHeader* header = reinterpret_cast<EncodingHeader*>(encodingHeader);
|
||||
ACTOR static Future<EncryptionKey> getEncryptionKey(TenantAwareEncryptionKeyProvider* self,
|
||||
const void* encodingHeader) {
|
||||
const BlobCipherEncryptHeader* header = reinterpret_cast<const EncodingHeader*>(encodingHeader);
|
||||
TextAndHeaderCipherKeys cipherKeys =
|
||||
wait(getEncryptCipherKeys(self->db, *header, BlobCipherMetrics::KV_REDWOOD));
|
||||
EncryptionKey encryptionKey;
|
||||
|
@ -257,7 +308,7 @@ public:
|
|||
return encryptionKey;
|
||||
}
|
||||
|
||||
Future<EncryptionKey> getEncryptionKey(void* encodingHeader) override {
|
||||
Future<EncryptionKey> getEncryptionKey(const void* encodingHeader) override {
|
||||
return getEncryptionKey(this, encodingHeader);
|
||||
}
|
||||
|
||||
|
@ -292,7 +343,7 @@ public:
|
|||
return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 };
|
||||
}
|
||||
StringRef prefix = key.substr(0, TENANT_PREFIX_SIZE);
|
||||
int64_t tenantId = TenantMapEntry::prefixToId(prefix);
|
||||
int64_t tenantId = TenantMapEntry::prefixToId(prefix, EnforceValidTenantId::False);
|
||||
// Tenant id must be non-negative.
|
||||
if (tenantId < 0) {
|
||||
return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 };
|
||||
|
@ -314,8 +365,9 @@ public:
|
|||
return { FDB_DEFAULT_ENCRYPT_DOMAIN_ID, 0 };
|
||||
}
|
||||
|
||||
int64_t getEncryptionDomain(void* encodingHeader) override {
|
||||
BlobCipherEncryptHeader* header = reinterpret_cast<EncodingHeader*>(encodingHeader);
|
||||
int64_t getEncryptionDomainIdFromHeader(const void* encodingHeader) override {
|
||||
ASSERT(encodingHeader != nullptr);
|
||||
const BlobCipherEncryptHeader* header = reinterpret_cast<const EncodingHeader*>(encodingHeader);
|
||||
return header->cipherTextDetails.encryptDomainId;
|
||||
}
|
||||
|
||||
|
|
|
@ -498,7 +498,7 @@ public:
|
|||
// Secret is set if needed
|
||||
// Post: Main and Encoding subheaders are updated
|
||||
// Payload is possibly encrypted
|
||||
void preWrite(PhysicalPageID pageID) const {
|
||||
void preWrite(PhysicalPageID pageID) {
|
||||
// Explicitly check payload definedness to make the source of valgrind errors more clear.
|
||||
// Without this check, calculating a checksum on a payload with undefined bytes does not
|
||||
// cause a valgrind error but the resulting checksum is undefined which causes errors later.
|
||||
|
@ -519,6 +519,7 @@ public:
|
|||
} else {
|
||||
throw page_header_version_not_supported();
|
||||
}
|
||||
encodingHeaderAvailable = true;
|
||||
}
|
||||
|
||||
// Must be called after reading from disk to verify all non-payload bytes
|
||||
|
@ -531,6 +532,7 @@ public:
|
|||
void postReadHeader(PhysicalPageID pageID, bool verify = true) {
|
||||
pPayload = page->getPayload();
|
||||
payloadSize = logicalSize - (pPayload - buffer);
|
||||
encodingHeaderAvailable = true;
|
||||
|
||||
if (page->headerVersion == 1) {
|
||||
if (verify) {
|
||||
|
@ -568,7 +570,18 @@ public:
|
|||
// Returns true if the page's encoding type employs encryption
|
||||
bool isEncrypted() const { return isEncodingTypeEncrypted(getEncodingType()); }
|
||||
|
||||
void* getEncodingHeader() { return page->getEncodingHeader(); }
|
||||
// Return encryption domain id used. This method only use information from the encryptionKey.
|
||||
// Caller should make sure encryption domain is in use.
|
||||
int64_t getEncryptionDomainId() const {
|
||||
// encryption domain is only supported by AESEncryptionV1.
|
||||
ASSERT(getEncodingType() == EncodingType::AESEncryptionV1);
|
||||
const Reference<BlobCipherKey>& cipherKey = encryptionKey.aesKey.cipherTextKey;
|
||||
ASSERT(cipherKey.isValid());
|
||||
return cipherKey->getDomainId();
|
||||
}
|
||||
|
||||
// Return pointer to encoding header.
|
||||
const void* getEncodingHeader() const { return encodingHeaderAvailable ? page->getEncodingHeader() : nullptr; }
|
||||
|
||||
private:
|
||||
Arena arena;
|
||||
|
@ -608,6 +621,9 @@ public:
|
|||
// Used by encodings that do encryption
|
||||
EncryptionKey encryptionKey;
|
||||
|
||||
// Whether encoding header is set
|
||||
bool encodingHeaderAvailable = false;
|
||||
|
||||
mutable ArbitraryObject extra;
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue