foundationdb/fdbserver/BlobGranuleServerCommon.act...

473 lines
19 KiB
C++

/*
* BlobGranuleServerCommon.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2021 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fmt/format.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/Knobs.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // has to be last include
// serialize change feed key as UID bytes, to use 16 bytes on disk
// Streams granuleID through an unversioned BinaryWriter and returns the bytes it produced.
Key granuleIDToCFKey(UID granuleID) {
	BinaryWriter writer(Unversioned());
	writer << granuleID;
	Key cfKey = writer.toValue();
	return cfKey;
}
// parse change feed key back to UID, to be human-readable
// Inverse of granuleIDToCFKey: reads a UID back out of cfKey with an unversioned BinaryReader.
UID cfKeyToGranuleID(Key cfKey) {
	UID granuleID = BinaryReader::fromStringRef<UID>(cfKey, Unversioned());
	return granuleID;
}
// Gets the latest granule history node for range that was persisted
// Reads at most one entry, in reverse order, from the history key range derived from `range`.
// Returns an empty Optional if that read comes back empty. Otherwise returns the decoded entry; the key range
// decoded from the entry's key is asserted to equal `range`.
ACTOR Future<Optional<GranuleHistory>> getLatestGranuleHistory(Transaction* tr, KeyRange range) {
	state KeyRange historyKeys = blobGranuleHistoryKeyRangeFor(range);
	// limit 1 + Reverse::True: only the last entry in the history range is fetched
	state RangeResult newest = wait(tr->getRange(historyKeys, 1, Snapshot::False, Reverse::True));
	ASSERT(newest.size() <= 1);

	if (newest.empty()) {
		return Optional<GranuleHistory>();
	}

	// the key decodes to (granule range, version); the value holds the rest of the history entry
	std::pair<KeyRange, Version> rangeAndVersion = decodeBlobGranuleHistoryKey(newest[0].key);
	ASSERT(range == rangeAndVersion.first);
	Optional<GranuleHistory> latest =
	    GranuleHistory(range, rangeAndVersion.second, decodeBlobGranuleHistoryValue(newest[0].value));
	return latest;
}
// Gets the files based on the file key range [startKey, endKey)
// and populates the files object accordingly
// Pages through [*startKey, endKey) with tr, decoding every row into a BlobFileIndex and appending it to
// files->snapshotFiles (file type 'S') or files->deltaFiles (file type 'D').
// *startKey is advanced past the last row of each full page, so a caller that retries with the same startKey and
// files objects resumes where the previous attempt stopped.
// Asserts that every row belongs to granuleID and that versions within each list are strictly increasing.
ACTOR Future<Void> readGranuleFiles(Transaction* tr, Key* startKey, Key endKey, GranuleFiles* files, UID granuleID) {
	loop {
		// NOTE(review): the tiny limit under BUGGIFY presumably exists to exercise the multi-page path - confirm
		int pageLimit = BUGGIFY ? 2 : 1000;
		RangeResult page = wait(tr->getRange(KeyRangeRef(*startKey, endKey), pageLimit));

		for (auto& row : page) {
			// key -> (granule id, version, file type)
			UID gid;
			Version fileVersion;
			uint8_t kind;
			std::tie(gid, fileVersion, kind) = decodeBlobGranuleFileKey(row.key);
			ASSERT(gid == granuleID);

			// value -> (filename, offset, length, full file length, cipher key metadata)
			Standalone<StringRef> name;
			int64_t fileOffset;
			int64_t fileLength;
			int64_t wholeFileLength;
			Optional<BlobGranuleCipherKeysMeta> cipherMeta;
			std::tie(name, fileOffset, fileLength, wholeFileLength, cipherMeta) =
			    decodeBlobGranuleFileValue(row.value);

			BlobFileIndex entry(fileVersion, name.toString(), fileOffset, fileLength, wholeFileLength, cipherMeta);

			if (kind != 'S') {
				// anything that is not a snapshot must be a delta file
				ASSERT(kind == 'D');
				auto& deltas = files->deltaFiles;
				ASSERT(deltas.empty() || deltas.back().version < entry.version);
				deltas.push_back(entry);
			} else {
				auto& snapshots = files->snapshotFiles;
				ASSERT(snapshots.empty() || snapshots.back().version < entry.version);
				snapshots.push_back(entry);
			}
		}

		if (!page.more) {
			break;
		}
		// more rows remain: continue immediately after the last key of this page
		*startKey = keyAfter(page.back().key);
	}
	return Void();
}
// Wrapper around readGranuleFiles that runs it in its own transaction and retries on error.
// Gets all files belonging to the granule with ID granuleID.
// Creates a transaction on cx and calls readGranuleFiles over the file key range of granuleID until it succeeds.
// The read cursor and the accumulated result are state variables declared outside the retry loop, so they keep
// their values across retries.
ACTOR Future<GranuleFiles> loadHistoryFiles(Database cx, UID granuleID) {
	state KeyRange fileKeys = blobGranuleFileKeyRangeFor(granuleID);
	state Key cursor = fileKeys.begin;
	state GranuleFiles loaded;
	state Transaction tr(cx);

	loop {
		try {
			// options are set at the top of every attempt
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			wait(readGranuleFiles(&tr, &cursor, fileKeys.end, &loaded, granuleID));
			return loaded;
		} catch (Error& err) {
			wait(tr.onError(err));
		}
	}
}
// Normally a beginVersion != 0 means the caller wants all mutations between beginVersion and readVersion, instead of
// the latest snapshot before readVersion + deltas after the snapshot. When canCollapse is set, the beginVersion is
// essentially just an optimization hint. The caller is still concerned with reconstructing rows at readVersion, it just
// knows it doesn't need anything before beginVersion.
// Normally this can eliminate the need for a snapshot and just return a small amount of deltas. But in a highly active
// key range, the granule may have a snapshot file at version X, where beginVersion < X <= readVersion. In this case, if
// the number of bytes in delta files between beginVersion and X is larger than the snapshot file at version X, it is
// strictly more efficient (in terms of files and bytes read) to just use the snapshot file at version X instead.
//
// To support encryption of BlobGranule snapshot and delta files, this routine populates the
// BlobFilePointerRef::cipherKeysMeta field of every file it returns. This avoids having to define the method as an
// ACTOR, since fetching the desired encryption key may involve reaching out to the EncryptKeyProxy or an external KMS.
// Outputs: chunk.snapshotVersion is always written (invalidVersion when no snapshot is selected), chunk.snapshotFile
// is written only when a snapshot is selected, selected delta files are appended to chunk.deltaFiles (copied into
// replyArena), and deltaBytesCounter is increased by the length of every delta file appended.
void GranuleFiles::getFiles(Version beginVersion,
                            Version readVersion,
                            bool canCollapse,
                            BlobGranuleChunkRef& chunk,
                            Arena& replyArena,
                            int64_t& deltaBytesCounter) const {
	// Search key for the binary searches below; only its version is ever set.
	// NOTE(review): this relies on BlobFileIndex ordering by version - confirm against its operator<.
	BlobFileIndex probe;
	const auto noSnapshot = snapshotFiles.end();
	const auto noDelta = deltaFiles.end();

	// first delta file whose version is strictly greater than v
	auto firstDeltaAfter = [&](Version v) {
		probe.version = v;
		return std::upper_bound(deltaFiles.begin(), noDelta, probe);
	};
	// copies one delta file into the reply and accounts for its bytes
	auto appendDelta = [&](const BlobFileIndex& file) {
		chunk.deltaFiles.emplace_back_deep(
		    replyArena, file.filename, file.offset, file.length, file.fullFileLength, file.cipherKeysMeta);
		deltaBytesCounter += file.length;
	};

	// With beginVersion == 0, or when collapsing is allowed, locate the latest snapshot <= readVersion.
	auto snap = noSnapshot;
	if (beginVersion == 0 || canCollapse) {
		probe.version = readVersion;
		snap = std::lower_bound(snapshotFiles.begin(), noSnapshot, probe);
		const bool pastReadVersion = (snap == noSnapshot) || (snap->version > readVersion);
		if (pastReadVersion) {
			// lower_bound landed beyond readVersion (or at the end): the previous snapshot is the one wanted
			ASSERT(snap != snapshotFiles.begin());
			--snap;
		}
		ASSERT(snap != noSnapshot);
		ASSERT(snap->version <= readVersion);
	}

	auto delta = noDelta;
	if (beginVersion > 0) {
		// start from the first delta file at or after beginVersion
		probe.version = beginVersion;
		delta = std::lower_bound(deltaFiles.begin(), noDelta, probe);

		if (canCollapse) {
			ASSERT(snap != noSnapshot);
			// Subtract delta file sizes, from beginVersion up to the snapshot version, from the snapshot size.
			// Stop early once the deltas have reached the snapshot's size.
			int64_t remaining = snap->length;
			for (auto scan = delta; scan != noDelta && scan->version <= snap->version && remaining > 0; ++scan) {
				remaining -= scan->length;
			}

			if (remaining <= 0) {
				// the deltas hold at least as many bytes as the snapshot: read the snapshot instead of them
				delta = firstDeltaAfter(snap->version);
				ASSERT(delta == noDelta || delta->version > snap->version);
			} else {
				// the deltas are smaller: don't collapse, drop the snapshot and serve delta files only
				snap = noSnapshot;
			}
		}
	} else {
		// no beginVersion: deltas start right after the chosen snapshot
		delta = firstDeltaAfter(snap->version);
		ASSERT(delta == noDelta || delta->version > snap->version);
	}

	// highest version covered by the files emitted so far
	Version lastIncluded = invalidVersion;
	if (snap == noSnapshot) {
		chunk.snapshotVersion = invalidVersion;
	} else {
		chunk.snapshotVersion = snap->version;
		chunk.snapshotFile = BlobFilePointerRef(
		    replyArena, snap->filename, snap->offset, snap->length, snap->fullFileLength, snap->cipherKeysMeta);
		lastIncluded = chunk.snapshotVersion;
	}

	// every delta file strictly below readVersion
	for (; delta != noDelta && delta->version < readVersion; ++delta) {
		appendDelta(*delta);
		ASSERT(lastIncluded < delta->version);
		lastIncluded = delta->version;
	}

	// include last delta file that passes readVersion, if it exists
	if (delta != noDelta && lastIncluded < readVersion) {
		appendDelta(*delta);
	}
}
// Test helper: builds the file name "test<v>" for version v.
static std::string makeTestFileName(Version v) {
	std::string name("test");
	name += std::to_string(v);
	return name;
}
// Test helper: a BlobFileIndex at version v named by makeTestFileName(v), with offset 0 and with `len` used for
// both the length and the full file length.
static BlobFileIndex makeTestFile(Version v, int64_t len) {
	const int64_t offset = 0;
	return BlobFileIndex(v, makeTestFileName(v), offset, len, len);
}
// Test helper: asserts that actualFile carries the name makeTestFileName generates for expectedVersion.
static void checkFile(int expectedVersion, const BlobFilePointerRef& actualFile) {
	const std::string expectedName = makeTestFileName(expectedVersion);
	const std::string actualName = actualFile.filename.toString();
	ASSERT(expectedName == actualName);
}
static void checkFiles(const GranuleFiles& f,
Version beginVersion,
Version readVersion,
bool canCollapse,
Optional<int> expectedSnapshotVersion,
std::vector<int> expectedDeltaVersions) {
Arena a;
BlobGranuleChunkRef chunk;
int64_t deltaBytes = 0;
f.getFiles(beginVersion, readVersion, canCollapse, chunk, a, deltaBytes);
fmt::print("results({0}, {1}, {2}):\nEXPECTED:\n snapshot={3}\n deltas ({4}):\n",
beginVersion,
readVersion,
canCollapse ? "T" : "F",
expectedSnapshotVersion.present() ? makeTestFileName(expectedSnapshotVersion.get()).c_str() : "<N/A>",
expectedDeltaVersions.size());
for (int d : expectedDeltaVersions) {
fmt::print(" {}\n", makeTestFileName(d));
}
fmt::print("ACTUAL:\n snapshot={0}\n deltas ({1}):\n",
chunk.snapshotFile.present() ? chunk.snapshotFile.get().filename.toString().c_str() : "<N/A>",
chunk.deltaFiles.size());
for (auto& it : chunk.deltaFiles) {
fmt::print(" {}\n", it.filename.toString());
}
printf("\n\n\n");
ASSERT(expectedSnapshotVersion.present() == chunk.snapshotFile.present());
if (expectedSnapshotVersion.present()) {
checkFile(expectedSnapshotVersion.get(), chunk.snapshotFile.get());
}
ASSERT(expectedDeltaVersions.size() == chunk.deltaFiles.size());
for (int i = 0; i < expectedDeltaVersions.size(); i++) {
checkFile(expectedDeltaVersions[i], chunk.deltaFiles[i]);
}
}
/*
* Files:
* S @ 100 (10 bytes)
* D @ 150 (5 bytes)
* D @ 200 (6 bytes)
* S @ 200 (15 bytes)
* D @ 250 (7 bytes)
* D @ 300 (8 bytes)
* S @ 300 (10 bytes)
* D @ 350 (4 bytes)
*/
// Unit test for GranuleFiles::getFiles. The fixture is built up incrementally to the layout listed in the comment
// above, and file selection is checked after each stage through
// checkFiles(files, beginVersion, readVersion, canCollapse, expectedSnapshotVersion, expectedDeltaVersions).
TEST_CASE("/blobgranule/server/common/granulefiles") {
// simple cases first
// single snapshot file, no deltas
GranuleFiles files;
files.snapshotFiles.push_back(makeTestFile(100, 10));
printf("Just snapshot\n");
checkFiles(files, 0, 100, false, 100, {});
checkFiles(files, 0, 200, false, 100, {});
printf("Small test\n");
// add delta files with re-snapshot at end
files.deltaFiles.push_back(makeTestFile(150, 5));
files.deltaFiles.push_back(makeTestFile(200, 6));
files.snapshotFiles.push_back(makeTestFile(200, 15));
// check different read versions with beginVersion=0
// (a read version that falls between two delta files also expects the next delta file, i.e. the first one at or
// past the read version; once the read version reaches 200 the snapshot at 200 is expected with no deltas)
checkFiles(files, 0, 100, false, 100, {});
checkFiles(files, 0, 101, false, 100, { 150 });
checkFiles(files, 0, 149, false, 100, { 150 });
checkFiles(files, 0, 150, false, 100, { 150 });
checkFiles(files, 0, 151, false, 100, { 150, 200 });
checkFiles(files, 0, 199, false, 100, { 150, 200 });
checkFiles(files, 0, 200, false, 200, {});
checkFiles(files, 0, 300, false, 200, {});
// Test all cases of beginVersion + readVersion. Because delta files are smaller than snapshot at 200, this should
// be the same with and without collapse
// (no snapshot is expected in any of these cases, only delta files starting at or after beginVersion)
checkFiles(files, 100, 200, false, Optional<int>(), { 150, 200 });
checkFiles(files, 100, 300, false, Optional<int>(), { 150, 200 });
checkFiles(files, 101, 199, false, Optional<int>(), { 150, 200 });
checkFiles(files, 149, 151, false, Optional<int>(), { 150, 200 });
checkFiles(files, 149, 150, false, Optional<int>(), { 150 });
checkFiles(files, 150, 151, false, Optional<int>(), { 150, 200 });
checkFiles(files, 151, 200, false, Optional<int>(), { 200 });
// same inputs again with canCollapse=true; the expectations are unchanged
checkFiles(files, 100, 200, true, Optional<int>(), { 150, 200 });
checkFiles(files, 100, 300, true, Optional<int>(), { 150, 200 });
checkFiles(files, 101, 199, true, Optional<int>(), { 150, 200 });
checkFiles(files, 149, 151, true, Optional<int>(), { 150, 200 });
checkFiles(files, 149, 150, true, Optional<int>(), { 150 });
checkFiles(files, 150, 151, true, Optional<int>(), { 150, 200 });
checkFiles(files, 151, 200, true, Optional<int>(), { 200 });
printf("Larger test\n");
// add more delta files and snapshots to check collapse logic
files.deltaFiles.push_back(makeTestFile(250, 7));
files.deltaFiles.push_back(makeTestFile(300, 8));
files.snapshotFiles.push_back(makeTestFile(300, 10));
files.deltaFiles.push_back(makeTestFile(350, 4));
// beginVersion=0: the snapshot at 300 is expected, plus the delta at 350 once the read version passes 300
checkFiles(files, 0, 300, false, 300, {});
checkFiles(files, 0, 301, false, 300, { 350 });
checkFiles(files, 0, 400, false, 300, { 350 });
// check delta files without collapse
checkFiles(files, 100, 301, false, Optional<int>(), { 150, 200, 250, 300, 350 });
checkFiles(files, 100, 300, false, Optional<int>(), { 150, 200, 250, 300 });
checkFiles(files, 100, 251, false, Optional<int>(), { 150, 200, 250, 300 });
checkFiles(files, 100, 250, false, Optional<int>(), { 150, 200, 250 });
checkFiles(files, 151, 300, false, Optional<int>(), { 200, 250, 300 });
checkFiles(files, 151, 301, false, Optional<int>(), { 200, 250, 300, 350 });
checkFiles(files, 151, 400, false, Optional<int>(), { 200, 250, 300, 350 });
checkFiles(files, 201, 300, false, Optional<int>(), { 250, 300 });
checkFiles(files, 201, 301, false, Optional<int>(), { 250, 300, 350 });
checkFiles(files, 201, 400, false, Optional<int>(), { 250, 300, 350 });
checkFiles(files, 251, 300, false, Optional<int>(), { 300 });
checkFiles(files, 251, 301, false, Optional<int>(), { 300, 350 });
checkFiles(files, 251, 400, false, Optional<int>(), { 300, 350 });
checkFiles(files, 301, 400, false, Optional<int>(), { 350 });
// beginVersion is past the last delta file (350): nothing is expected
checkFiles(files, 351, 400, false, Optional<int>(), {});
// check with collapse
// these 2 collapse because the delta files at 150+200+250+300 are larger than the snapshot at 300
checkFiles(files, 100, 301, true, 300, { 350 });
checkFiles(files, 100, 300, true, 300, {});
// these 2 don't collapse because 150+200 delta files are smaller than the snapshot at 200
checkFiles(files, 100, 251, true, Optional<int>(), { 150, 200, 250, 300 });
checkFiles(files, 100, 250, true, Optional<int>(), { 150, 200, 250 });
// these 3 do collapse because the delta files at 200+250+300 are larger than the snapshot at 300
checkFiles(files, 151, 300, true, 300, {});
checkFiles(files, 151, 301, true, 300, { 350 });
checkFiles(files, 151, 400, true, 300, { 350 });
// these 3 do collapse because the delta files at 250+300 are larger than the snapshot at 300
checkFiles(files, 201, 300, true, 300, {});
checkFiles(files, 201, 301, true, 300, { 350 });
checkFiles(files, 201, 400, true, 300, { 350 });
// these don't collapse because the delta file at 300 is smaller than the snapshot at 300
checkFiles(files, 251, 300, true, Optional<int>(), { 300 });
checkFiles(files, 251, 301, true, Optional<int>(), { 300, 350 });
checkFiles(files, 251, 400, true, Optional<int>(), { 300, 350 });
// beginVersion is past the snapshot at 300, so there are no delta files at or below it to outweigh the snapshot
checkFiles(files, 301, 400, true, Optional<int>(), { 350 });
checkFiles(files, 351, 400, true, Optional<int>(), {});
return Void();
}
// FIXME: if credentials can expire, refresh periodically
// Requests the latest blob metadata for the given tenants from the encrypt key proxy published in self->dbInfo.
// For every returned domain that is known to self->tenantInfoById, it installs a blob connection provider on the
// matching tenantData entry. The request is re-issued whenever dbInfo changes before a reply arrives.
// Only valid when BG_METADATA_SOURCE is "tenant" and tenantMapEntries is non-empty (both asserted).
ACTOR Future<Void> loadBlobMetadataForTenants(BGTenantMap* self, std::vector<TenantMapEntry> tenantMapEntries) {
	ASSERT(SERVER_KNOBS->BG_METADATA_SOURCE == "tenant");
	ASSERT(!tenantMapEntries.empty());

	// one domain id per tenant entry, in input order
	state std::vector<BlobMetadataDomainId> domainIds;
	for (auto& tenantEntry : tenantMapEntries) {
		domainIds.push_back(tenantEntry.id);
	}

	// FIXME: if one tenant gets an error, don't kill whole process
	// TODO: add latency metrics
	loop {
		Future<EKPGetLatestBlobMetadataReply> pendingReply;
		if (!self->dbInfo.isValid() || !self->dbInfo->get().encryptKeyProxy.present()) {
			// no proxy to ask yet: only a dbInfo change can make progress
			pendingReply = Never();
		} else {
			EKPGetLatestBlobMetadataRequest request;
			request.domainIds = domainIds;
			pendingReply = brokenPromiseToNever(
			    self->dbInfo->get().encryptKeyProxy.get().getLatestBlobMetadata.getReply(request));
		}

		choose {
			when(EKPGetLatestBlobMetadataReply reply = wait(pendingReply)) {
				ASSERT(reply.blobMetadataDetails.size() == domainIds.size());
				// not guaranteed to be in same order in the request as the response
				for (auto& details : reply.blobMetadataDetails) {
					auto known = self->tenantInfoById.find(details.domainId);
					// domains this map does not know about are skipped
					if (known != self->tenantInfoById.end()) {
						auto tenantRange = self->tenantData.rangeContaining(known->second.prefix);
						ASSERT(tenantRange.begin() == known->second.prefix);
						tenantRange.cvalue()->setBStore(BlobConnectionProvider::newBlobConnectionProvider(details));
					}
				}
				return Void();
			}
			// NOTE(review): dbInfo is dereferenced here without the isValid() check used above - confirm that
			// dbInfo is always valid whenever this actor runs.
			when(wait(self->dbInfo->onChange())) {}
		}
	}
}
// list of tenants that may or may not already exist
void BGTenantMap::addTenants(std::vector<std::pair<TenantName, TenantMapEntry>> tenants) {
std::vector<TenantMapEntry> tenantsToLoad;
for (auto entry : tenants) {
if (tenantInfoById.insert({ entry.second.id, entry.second }).second) {
auto r = makeReference<GranuleTenantData>(entry.first, entry.second);
tenantData.insert(KeyRangeRef(entry.second.prefix, entry.second.prefix.withSuffix(normalKeys.end)), r);
if (SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
r->bstoreLoaded.send(Void());
} else {
tenantsToLoad.push_back(entry.second);
}
}
}
if (!tenantsToLoad.empty()) {
addActor.send(loadBlobMetadataForTenants(this, tenantsToLoad));
}
}
// TODO: implement
// Not implemented: ignores tenantIds and always throws not_implemented().
void BGTenantMap::removeTenants(std::vector<int64_t> tenantIds) {
throw not_implemented();
}
// Looks up `id` in tenantInfoById. Returns the stored entry, or an empty Optional if the id is unknown.
Optional<TenantMapEntry> BGTenantMap::getTenantById(int64_t id) {
	auto found = tenantInfoById.find(id);
	if (found != tenantInfoById.end()) {
		return found->second;
	}
	return {};
}
// TODO: handle case where tenant isn't loaded yet
// Returns the value stored in tenantData for the range containing keyRange.begin.
// Asserts that this range covers all of keyRange.
Reference<GranuleTenantData> BGTenantMap::getDataForGranule(const KeyRangeRef& keyRange) {
	auto containing = tenantData.rangeContaining(keyRange.begin);
	// the granule must lie entirely inside the range that was found
	ASSERT(containing.begin() <= keyRange.begin);
	ASSERT(containing.end() >= keyRange.end);
	return containing.cvalue();
}