Full integration with BlobConnectionProvider for blob granules

Josh Slocum 2022-06-01 11:19:24 -05:00
parent ffa4255c65
commit 567b1d35f7
18 changed files with 349 additions and 234 deletions

View File

@ -21,7 +21,7 @@
#include <string>
#include "flow/IRandom.h"
#include "fdbserver/BlobConnectionProvider.h"
#include "fdbclient/BlobConnectionProvider.h"
struct SingleBlobConnectionProvider : BlobConnectionProvider {
public:

View File

@ -22,7 +22,7 @@
#define BLOB_CONNECTION_PROVIDER_H
#include "fdbclient/BackupContainerFileSystem.h"
#include "fdbserver/BlobMetadataUtils.h"
#include "fdbclient/BlobMetadataUtils.h"
struct BlobConnectionProvider : NonCopyable, ReferenceCounted<BlobConnectionProvider> {
// chooses a partition and prepends the necessary prefix to the filename (if necessary) for writing a file, and
@ -38,6 +38,8 @@ struct BlobConnectionProvider : NonCopyable, ReferenceCounted<BlobConnectionProv
static Reference<BlobConnectionProvider> newBlobConnectionProvider(std::string blobUrl);
static Reference<BlobConnectionProvider> newBlobConnectionProvider(Standalone<BlobMetadataDetailsRef> blobMetadata);
// TODO add update impl
};
#endif
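
The contract this header describes is compact enough to sketch standalone. Below is a rough, self-contained approximation under simplified assumptions (PartitionedProvider is hypothetical; std::string stands in for Reference<BackupContainerFileSystem>): createForWrite picks a partition and returns the prefixed name the caller must persist, while getForRead resolves a persisted name back to its partition.

#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct PartitionedProvider {
    std::vector<std::string> partitions; // e.g. "bucketA/", "bucketB/"

    // Writing: choose a partition, prepend its prefix, and return the full
    // name; the caller persists the full name so reads stay resolvable.
    std::pair<std::string, std::string> createForWrite(const std::string& fileName) {
        const std::string& prefix = partitions[std::rand() % partitions.size()];
        return { prefix, prefix + fileName };
    }

    // Reading: the persisted name already carries its partition prefix, so
    // just find the partition it belongs to.
    std::string getForRead(const std::string& storedName) {
        for (const auto& prefix : partitions) {
            if (storedName.rfind(prefix, 0) == 0) {
                return prefix;
            }
        }
        return ""; // single-location provider: nothing to resolve
    }
};

int main() {
    PartitionedProvider p{ { "bucketA/", "bucketB/" } };
    auto [prefix, fullName] = p.createForWrite("snapshot.1234");
    std::cout << "wrote " << fullName << ", reads resolve to partition "
              << p.getForRead(fullName) << "\n";
}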

View File

@ -38,11 +38,13 @@
// TODO could refactor the file reading code from here and the delta file function into another actor,
// then this part would also be testable? but meh
ACTOR Future<Standalone<StringRef>> readFile(Reference<BackupContainerFileSystem> bstore, BlobFilePointerRef f) {
ACTOR Future<Standalone<StringRef>> readFile(Reference<BlobConnectionProvider> bstoreProvider, BlobFilePointerRef f) {
try {
state Arena arena;
// printf("Starting read of snapshot file %s\n", filename.c_str());
state Reference<IAsyncFile> reader = wait(bstore->readFile(f.filename.toString()));
std::string fname = f.filename.toString();
state Reference<BackupContainerFileSystem> bstore = bstoreProvider->getForRead(fname);
// printf("Starting read of snapshot file %s\n", fname.c_str());
state Reference<IAsyncFile> reader = wait(bstore->readFile(fname));
// printf("Got snapshot file size %lld\n", size);
state uint8_t* data = new (arena) uint8_t[f.length];
// printf("Reading %lld bytes from snapshot file %s\n", size, filename.c_str());
@ -66,7 +68,7 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Reference<BackupContainerFileSystem> bstore,
Reference<BlobConnectionProvider> bstore,
Optional<BlobWorkerStats*> stats) {
// TODO REMOVE with early replying
@ -120,7 +122,7 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
// TODO probably should add things like limit/bytelimit at some point?
ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request,
BlobGranuleFileReply reply,
Reference<BackupContainerFileSystem> bstore,
Reference<BlobConnectionProvider> bstore,
PromiseStream<RangeResult> results) {
// TODO for large amount of chunks, this should probably have some sort of buffer limit like ReplyPromiseStream.
// Maybe just use ReplyPromiseStream instead of PromiseStream?
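
The change above follows one pattern everywhere: resolve the concrete container from the provider using the stored file name, then read exactly the recorded byte range. A rough standalone analogue of that read path, assuming std::ifstream in place of IAsyncFile and a hypothetical FilePointer in place of BlobFilePointerRef:

#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical stand-in for BlobFilePointerRef: a persisted name plus range.
struct FilePointer {
    std::string filename;
    long offset = 0;
    long length = 0;
};

// Mirrors readFile() above: the full name selects the backing store (here,
// the local filesystem), then exactly `length` bytes are read at `offset`.
std::vector<char> readFilePointer(const FilePointer& f) {
    std::ifstream in(f.filename, std::ios::binary);
    if (!in)
        throw std::runtime_error("cannot open " + f.filename);
    in.seekg(f.offset);
    std::vector<char> data(f.length);
    in.read(data.data(), f.length);
    if (in.gcount() != f.length)
        throw std::runtime_error("short read from " + f.filename);
    return data;
}

int main() {
    std::ofstream("example.bin", std::ios::binary) << "hello blob granules";
    FilePointer f{ "example.bin", 6, 4 };
    std::vector<char> bytes = readFilePointer(f); // reads "blob"
}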

View File

@ -29,6 +29,7 @@
#define BLOB_GRANULE_READER_CLIENT_H
#include "fdbclient/BackupContainerFileSystem.h"
#include "fdbclient/BlobConnectionProvider.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/BlobWorkerInterface.h"
@ -42,12 +43,12 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
KeyRangeRef keyRange,
Version beginVersion,
Version readVersion,
Reference<BackupContainerFileSystem> bstore,
Reference<BlobConnectionProvider> bstore,
Optional<BlobWorkerStats*> stats = Optional<BlobWorkerStats*>());
ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request,
BlobGranuleFileReply reply,
Reference<BackupContainerFileSystem> bstore,
Reference<BlobConnectionProvider> bstore,
PromiseStream<RangeResult> results);
#include "flow/unactorcompiler.h"

View File

@ -18,15 +18,18 @@ set(FDBCLIENT_SRCS
BackupContainerLocalDirectory.h
BackupContainerS3BlobStore.actor.cpp
BackupContainerS3BlobStore.h
ClientBooleanParams.cpp
ClientBooleanParams.h
BlobConnectionProvider.h
BlobConnectionProvider.cpp
BlobWorkerInterface.h
BlobGranuleReader.actor.cpp
BlobGranuleReader.actor.h
BlobGranuleCommon.h
BlobGranuleFiles.cpp
BlobGranuleFiles.h
BlobMetadataUtils.h
BlobWorkerCommon.h
ClientBooleanParams.cpp
ClientBooleanParams.h
ClientKnobCollection.cpp
ClientKnobCollection.h
ClientKnobs.cpp

View File

@ -19,12 +19,13 @@
*/
#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/Knobs.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // has to be last include
@ -356,4 +357,88 @@ TEST_CASE("/blobgranule/server/common/granulefiles") {
checkFiles(files, 351, 400, true, Optional<int>(), {});
return Void();
}
// FIXME: if credentials can expire, refresh periodically
ACTOR Future<Void> loadBlobMetadataForTenants(BGTenantMap* self, std::vector<TenantMapEntry> tenantMapEntries) {
ASSERT(SERVER_KNOBS->BG_METADATA_SOURCE == "tenant");
ASSERT(!tenantMapEntries.empty());
state std::vector<BlobMetadataDomainId> domainIds;
for (auto& entry : tenantMapEntries) {
domainIds.push_back(entry.id);
}
// FIXME: if one tenant gets an error, don't kill whole process
// TODO: add latency metrics
loop {
Future<EKPGetLatestBlobMetadataReply> requestFuture;
if (self->dbInfo.isValid() && self->dbInfo->get().encryptKeyProxy.present()) {
EKPGetLatestBlobMetadataRequest req;
req.domainIds = domainIds;
requestFuture =
brokenPromiseToNever(self->dbInfo->get().encryptKeyProxy.get().getLatestBlobMetadata.getReply(req));
} else {
requestFuture = Never();
}
choose {
when(EKPGetLatestBlobMetadataReply rep = wait(requestFuture)) {
ASSERT(rep.blobMetadataDetails.size() == domainIds.size());
// not guaranteed to be in the same order in the response as in the request
for (auto& metadata : rep.blobMetadataDetails) {
auto info = self->tenantInfoById.find(metadata.domainId);
if (info == self->tenantInfoById.end()) {
continue;
}
auto dataEntry = self->tenantData.rangeContaining(info->second.prefix);
ASSERT(dataEntry.begin() == info->second.prefix);
dataEntry.cvalue()->setBStore(BlobConnectionProvider::newBlobConnectionProvider(metadata));
}
return Void();
}
when(wait(self->dbInfo->onChange())) {}
}
}
}
// list of tenants that may or may not already exist
void BGTenantMap::addTenants(std::vector<std::pair<TenantName, TenantMapEntry>> tenants) {
std::vector<TenantMapEntry> tenantsToLoad;
for (auto entry : tenants) {
if (tenantInfoById.insert({ entry.second.id, entry.second }).second) {
auto r = makeReference<GranuleTenantData>(entry.first, entry.second);
tenantData.insert(KeyRangeRef(entry.second.prefix, entry.second.prefix.withSuffix(normalKeys.end)), r);
if (SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
r->bstoreLoaded.send(Void());
} else {
tenantsToLoad.push_back(entry.second);
}
}
}
if (!tenantsToLoad.empty()) {
addActor.send(loadBlobMetadataForTenants(this, tenantsToLoad));
}
}
// TODO: implement
void BGTenantMap::removeTenants(std::vector<int64_t> tenantIds) {
throw not_implemented();
}
Optional<TenantMapEntry> BGTenantMap::getTenantById(int64_t id) {
auto tenant = tenantInfoById.find(id);
if (tenant == tenantInfoById.end()) {
return {};
} else {
return tenant->second;
}
}
// TODO: handle case where tenant isn't loaded yet
Reference<GranuleTenantData> BGTenantMap::getDataForGranule(const KeyRangeRef& keyRange) {
auto tenant = tenantData.rangeContaining(keyRange.begin);
ASSERT(tenant.begin() <= keyRange.begin);
ASSERT(tenant.end() >= keyRange.end);
return tenant.cvalue();
}
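
addTenants above is idempotent: a tenant is scheduled for metadata loading only the first time its id is seen, and the new ids are batched into one load. A condensed standalone sketch of that insert-if-new batching, with simplified types (TenantMapSketch is hypothetical; loadMetadata stands in for the EKPGetLatestBlobMetadataRequest round trip):

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct TenantEntry {
    int64_t id;
    std::string prefix;
};

struct TenantMapSketch {
    std::unordered_map<int64_t, TenantEntry> byId;

    void addTenants(const std::vector<TenantEntry>& tenants, bool metadataFromTenant) {
        std::vector<int64_t> toLoad;
        for (const auto& t : tenants) {
            // insert() succeeds only for unseen ids, so repeated range reads
            // of the tenant map never schedule a tenant twice
            if (byId.insert({ t.id, t }).second && metadataFromTenant)
                toLoad.push_back(t.id);
        }
        if (!toLoad.empty())
            loadMetadata(toLoad); // one batched request per group of new tenants
    }

    void loadMetadata(const std::vector<int64_t>& ids) {
        for (int64_t id : ids)
            std::cout << "loading blob metadata for domain " << id << "\n";
    }
};

int main() {
    TenantMapSketch m;
    m.addTenants({ { 1, "t1/" }, { 2, "t2/" } }, true);
    m.addTenants({ { 1, "t1/" }, { 3, "t3/" } }, true); // only tenant 3 loads
}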

View File

@ -29,7 +29,10 @@
#include "flow/flow.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/BlobConnectionProvider.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/Tenant.h"
#include "fdbserver/ServerDBInfo.h"
#include "flow/actorcompiler.h" // has to be last include
struct GranuleHistory {
@ -79,4 +82,45 @@ ACTOR Future<Optional<GranuleHistory>> getLatestGranuleHistory(Transaction* tr,
ACTOR Future<Void> readGranuleFiles(Transaction* tr, Key* startKey, Key endKey, GranuleFiles* files, UID granuleID);
ACTOR Future<GranuleFiles> loadHistoryFiles(Database cx, UID granuleID);
// TODO: versioned like SS has?
struct GranuleTenantData : NonCopyable, ReferenceCounted<GranuleTenantData> {
TenantName name;
TenantMapEntry entry;
Reference<BlobConnectionProvider> bstore;
Promise<Void> bstoreLoaded;
GranuleTenantData() {}
GranuleTenantData(TenantName name, TenantMapEntry entry) : name(name), entry(entry) {}
void setBStore(Reference<BlobConnectionProvider> bs) {
ASSERT(bstoreLoaded.canBeSet());
bstore = bs;
bstoreLoaded.send(Void());
}
};
// TODO: add refreshing
struct BGTenantMap {
public:
void addTenants(std::vector<std::pair<TenantName, TenantMapEntry>>);
void removeTenants(std::vector<int64_t> tenantIds);
Optional<TenantMapEntry> getTenantById(int64_t id);
Reference<GranuleTenantData> getDataForGranule(const KeyRangeRef& keyRange);
KeyRangeMap<Reference<GranuleTenantData>> tenantData;
std::unordered_map<int64_t, TenantMapEntry> tenantInfoById;
Reference<AsyncVar<ServerDBInfo> const> dbInfo;
PromiseStream<Future<Void>> addActor;
BGTenantMap() {}
explicit BGTenantMap(const Reference<AsyncVar<ServerDBInfo> const> dbInfo) : dbInfo(dbInfo) {
collection = actorCollection(addActor.getFuture());
}
private:
Future<Void> collection;
};
#endif
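
GranuleTenantData gates readers on bstoreLoaded so nothing touches an unresolved connection provider: setBStore fills the reference and fires the promise exactly once, and every waiter wakes. A sketch of that set-once handshake using std::promise and std::shared_future (TenantDataSketch is hypothetical; a string stands in for the provider reference):

#include <future>
#include <iostream>
#include <memory>
#include <string>
#include <thread>

struct TenantDataSketch {
    std::shared_ptr<std::string> bstore; // stand-in for Reference<BlobConnectionProvider>
    std::promise<void> loaded;
    std::shared_future<void> loadedFuture = loaded.get_future().share();

    // The real code asserts bstoreLoaded.canBeSet(): the promise must be
    // fired exactly once, after the reference is in place.
    void setBStore(std::shared_ptr<std::string> bs) {
        bstore = std::move(bs);
        loaded.set_value(); // wakes every waiter
    }
};

int main() {
    TenantDataSketch data;
    std::thread reader([&] {
        data.loadedFuture.wait(); // readers block until metadata arrives
        std::cout << "reader sees " << *data.bstore << "\n";
    });
    data.setBStore(std::make_shared<std::string>("file://fdbblob/1/"));
    reader.join();
}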

View File

@ -61,7 +61,7 @@ ACTOR Future<std::pair<RangeResult, Version>> readFromFDB(Database cx, KeyRange
// FIXME: typedef this pair type and/or chunk list
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob(
Database cx,
Reference<BackupContainerFileSystem> bstore,
Reference<BlobConnectionProvider> bstore,
KeyRange range,
Version beginVersion,
Version readVersion,

View File

@ -37,7 +37,7 @@
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob(
Database cx,
Reference<BackupContainerFileSystem> bstore,
Reference<BlobConnectionProvider> bstore,
KeyRange range,
Version beginVersion,
Version readVersion,

View File

@ -49,7 +49,7 @@
* The Blob Manager is responsible for managing range granules, and recruiting and monitoring Blob Workers.
*/
#define BM_DEBUG false
#define BM_DEBUG true
void handleClientBlobRange(KeyRangeMap<bool>* knownBlobRanges,
Arena& ar,
@ -255,7 +255,7 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
BlobManagerStats stats;
Reference<BackupContainerFileSystem> bstore;
Reference<BlobConnectionProvider> bstore;
std::unordered_map<UID, BlobWorkerInterface> workersById;
std::unordered_map<UID, BlobWorkerInfo> workerStats; // mapping between workerID -> workerStats
@ -265,6 +265,7 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
KeyRangeActorMap assignsInProgress;
KeyRangeMap<SplitEvaluation> splitEvaluations;
KeyRangeMap<bool> knownBlobRanges;
BGTenantMap tenantData;
AsyncTrigger startRecruiting;
Debouncer restartRecruiting;
@ -282,18 +283,18 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
// assigned sequence numbers
PromiseStream<RangeAssignment> rangesToAssign;
BlobManagerData(UID id, Database db, Optional<Key> dcId)
BlobManagerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db, Optional<Key> dcId)
: id(id), db(db), dcId(dcId), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &workersById),
knownBlobRanges(false, normalKeys.end), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY),
recruitingStream(0) {}
knownBlobRanges(false, normalKeys.end), tenantData(BGTenantMap(dbInfo)),
restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), recruitingStream(0) {}
// only initialize blob store if actually needed
void initBStore() {
if (!bstore.isValid()) {
if (!bstore.isValid() && SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
if (BM_DEBUG) {
fmt::print("BM {} constructing backup container from {}\n", epoch, SERVER_KNOBS->BG_URL.c_str());
}
bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {});
bstore = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
if (BM_DEBUG) {
fmt::print("BM {} constructed backup container\n", epoch);
}
@ -840,11 +841,15 @@ ACTOR Future<Void> monitorClientRanges(Reference<BlobManagerData> bmData) {
throw internal_error();
}
std::vector<std::pair<TenantName, TenantMapEntry>> tenants;
std::vector<Key> prefixes;
for (auto& it : tenantResults) {
StringRef tenantName = it.key.removePrefix(tenantMapPrefix);
TenantMapEntry entry = decodeTenantEntry(it.value);
tenants.push_back(std::pair(tenantName, entry));
prefixes.push_back(entry.prefix);
}
bmData->tenantData.addTenants(tenants);
// make this look like knownBlobRanges
std::sort(prefixes.begin(), prefixes.end());
@ -2279,6 +2284,30 @@ ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID gra
return Void();
}
static Future<Void> deleteFile(Reference<BlobConnectionProvider> bstoreProvider, std::string filePath) {
Reference<BackupContainerFileSystem> bstore = bstoreProvider->getForRead(filePath);
return bstore->deleteFile(filePath);
}
// since all BM operations for a tenant are driven by discovering the tenant from the tenant map, it will always know
// about this tenant
ACTOR Future<Reference<BlobConnectionProvider>> getBStoreForGranule(Reference<BlobManagerData> self,
KeyRange granuleRange) {
if (self->bstore.isValid()) {
return self->bstore;
}
loop {
state Reference<GranuleTenantData> data = self->tenantData.getDataForGranule(granuleRange);
if (data.isValid()) {
wait(data->bstoreLoaded.getFuture());
return data->bstore;
} else {
// race on startup between loading tenant ranges and bgcc/purging. just wait
wait(delay(0.1));
}
}
}
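
getBStoreForGranule resolves stores lazily: the global store wins when configured, otherwise the loop polls until the tenant map covers the granule's range, since on startup bgcc/purging can race ahead of tenant loading. The shape of that loop standalone (assumed names; std::optional stands in for a possibly-invalid Reference, and lookupTenantStore fakes the tenant map):

#include <chrono>
#include <iostream>
#include <optional>
#include <string>
#include <thread>

// Stand-in lookup: empty until the tenant map has been populated.
std::optional<std::string> lookupTenantStore(int attempt) {
    return attempt < 3 ? std::nullopt
                       : std::optional<std::string>("file://fdbblob/7/");
}

std::string getStoreForGranule(const std::optional<std::string>& globalStore) {
    if (globalStore)
        return *globalStore; // non-tenant mode: one store for everything
    for (int attempt = 0;; attempt++) {
        if (auto store = lookupTenantStore(attempt))
            return *store;
        // startup race between tenant loading and bgcc/purging: just wait
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
}

int main() {
    std::cout << getStoreForGranule(std::nullopt) << "\n";
}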
/*
* Deletes all files pertaining to the granule with id granuleId and
* also removes the history entry for this granule from the system keyspace
@ -2286,7 +2315,8 @@ ACTOR Future<Void> canDeleteFullGranule(Reference<BlobManagerData> self, UID gra
ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
UID granuleId,
Key historyKey,
Version purgeVersion) {
Version purgeVersion,
KeyRange granuleRange) {
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: init\n", granuleId.toString());
}
@ -2294,6 +2324,7 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
// if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially
// delete the granule, since we need to keep the last snapshot and deltas for splitting
wait(canDeleteFullGranule(self, granuleId));
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(self, granuleRange));
// get files
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
@ -2303,13 +2334,13 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
for (auto snapshotFile : files.snapshotFiles) {
std::string fname = snapshotFile.filename;
deletions.emplace_back(self->bstore->deleteFile(fname));
deletions.push_back(deleteFile(bstore, fname));
filesToDelete.emplace_back(fname);
}
for (auto deltaFile : files.deltaFiles) {
std::string fname = deltaFile.filename;
deletions.emplace_back(self->bstore->deleteFile(fname));
deletions.push_back(deleteFile(bstore, fname));
filesToDelete.emplace_back(fname);
}
@ -2371,11 +2402,16 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
* file might be deleted. We will need to ensure we don't rely on the granule's startVersion
* (that's persisted as part of the key), but rather use the granule's first snapshot's version when needed
*/
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID granuleId, Version purgeVersion) {
ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
UID granuleId,
Version purgeVersion,
KeyRange granuleRange) {
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: init\n", granuleId.toString());
}
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(self, granuleRange));
// get files
GranuleFiles files = wait(loadHistoryFiles(self->db, granuleId));
@ -2391,7 +2427,7 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
// if we already found the latestSnapshotVersion, this snapshot can be deleted
if (latestSnapshotVersion != invalidVersion) {
std::string fname = files.snapshotFiles[idx].filename;
deletions.emplace_back(self->bstore->deleteFile(fname));
deletions.push_back(deleteFile(bstore, fname));
deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, files.snapshotFiles[idx].version, 'S'));
filesToDelete.emplace_back(fname);
} else if (files.snapshotFiles[idx].version <= purgeVersion) {
@ -2415,7 +2451,7 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self, UID g
// otherwise deltaFile.version <= latestSnapshotVersion so delete it
// == should also be deleted because the last delta file before a snapshot would have the same version
std::string fname = deltaFile.filename;
deletions.emplace_back(self->bstore->deleteFile(fname));
deletions.push_back(deleteFile(bstore, fname));
deletedFileKeys.emplace_back(blobGranuleFileKeyFor(granuleId, deltaFile.version, 'D'));
filesToDelete.emplace_back(fname);
}
@ -2500,8 +2536,8 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
state std::queue<std::tuple<KeyRange, Version, Version>> historyEntryQueue;
// stacks of <granuleId, historyKey> and <granuleId> to track which granules to delete
state std::vector<std::tuple<UID, Key>> toFullyDelete;
state std::vector<UID> toPartiallyDelete;
state std::vector<std::tuple<UID, Key, KeyRange>> toFullyDelete;
state std::vector<std::pair<UID, KeyRange>> toPartiallyDelete;
// track which granules we have already added to traversal
// note: (startKey, startVersion) uniquely identifies a granule
@ -2562,7 +2598,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
while (!historyEntryQueue.empty()) {
// process the node at the front of the queue and remove it
KeyRange currRange;
state KeyRange currRange;
state Version startVersion;
state Version endVersion;
std::tie(currRange, startVersion, endVersion) = historyEntryQueue.front();
@ -2612,12 +2648,12 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
if (BM_DEBUG) {
fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString());
}
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey });
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange });
} else if (startVersion < purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString());
}
toPartiallyDelete.push_back({ currHistoryNode.granuleID });
toPartiallyDelete.push_back({ currHistoryNode.granuleID, currRange });
}
// add all of the node's parents to the queue
@ -2668,12 +2704,13 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
for (i = toFullyDelete.size() - 1; i >= 0; --i) {
state UID granuleId;
Key historyKey;
std::tie(granuleId, historyKey) = toFullyDelete[i];
KeyRange keyRange;
std::tie(granuleId, historyKey, keyRange) = toFullyDelete[i];
// FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_DEBUG) {
fmt::print("About to fully delete granule {0}\n", granuleId.toString());
}
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion));
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, range));
}
if (BM_DEBUG) {
@ -2681,11 +2718,13 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
UID granuleId = toPartiallyDelete[i];
UID granuleId;
KeyRange range;
std::tie(granuleId, range) = toPartiallyDelete[i];
if (BM_DEBUG) {
fmt::print("About to partially delete granule {0}\n", granuleId.toString());
}
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion));
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion, range));
}
wait(waitForAll(partialDeletions));
@ -2906,9 +2945,10 @@ static void blobManagerExclusionSafetyCheck(Reference<BlobManagerData> self,
ACTOR Future<int64_t> bgccCheckGranule(Reference<BlobManagerData> bmData, KeyRange range) {
state std::pair<RangeResult, Version> fdbResult = wait(readFromFDB(bmData->db, range));
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(bmData, range));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blobResult =
wait(readFromBlob(bmData->db, bmData->bstore, range, 0, fdbResult.second));
wait(readFromBlob(bmData->db, bstore, range, 0, fdbResult.second));
if (!compareFDBAndBlob(fdbResult.first, blobResult, range, fdbResult.second, BM_DEBUG)) {
++bmData->stats.ccMismatches;
@ -3008,6 +3048,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
}
state Reference<BlobManagerData> self =
makeReference<BlobManagerData>(deterministicRandom()->randomUniqueID(),
dbInfo,
openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True),
bmInterf.locality.dcId());
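
Worth noting in the purge path: the bookkeeping tuples now carry each granule's key range so the per-tenant store can be resolved at deletion time, and both lists are still walked in reverse push order. A toy illustration of the reshaped containers (simplified aliases, not the real FDB types):

#include <iostream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using UID = std::string;
using Key = std::string;
using KeyRange = std::pair<std::string, std::string>; // [begin, end)

int main() {
    // <granuleId, historyKey, range> and <granuleId, range>, as in purgeRange
    std::vector<std::tuple<UID, Key, KeyRange>> toFullyDelete = {
        { "g1", "h1", { "a", "b" } },
        { "g2", "h2", { "b", "c" } },
    };
    std::vector<std::pair<UID, KeyRange>> toPartiallyDelete = {
        { "g3", { "c", "d" } },
    };

    // iterate in reverse push order, matching purgeRange above
    for (auto i = toFullyDelete.size(); i-- > 0;) {
        auto& [granuleId, historyKey, range] = toFullyDelete[i];
        std::cout << "fully delete " << granuleId << " in [" << range.first
                  << ", " << range.second << ")\n";
    }
    for (auto i = toPartiallyDelete.size(); i-- > 0;) {
        auto& [granuleId, range] = toPartiallyDelete[i];
        std::cout << "partially delete " << granuleId << "\n";
    }
}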

View File

@ -27,19 +27,20 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/BackupContainerFileSystem.h"
#include "fdbclient/BlobConnectionProvider.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/BlobGranuleReader.actor.h"
#include "fdbclient/BlobMetadataUtils.h"
#include "fdbclient/BlobWorkerCommon.h"
#include "fdbclient/BlobWorkerInterface.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Notified.h"
#include "fdbclient/Tenant.h"
#include "fdbserver/BlobMetadataUtils.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobConnectionProvider.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/ServerDBInfo.h"
@ -51,7 +52,7 @@
#include "flow/actorcompiler.h" // has to be last include
#define BW_DEBUG false
#define BW_DEBUG true
#define BW_REQUEST_DEBUG false
/*
@ -158,18 +159,6 @@ struct GranuleHistoryEntry : NonCopyable, ReferenceCounted<GranuleHistoryEntry>
: range(range), granuleID(granuleID), startVersion(startVersion), endVersion(endVersion) {}
};
// TODO: does this need to be versioned like SS has?
struct GranuleTenantData : NonCopyable, ReferenceCounted<GranuleTenantData> {
TenantName name;
TenantMapEntry entry;
// TODO add other useful stuff like per-tenant blob connection, if necessary
Reference<BlobConnectionProvider> conn;
Promise<Void> connLoaded;
GranuleTenantData() {}
GranuleTenantData(TenantName name, TenantMapEntry entry) : name(name), entry(entry) {}
};
struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
UID id;
Database db;
@ -186,10 +175,9 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
// FIXME: refactor out the parts of this that are just for interacting with blob stores from the backup business
// logic
Reference<BackupContainerFileSystem> bstore;
Reference<BlobConnectionProvider> bstore;
KeyRangeMap<GranuleRangeMetadata> granuleMetadata;
KeyRangeMap<Reference<GranuleTenantData>> tenantData;
std::unordered_map<int64_t, TenantMapEntry> tenantInfoById;
BGTenantMap tenantData;
// contains the history of completed granules before the existing ones. Maps to the latest one, and has
// back-pointers to earlier granules
@ -207,8 +195,8 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
int changeFeedStreamReplyBufferSize = SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES / 2;
BlobWorkerData(UID id, Database db)
: id(id), db(db), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL),
BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db)
: id(id), db(db), stats(id, SERVER_KNOBS->WORKER_LOGGING_INTERVAL), tenantData(BGTenantMap(dbInfo)),
initialSnapshotLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM) {}
bool managerEpochOk(int64_t epoch) {
@ -479,6 +467,7 @@ ACTOR Future<std::pair<BlobGranuleSplitState, Version>> getGranuleSplitState(Tra
// in flight. The synchronization happens after the s3 file is written, but before we update the FDB index of what files
// exist. Before updating FDB, we ensure the version is committed and all previous delta files have updated FDB.
ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
KeyRange keyRange,
UID granuleID,
int64_t epoch,
@ -492,9 +481,9 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
// Prefix filename with random chars both to avoid hotspotting on granuleID, and to have unique file names if
// multiple blob workers try to create the exact same file at the same millisecond (which observably happens)
state std::string fname = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() +
"_T" + std::to_string((uint64_t)(1000.0 * now())) + "_V" +
std::to_string(currentDeltaVersion) + ".delta";
std::string fileName = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() + "_T" +
std::to_string((uint64_t)(1000.0 * now())) + "_V" + std::to_string(currentDeltaVersion) +
".delta";
state Value serialized = ObjectWriter::toValue(deltasToWrite, Unversioned());
state size_t serializedSize = serialized.size();
@ -502,7 +491,10 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
// Free up deltasToWrite here to reduce memory
deltasToWrite = Standalone<GranuleDeltas>();
state Reference<IBackupFile> objectFile = wait(bwData->bstore->writeFile(fname));
state Reference<BackupContainerFileSystem> writeBStore;
state std::string fname;
std::tie(writeBStore, fname) = bstore->createForWrite(fileName);
state Reference<IBackupFile> objectFile = wait(writeBStore->writeFile(fname));
++bwData->stats.s3PutReqs;
++bwData->stats.deltaFilesWritten;
@ -577,12 +569,13 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
}
TEST(true); // Granule cleaning up delta file after error
++bwData->stats.s3DeleteReqs;
bwData->addActor.send(bwData->bstore->deleteFile(fname));
bwData->addActor.send(writeBStore->deleteFile(fname));
throw e;
}
}
ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
KeyRange keyRange,
UID granuleID,
int64_t epoch,
@ -592,9 +585,9 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
bool createGranuleHistory) {
// Prefix filename with random chars both to avoid hotspotting on granuleID, and to have unique file names if
// multiple blob workers try to create the exact same file at the same millisecond (which observably happens)
state std::string fname = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() +
"_T" + std::to_string((uint64_t)(1000.0 * now())) + "_V" + std::to_string(version) +
".snapshot";
state std::string fileName = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() +
"_T" + std::to_string((uint64_t)(1000.0 * now())) + "_V" + std::to_string(version) +
".snapshot";
state Standalone<GranuleSnapshot> snapshot;
wait(delay(0, TaskPriority::BlobWorkerUpdateStorage));
@ -644,7 +637,10 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
snapshot = Standalone<GranuleSnapshot>();
// write to blob using multi part upload
state Reference<IBackupFile> objectFile = wait(bwData->bstore->writeFile(fname));
state Reference<BackupContainerFileSystem> writeBStore;
state std::string fname;
std::tie(writeBStore, fname) = bstore->createForWrite(fileName);
state Reference<IBackupFile> objectFile = wait(writeBStore->writeFile(fname));
++bwData->stats.s3PutReqs;
++bwData->stats.snapshotFilesWritten;
@ -698,7 +694,7 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
}
TEST(true); // Granule deleting snapshot file after error
++bwData->stats.s3DeleteReqs;
bwData->addActor.send(bwData->bstore->deleteFile(fname));
bwData->addActor.send(writeBStore->deleteFile(fname));
throw e;
}
@ -719,6 +715,7 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
}
ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
Reference<GranuleMetadata> metadata,
UID granuleID,
Key cfKey,
@ -747,6 +744,7 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
ASSERT(lastReadVersion <= readVersion);
state PromiseStream<RangeResult> rowsStream;
state Future<BlobFileIndex> snapshotWriter = writeSnapshot(bwData,
bstore,
metadata->keyRange,
granuleID,
metadata->originalEpoch,
@ -805,6 +803,7 @@ ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData>
// files might not be the current set of files in metadata, in the case of doing the initial snapshot of a granule that
// was split.
ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
Reference<GranuleMetadata> metadata,
UID granuleID,
GranuleFiles files,
@ -859,6 +858,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
try {
state PromiseStream<RangeResult> rowsStream;
state Future<BlobFileIndex> snapshotWriter = writeSnapshot(bwData,
bstore,
metadata->keyRange,
granuleID,
metadata->originalEpoch,
@ -867,7 +867,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
rowsStream,
false);
RangeResult newGranule =
wait(readBlobGranule(chunk, metadata->keyRange, 0, version, bwData->bstore, &bwData->stats));
wait(readBlobGranule(chunk, metadata->keyRange, 0, version, bstore, &bwData->stats));
bwData->stats.bytesReadFromS3ForCompaction += compactBytesRead;
rowsStream.send(std::move(newGranule));
@ -906,6 +906,7 @@ struct CounterHolder {
};
ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bwData,
Reference<BlobConnectionProvider> bstore,
Reference<GranuleMetadata> metadata,
UID granuleID,
int64_t bytesInNewDeltaFiles,
@ -1012,7 +1013,7 @@ ACTOR Future<BlobFileIndex> checkSplitAndReSnapshot(Reference<BlobWorkerData> bw
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
}
BlobFileIndex reSnapshotIdx =
wait(compactFromBlob(bwData, metadata, granuleID, metadata->files, reSnapshotVersion));
wait(compactFromBlob(bwData, bstore, metadata, granuleID, metadata->files, reSnapshotVersion));
return reSnapshotIdx;
}
@ -1305,7 +1306,9 @@ ACTOR Future<Void> waitVersionCommitted(Reference<BlobWorkerData> bwData,
// TODO: this is getting kind of large. Should try to split out this actor if it continues to grow?
ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
Reference<GranuleMetadata> metadata,
Future<GranuleStartState> assignFuture) {
Future<GranuleStartState> assignFuture,
Future<Reference<BlobConnectionProvider>> bstoreFuture) {
state Reference<BlobConnectionProvider> bstore;
state std::deque<InFlightFile> inFlightFiles;
state std::deque<Future<Void>> inFlightPops;
state Future<Void> oldChangeFeedFuture;
@ -1327,9 +1330,9 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// set resume snapshot so it's not valid until we pause to ask the blob manager for a re-snapshot
metadata->resumeSnapshot.send(Void());
// before starting, make sure worker persists range assignment and acquires the granule lock
GranuleStartState _info = wait(assignFuture);
startState = _info;
// before starting, make sure worker persists range assignment, acquires the granule lock, and has a blob store
wait(store(startState, assignFuture));
wait(store(bstore, bstoreFuture));
wait(delay(0, TaskPriority::BlobWorkerUpdateStorage));
@ -1385,15 +1388,15 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
if (startState.blobFilesToSnapshot.present()) {
startVersion = startState.previousDurableVersion;
Future<BlobFileIndex> inFlightBlobSnapshot = compactFromBlob(
bwData, metadata, startState.granuleID, startState.blobFilesToSnapshot.get(), startVersion);
bwData, bstore, metadata, startState.granuleID, startState.blobFilesToSnapshot.get(), startVersion);
inFlightFiles.push_back(InFlightFile(inFlightBlobSnapshot, startVersion, 0, true));
pendingSnapshots++;
metadata->durableSnapshotVersion.set(startState.blobFilesToSnapshot.get().snapshotFiles.back().version);
} else {
ASSERT(startState.previousDurableVersion == invalidVersion);
BlobFileIndex fromFDB =
wait(dumpInitialSnapshotFromFDB(bwData, metadata, startState.granuleID, cfKey, &inFlightPops));
BlobFileIndex fromFDB = wait(
dumpInitialSnapshotFromFDB(bwData, bstore, metadata, startState.granuleID, cfKey, &inFlightPops));
newSnapshotFile = fromFDB;
ASSERT(startState.changeFeedStartVersion <= fromFDB.version);
startVersion = newSnapshotFile.version;
@ -1764,6 +1767,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
Future<BlobFileIndex> dfFuture =
writeDeltaFile(bwData,
bstore,
metadata->keyRange,
startState.granuleID,
metadata->originalEpoch,
@ -1825,6 +1829,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
int64_t versionsSinceLastSnapshot =
metadata->pendingDeltaVersion - metadata->pendingSnapshotVersion;
Future<BlobFileIndex> inFlightBlobSnapshot = checkSplitAndReSnapshot(bwData,
bstore,
metadata,
startState.granuleID,
metadata->bytesInNewDeltaFiles,
@ -2227,10 +2232,10 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
state Optional<Key> tenantPrefix;
if (req.tenantInfo.name.present()) {
ASSERT(req.tenantInfo.tenantId != TenantInfo::INVALID_TENANT);
auto tenantEntry = bwData->tenantInfoById.find(req.tenantInfo.tenantId);
if (tenantEntry != bwData->tenantInfoById.end()) {
ASSERT(tenantEntry->second.id == req.tenantInfo.tenantId);
tenantPrefix = tenantEntry->second.prefix;
Optional<TenantMapEntry> tenantEntry = bwData->tenantData.getTenantById(req.tenantInfo.tenantId);
if (tenantEntry.present()) {
ASSERT(tenantEntry.get().id == req.tenantInfo.tenantId);
tenantPrefix = tenantEntry.get().prefix;
} else {
TEST(true); // Blob worker unknown tenant
// FIXME - better way. Wait on retry here, or just have better model for tenant metadata?
@ -2744,11 +2749,39 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
}
ACTOR Future<Reference<BlobConnectionProvider>> loadBStoreForTenant(Reference<BlobWorkerData> bwData,
KeyRange keyRange) {
state int retryCount = 0;
loop {
state Reference<GranuleTenantData> data = bwData->tenantData.getDataForGranule(keyRange);
if (data.isValid()) {
wait(data->bstoreLoaded.getFuture());
return data->bstore;
} else {
TEST(true); // bstore for unknown tenant
// Assume it's not loaded yet and just wait a bit. We could use a more sophisticated mechanism, but
// tenant loading will be redone to be versioned anyway
retryCount++;
TraceEvent(retryCount < 10 ? SevDebug : SevWarn, "BlobWorkerUnknownTenantForGranule", bwData->id)
.detail("KeyRange", keyRange);
wait(delay(0.1));
}
}
}
ACTOR Future<Void> start(Reference<BlobWorkerData> bwData, GranuleRangeMetadata* meta, AssignBlobRangeRequest req) {
ASSERT(meta->activeMetadata.isValid());
Future<Reference<BlobConnectionProvider>> loadBStore;
if (SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
loadBStore = Future<Reference<BlobConnectionProvider>>(bwData->bstore); // done
} else {
loadBStore = loadBStoreForTenant(bwData, req.keyRange);
}
meta->activeMetadata->originalReq = req;
meta->assignFuture = openGranule(bwData, req);
meta->fileUpdaterFuture = blobGranuleUpdateFiles(bwData, meta->activeMetadata, meta->assignFuture);
meta->fileUpdaterFuture = blobGranuleUpdateFiles(bwData, meta->activeMetadata, meta->assignFuture, loadBStore);
meta->historyLoaderFuture = blobGranuleLoadHistory(bwData, meta->activeMetadata, meta->assignFuture);
wait(success(meta->assignFuture));
return Void();
@ -3115,72 +3148,6 @@ ACTOR Future<Void> monitorRemoval(Reference<BlobWorkerData> bwData) {
}
}
// FIXME: if credentials can expire, refresh periodically
ACTOR Future<Void> loadBlobMetadataForTenants(Reference<BlobWorkerData> bwData,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
std::vector<TenantMapEntry> tenantMapEntries) {
ASSERT(SERVER_KNOBS->BG_METADATA_SOURCE == "tenant");
ASSERT(!tenantMapEntries.empty());
state std::vector<BlobMetadataDomainId> domainIds;
for (auto& entry : tenantMapEntries) {
if (BW_DEBUG) {
fmt::print(
"BW {0} loading blob metadata for tenant {1}\n", bwData->id.shortString().substr(0, 5), entry.id);
}
domainIds.push_back(entry.id);
}
// FIXME: if one tenant gets an error, don't kill whole blob worker
// TODO: add latency metrics
loop {
Future<EKPGetLatestBlobMetadataReply> requestFuture;
if (dbInfo.isValid() && dbInfo->get().encryptKeyProxy.present()) {
EKPGetLatestBlobMetadataRequest req;
req.domainIds = domainIds;
requestFuture =
brokenPromiseToNever(dbInfo->get().encryptKeyProxy.get().getLatestBlobMetadata.getReply(req));
} else {
requestFuture = Never();
}
choose {
when(EKPGetLatestBlobMetadataReply rep = wait(requestFuture)) {
ASSERT(rep.blobMetadataDetails.size() == domainIds.size());
// not guaranteed to be in same order in the request as the response
for (auto& metadata : rep.blobMetadataDetails) {
auto info = bwData->tenantInfoById.find(metadata.domainId);
if (info == bwData->tenantInfoById.end()) {
TraceEvent(SevWarn, "BlobWorkerTenantDeletedWhileLoadMetadata", bwData->id)
.detail("TenantId", metadata.domainId);
continue;
}
auto dataEntry = bwData->tenantData.rangeContaining(info->second.prefix);
ASSERT(dataEntry.begin() == info->second.prefix);
dataEntry.cvalue()->conn = BlobConnectionProvider::newBlobConnectionProvider(metadata);
dataEntry.cvalue()->connLoaded.send(Void());
TraceEvent(SevDebug, "BlobWorkerTenantMetadataLoaded", bwData->id)
.detail("TenantId", metadata.domainId);
if (BW_DEBUG) {
fmt::print("BW {0} loaded blob metadata for {1}: {2}",
bwData->id.shortString().substr(0, 5),
metadata.domainId,
metadata.base.present() ? metadata.base.get().toString() : "");
if (metadata.partitions.empty()) {
fmt::print("\n");
} else {
fmt::print(" ({0})\n", metadata.partitions.size());
for (auto& it : metadata.partitions) {
fmt::print(" {0}\n", it.toString());
}
}
}
}
return Void();
}
when(wait(dbInfo->onChange())) {}
}
}
}
// Because change feeds send uncommitted data and explicit rollback messages, we speculatively buffer/write
// uncommitted data. This means we must ensure the data is actually committed before "committing" those writes in
// the blob granule. The simplest way to do this is to have the blob worker do a periodic GRV, which is guaranteed
@ -3213,7 +3180,7 @@ ACTOR Future<Void> runGRVChecks(Reference<BlobWorkerData> bwData) {
// FIXME: better way to do this?
// monitor system keyspace for new tenants
ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData) {
loop {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(bwData->db);
loop {
@ -3233,33 +3200,14 @@ ACTOR Future<Void> monitorTenants(Reference<BlobWorkerData> bwData, Reference<As
throw internal_error();
}
std::vector<TenantMapEntry> tenantsToLoad;
std::vector<std::pair<TenantName, TenantMapEntry>> tenants;
for (auto& it : tenantResults) {
// FIXME: handle removing/moving tenants!
StringRef tenantName = it.key.removePrefix(tenantMapPrefix);
TenantMapEntry entry = decodeTenantEntry(it.value);
if (bwData->tenantInfoById.insert({ entry.id, entry }).second) {
if (BW_DEBUG) {
fmt::print("BW {0} found new tenant {1}: {2} {3}\n",
bwData->id.shortString().substr(0, 5),
tenantName.printable(),
entry.id,
entry.prefix.printable());
}
auto r = makeReference<GranuleTenantData>(tenantName, entry);
bwData->tenantData.insert(KeyRangeRef(entry.prefix, entry.prefix.withSuffix(normalKeys.end)),
r);
if (SERVER_KNOBS->BG_METADATA_SOURCE != "tenant") {
r->connLoaded.send(Void());
} else {
tenantsToLoad.push_back(entry);
}
}
}
if (!tenantsToLoad.empty()) {
bwData->addActor.send(loadBlobMetadataForTenants(bwData, dbInfo, tenantsToLoad));
tenants.push_back(std::pair(tenantName, entry));
}
bwData->tenantData.addTenants(tenants);
state Future<Void> watchChange = tr->watch(tenantLastIdKey);
wait(tr->commit());
@ -3298,8 +3246,8 @@ static void handleGetGranuleAssignmentsRequest(Reference<BlobWorkerData> self,
ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
ReplyPromise<InitializeBlobWorkerReply> recruitReply,
Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
state Reference<BlobWorkerData> self(
new BlobWorkerData(bwInterf.id(), openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)));
state Reference<BlobWorkerData> self(new BlobWorkerData(
bwInterf.id(), dbInfo, openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, LockAware::True)));
self->id = bwInterf.id();
self->locality = bwInterf.locality;
@ -3310,19 +3258,21 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
}
try {
if (BW_DEBUG) {
fmt::print("BW constructing backup container from {0}\n", SERVER_KNOBS->BG_URL);
}
self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {});
if (BW_DEBUG) {
printf("BW constructed backup container\n");
if (SERVER_KNOBS->BG_RANGE_SOURCE != "tenant") {
if (BW_DEBUG) {
fmt::print("BW constructing backup container from {0}\n", SERVER_KNOBS->BG_URL);
}
self->bstore = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
if (BW_DEBUG) {
printf("BW constructed backup container\n");
}
}
// register the blob worker to the system keyspace
wait(registerBlobWorker(self, bwInterf));
} catch (Error& e) {
if (BW_DEBUG) {
fmt::print("BW got backup container init error {0}\n", e.name());
fmt::print("BW got init error {0}\n", e.name());
}
// if any errors came up while initializing the blob worker, let the blob manager know
// that recruitment failed
@ -3342,7 +3292,7 @@ ACTOR Future<Void> blobWorker(BlobWorkerInterface bwInterf,
self->addActor.send(waitFailureServer(bwInterf.waitFailure.getFuture()));
self->addActor.send(runGRVChecks(self));
if (SERVER_KNOBS->BG_RANGE_SOURCE == "tenant") {
self->addActor.send(monitorTenants(self, dbInfo));
self->addActor.send(monitorTenants(self));
}
state Future<Void> selfRemoved = monitorRemoval(self);
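
Both writers above (writeDeltaFile and writeSnapshot) now split naming from placement: they build a logical file name, then let bstore->createForWrite() choose the partition and return the store plus the final name. The name itself leads with random characters to avoid hotspotting on granuleID and to stay unique even when two workers write the same logical file in the same millisecond. A standalone sketch of that naming scheme (randomHex is a hypothetical stand-in for deterministicRandom()->randomUniqueID().shortString()):

#include <chrono>
#include <cstdint>
#include <iostream>
#include <random>
#include <sstream>
#include <string>

// Stand-in for deterministicRandom()->randomUniqueID().shortString().
std::string randomHex(int n) {
    static std::mt19937_64 rng{ std::random_device{}() };
    static const char* digits = "0123456789abcdef";
    std::string s;
    for (int i = 0; i < n; i++)
        s += digits[rng() % 16];
    return s;
}

// Random prefix first, then granule id, wall-clock millis, and version:
// spreads writes across the store while keeping names collision-free.
std::string deltaFileName(const std::string& granuleID, int64_t version) {
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                  std::chrono::system_clock::now().time_since_epoch())
                  .count();
    std::ostringstream name;
    name << randomHex(8) << "_" << granuleID << "_T" << ms << "_V" << version << ".delta";
    return name.str();
}

int main() {
    std::cout << deltaFileName("0123456789abcdef", 42) << "\n";
}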

View File

@ -5,15 +5,12 @@ set(FDBSERVER_SRCS
BackupProgress.actor.cpp
BackupProgress.actor.h
BackupWorker.actor.cpp
BlobConnectionProvider.h
BlobConnectionProvider.cpp
BlobGranuleServerCommon.actor.cpp
BlobGranuleServerCommon.actor.h
BlobGranuleValidation.actor.cpp
BlobGranuleValidation.actor.h
BlobManager.actor.cpp
BlobManagerInterface.h
BlobMetadataUtils.h
BlobWorker.actor.cpp
ClusterController.actor.cpp
ClusterController.actor.h

View File

@ -26,10 +26,10 @@
#include "flow/network.h"
#pragma once
#include "fdbclient/BlobMetadataUtils.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/Locality.h"
#include "fdbserver/BlobMetadataUtils.h"
struct EncryptKeyProxyInterface {
constexpr static FileIdentifier file_identifier = 1303419;

View File

@ -28,7 +28,7 @@
#include "flow/Trace.h"
#include "flow/flow.h"
#include "flow/network.h"
#include "fdbserver/BlobMetadataUtils.h"
#include "fdbclient/BlobMetadataUtils.h"
struct KmsConnectorInterface {
constexpr static FileIdentifier file_identifier = 2416711;

View File

@ -20,6 +20,7 @@
#include "fdbserver/SimKmsConnector.h"
#include "contrib/fmt-8.1.1/include/fmt/format.h"
#include "fdbrpc/sim_validation.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
@ -151,17 +152,21 @@ static Standalone<BlobMetadataDetailsRef> createBlobMetadata(BlobMetadataDomainI
// 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
int type = deterministicRandom()->randomInt(0, 3);
int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12);
fmt::print("SimBlobMetadata ({})\n", domainId);
if (type == 0) {
// single storage location
metadata.base = StringRef(metadata.arena(), "file://fdbblob/" + std::to_string(domainId) + "/");
fmt::print(" {}\n", metadata.base.get().printable());
}
if (type == 1) {
// simulate hash prefixing in s3
metadata.base = StringRef(metadata.arena(), "file://fdbblob/");
metadata.base = StringRef(metadata.arena(), "file://fdbblob/"_sr);
fmt::print(" {} ({})\n", metadata.base.get().printable(), partitionCount);
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(metadata.arena(),
deterministicRandom()->randomUniqueID().shortString() + "-" +
std::to_string(domainId) + "/");
fmt::print(" {}\n", metadata.partitions.back().printable());
}
}
if (type == 2) {
@ -169,6 +174,7 @@ static Standalone<BlobMetadataDetailsRef> createBlobMetadata(BlobMetadataDomainI
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(
metadata.arena(), "file://fdbblob" + std::to_string(domainId) + "_" + std::to_string(i) + "/");
fmt::print(" {}\n", metadata.partitions.back().printable());
}
}
return metadata;
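
For reference, the simulated metadata above comes in three shapes: a single base location (type 0), a shared base with suffix partitions imitating s3 hash prefixes (type 1), or fully distinct storage-location partitions (type 2). A rough standalone sketch of that generator (a sketch only; the strings approximate, not reproduce, the simulator's output):

#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>

struct MetadataSketch {
    std::string base;                    // empty when storage partitioned
    std::vector<std::string> partitions; // empty when type == 0
};

MetadataSketch makeMetadata(int64_t domainId, std::mt19937& rng) {
    // 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
    int type = std::uniform_int_distribution<int>(0, 2)(rng);
    int count = (type == 0) ? 0 : std::uniform_int_distribution<int>(2, 11)(rng);
    MetadataSketch m;
    if (type == 0)
        m.base = "file://fdbblob/" + std::to_string(domainId) + "/";
    if (type == 1) {
        m.base = "file://fdbblob/";
        for (int i = 0; i < count; i++)
            m.partitions.push_back(std::to_string(rng() % 0xffff) + "-" + std::to_string(domainId) + "/");
    }
    if (type == 2)
        for (int i = 0; i < count; i++)
            m.partitions.push_back("file://fdbblob" + std::to_string(domainId) + "_" + std::to_string(i) + "/");
    return m;
}

int main() {
    std::mt19937 rng(0);
    MetadataSketch m = makeMetadata(7, rng);
    std::cout << "base=" << m.base << " partitions=" << m.partitions.size() << "\n";
}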

View File

@ -30,6 +30,7 @@
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobGranuleValidation.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/TesterInterface.actor.h"
@ -66,6 +67,7 @@ struct ThreadData : ReferenceCounted<ThreadData>, NonCopyable {
KeyRange directoryRange;
TenantName tenantName;
TenantMapEntry tenant;
Reference<BlobConnectionProvider> bstore;
// key + value gen data
// in vector for efficient random selection
@ -150,7 +152,6 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
std::vector<Reference<ThreadData>> directories;
std::vector<Future<Void>> clients;
DatabaseConfiguration config;
Reference<BackupContainerFileSystem> bstore;
BlobGranuleCorrectnessWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
doSetup = !clientId; // only do this on the "first" client
@ -245,6 +246,8 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
}
state int directoryIdx = 0;
state std::vector<std::pair<TenantName, TenantMapEntry>> tenants;
state BGTenantMap tenantData(self->dbInfo);
for (; directoryIdx < self->directories.size(); directoryIdx++) {
// Set up the blob range first
TenantMapEntry tenantEntry = wait(self->setUpTenant(cx, self->directories[directoryIdx]->tenantName));
@ -252,32 +255,17 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
self->directories[directoryIdx]->tenant = tenantEntry;
self->directories[directoryIdx]->directoryRange =
KeyRangeRef(tenantEntry.prefix, tenantEntry.prefix.withSuffix(normalKeys.end));
tenants.push_back({ self->directories[directoryIdx]->tenantName, tenantEntry });
}
tenantData.addTenants(tenants);
if (BGW_DEBUG) {
printf("Initializing Blob Granule Correctness s3 stuff\n");
}
try {
if (g_network->isSimulated()) {
if (BGW_DEBUG) {
printf("Blob Granule Correctness constructing simulated backup container\n");
}
self->bstore = BackupContainerFileSystem::openContainerFS("file://fdbblob/", {}, {});
} else {
if (BGW_DEBUG) {
printf("Blob Granule Correctness constructing backup container from %s\n",
SERVER_KNOBS->BG_URL.c_str());
}
self->bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {});
if (BGW_DEBUG) {
printf("Blob Granule Correctness constructed backup container\n");
}
}
} catch (Error& e) {
if (BGW_DEBUG) {
printf("Blob Granule Correctness got backup container init error %s\n", e.name());
}
throw e;
// wait for tenant data to be loaded
for (directoryIdx = 0; directoryIdx < self->directories.size(); directoryIdx++) {
state Reference<GranuleTenantData> data =
tenantData.getDataForGranule(self->directories[directoryIdx]->directoryRange);
wait(data->bstoreLoaded.getFuture());
self->directories[directoryIdx]->bstore = data->bstore;
}
return Void();
@ -307,8 +295,13 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
try {
Version rv = wait(self->doGrv(&tr));
state Version readVersion = rv;
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(readFromBlob(
cx, self->bstore, normalKeys /* tenant handles range */, 0, readVersion, threadData->tenantName));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
wait(readFromBlob(cx,
threadData->bstore,
normalKeys /* tenant handles range */,
0,
readVersion,
threadData->tenantName));
fmt::print("Directory {0} got {1} RV {2}\n",
threadData->directoryID,
doSetup ? "initial" : "final",
@ -681,8 +674,8 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
beginVersion = threadData->writeVersions[beginVersionIdx];
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
wait(readFromBlob(cx, self->bstore, range, beginVersion, readVersion, threadData->tenantName));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(
readFromBlob(cx, threadData->bstore, range, beginVersion, readVersion, threadData->tenantName));
self->validateResult(threadData, blob, startKey, endKey, beginVersion, readVersion);
int resultBytes = blob.first.expectedSize();
@ -883,7 +876,7 @@ struct BlobGranuleCorrectnessWorkload : TestWorkload {
fmt::print("Directory {0} doing final data check @ {1}\n", threadData->directoryID, readVersion);
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(readFromBlob(
cx, self->bstore, normalKeys /*tenant handles range*/, 0, readVersion, threadData->tenantName));
cx, threadData->bstore, normalKeys /*tenant handles range*/, 0, readVersion, threadData->tenantName));
result = self->validateResult(threadData, blob, 0, std::numeric_limits<uint32_t>::max(), 0, readVersion);
finalRowsValidated = blob.first.size();

View File

@ -68,7 +68,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
DatabaseConfiguration config;
Reference<BackupContainerFileSystem> bstore;
Reference<BlobConnectionProvider> bstore;
AsyncVar<Standalone<VectorRef<KeyRangeRef>>> granuleRanges;
BlobGranuleVerifierWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
@ -87,21 +87,12 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
printf("Initializing Blob Granule Verifier s3 stuff\n");
}
try {
if (g_network->isSimulated()) {
if (BGV_DEBUG) {
printf("Blob Granule Verifier constructing simulated backup container\n");
}
bstore = BackupContainerFileSystem::openContainerFS("file://fdbblob/", {}, {});
} else {
if (BGV_DEBUG) {
printf("Blob Granule Verifier constructing backup container from %s\n",
SERVER_KNOBS->BG_URL.c_str());
}
bstore = BackupContainerFileSystem::openContainerFS(SERVER_KNOBS->BG_URL, {}, {});
if (BGV_DEBUG) {
printf("Blob Granule Verifier constructed backup container\n");
}
if (BGV_DEBUG) {
printf("Blob Granule Verifier constructing backup container from %s\n", SERVER_KNOBS->BG_URL.c_str());
}
bstore = BlobConnectionProvider::newBlobConnectionProvider(SERVER_KNOBS->BG_URL);
if (BGV_DEBUG) {
printf("Blob Granule Verifier constructed backup container\n");
}
} catch (Error& e) {
if (BGV_DEBUG) {