From f5a44967f267334159d01c0d1a7d5bd33ffa5ece Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 23 Sep 2022 16:15:04 -0700 Subject: [PATCH 01/57] Add tenant group support to the tenant management workload and test configure and rename tenant operations. Fix a few error types picked up by the test. --- .../MetaclusterManagementWorkload.actor.cpp | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 4fc3b4e4d1..089aa3e621 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -446,6 +446,9 @@ struct MetaclusterManagementWorkload : TestWorkload { tenantMapEntry.tenantName = tenant; tenantMapEntry.tenantGroup = tenantGroup; + state TenantMapEntry tenantMapEntry; + tenantMapEntry.tenantGroup = tenantGroup; + try { loop { try { @@ -811,6 +814,189 @@ struct MetaclusterManagementWorkload : TestWorkload { return Void(); } + ACTOR static Future configureTenant(MetaclusterManagementWorkload* self) { + state TenantName tenant = self->chooseTenantName(); + state Optional newTenantGroup = self->chooseTenantGroup(); + + auto itr = self->createdTenants.find(tenant); + state bool exists = itr != self->createdTenants.end(); + state bool tenantGroupExists = + newTenantGroup.present() && self->tenantGroups.find(newTenantGroup.get()) != self->tenantGroups.end(); + + state bool hasCapacity = false; + if (exists) { + auto& dataDb = self->dataDbs[itr->second.cluster]; + hasCapacity = dataDb.ungroupedTenants.size() + dataDb.tenantGroups.size() < dataDb.tenantGroupCapacity; + } + + state std::map, Optional> configurationParameters = { { "tenant_group"_sr, + newTenantGroup } }; + + try { + loop { + Future configureFuture = + MetaclusterAPI::configureTenant(self->managementDb, tenant, configurationParameters); + Optional result = 
wait(timeout(configureFuture, deterministicRandom()->randomInt(1, 30))); + + if (result.present()) { + break; + } + } + + ASSERT(exists); + auto tenantData = self->createdTenants.find(tenant); + ASSERT(tenantData != self->createdTenants.end()); + + auto& dataDb = self->dataDbs[tenantData->second.cluster]; + ASSERT(dataDb.registered); + + bool allocationRemoved = false; + bool allocationAdded = false; + if (tenantData->second.tenantGroup != newTenantGroup) { + if (tenantData->second.tenantGroup.present()) { + auto& tenantGroupData = self->tenantGroups[tenantData->second.tenantGroup.get()]; + tenantGroupData.tenants.erase(tenant); + if (tenantGroupData.tenants.empty()) { + allocationRemoved = true; + self->tenantGroups.erase(tenantData->second.tenantGroup.get()); + dataDb.tenantGroups.erase(tenantData->second.tenantGroup.get()); + } + } else { + allocationRemoved = true; + self->ungroupedTenants.erase(tenant); + dataDb.ungroupedTenants.erase(tenant); + } + + if (newTenantGroup.present()) { + auto [tenantGroupData, inserted] = self->tenantGroups.try_emplace( + newTenantGroup.get(), TenantGroupData(tenantData->second.cluster)); + tenantGroupData->second.tenants.insert(tenant); + if (inserted) { + allocationAdded = true; + dataDb.tenantGroups.insert(newTenantGroup.get()); + } + } else { + allocationAdded = true; + self->ungroupedTenants.insert(tenant); + dataDb.ungroupedTenants.insert(tenant); + } + + tenantData->second.tenantGroup = newTenantGroup; + + if (allocationAdded && !allocationRemoved) { + ASSERT(hasCapacity); + } else if (allocationRemoved && !allocationAdded && + dataDb.ungroupedTenants.size() + dataDb.tenantGroups.size() >= dataDb.tenantGroupCapacity) { + --self->totalTenantGroupCapacity; + } + } + + return Void(); + } catch (Error& e) { + if (e.code() == error_code_tenant_not_found) { + ASSERT(!exists); + return Void(); + } else if (e.code() == error_code_cluster_no_capacity) { + ASSERT(exists && !hasCapacity); + return Void(); + } else if (e.code() == 
error_code_invalid_tenant_configuration) { + ASSERT(exists && tenantGroupExists && + self->createdTenants[tenant].cluster != self->tenantGroups[newTenantGroup.get()].cluster); + return Void(); + } + + TraceEvent(SevError, "ConfigureTenantFailure") + .error(e) + .detail("TenantName", tenant) + .detail("TenantGroup", newTenantGroup); + ASSERT(false); + throw internal_error(); + } + } + + ACTOR static Future renameTenant(MetaclusterManagementWorkload* self) { + state TenantName tenant = self->chooseTenantName(); + state TenantName newTenantName = self->chooseTenantName(); + + auto itr = self->createdTenants.find(tenant); + state bool exists = itr != self->createdTenants.end(); + + itr = self->createdTenants.find(newTenantName); + state bool newTenantExists = itr != self->createdTenants.end(); + + try { + state bool retried = false; + loop { + try { + Future renameFuture = MetaclusterAPI::renameTenant(self->managementDb, tenant, newTenantName); + Optional result = wait(timeout(renameFuture, deterministicRandom()->randomInt(1, 30))); + + if (result.present()) { + break; + } + + retried = true; + } catch (Error& e) { + // If we retry the rename after it had succeeded, we will get an error that we should ignore + if (e.code() == error_code_tenant_not_found && exists && !newTenantExists && retried) { + break; + } + throw e; + } + } + + ASSERT(exists); + ASSERT(!newTenantExists); + + Optional oldEntry = wait(MetaclusterAPI::tryGetTenant(self->managementDb, tenant)); + ASSERT(!oldEntry.present()); + + TenantMapEntry newEntry = wait(MetaclusterAPI::getTenant(self->managementDb, newTenantName)); + + auto tenantData = self->createdTenants.find(tenant); + ASSERT(tenantData != self->createdTenants.end()); + ASSERT(tenantData->second.tenantGroup == newEntry.tenantGroup); + ASSERT(newEntry.assignedCluster.present() && tenantData->second.cluster == newEntry.assignedCluster.get()); + + self->createdTenants[newTenantName] = tenantData->second; + self->createdTenants.erase(tenantData); 
+ + auto& dataDb = self->dataDbs[tenantData->second.cluster]; + ASSERT(dataDb.registered); + + dataDb.tenants.erase(tenant); + dataDb.tenants.insert(newTenantName); + + if (tenantData->second.tenantGroup.present()) { + auto& tenantGroup = self->tenantGroups[tenantData->second.tenantGroup.get()]; + tenantGroup.tenants.erase(tenant); + tenantGroup.tenants.insert(newTenantName); + } else { + dataDb.ungroupedTenants.erase(tenant); + dataDb.ungroupedTenants.insert(newTenantName); + self->ungroupedTenants.erase(tenant); + self->ungroupedTenants.insert(newTenantName); + } + + return Void(); + } catch (Error& e) { + if (e.code() == error_code_tenant_not_found) { + ASSERT(!exists); + return Void(); + } else if (e.code() == error_code_tenant_already_exists) { + ASSERT(newTenantExists); + return Void(); + } + + TraceEvent(SevError, "RenameTenantFailure") + .error(e) + .detail("OldTenantName", tenant) + .detail("NewTenantName", newTenantName); + ASSERT(false); + throw internal_error(); + } + } + Future start(Database const& cx) override { if (clientId == 0) { return _start(cx, this); From 718074a016771f06fbfa1522bfc054b6c3a9e7f5 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 5 Aug 2022 08:58:10 -0700 Subject: [PATCH 02/57] Metacluster support for restoring a data cluster --- fdbcli/MetaclusterCommands.actor.cpp | 84 +++++++++- .../fdbclient/MetaclusterManagement.actor.h | 151 ++++++++++++++++-- 2 files changed, 219 insertions(+), 16 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index cb90e2c9ce..9f22689f99 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -158,6 +158,75 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector return true; } +Optional parseToken(StringRef token, const char* str) { + bool foundEquals; + StringRef param = token.eat("=", &foundEquals); + if (!foundEquals) { + fmt::print(stderr, + "ERROR: invalid configuration string `{}'. 
String must specify a value using `='.\n", + param.toString().c_str()); + return Optional(); + } + + if (!tokencmp(param, str)) { + fmt::print( + stderr, "ERROR: invalid configuration string `{}'. Expected: `{}'.\n", param.toString().c_str(), str); + return Optional(); + } + + return Optional(token.toString()); +} + +// metacluster restore command +ACTOR Future metaclusterRestoreCommand(Reference db, std::vector tokens) { + if (tokens.size() < 4 || tokens.size() > 6) { + fmt::print("Usage: metacluster restore connection_string=\n" + "[|]\n\n"); + fmt::print("Restore a data cluster.\n"); + return false; + } + + state ClusterNameRef clusterName = tokens[3]; + + // connection string + ClusterConnectionString connectionString; + auto optVal = parseToken(tokens[4], "connection_string"); + if (optVal.present()) { + connectionString = ClusterConnectionString(optVal.get()); + } + + AddNewTenants addNewTenants(AddNewTenants::True); + if (tokens.size() > 4) { + optVal = parseToken(tokens[4], "add_new_tenants"); + if (optVal.present()) { + if (optVal.get() == "true") { + addNewTenants = AddNewTenants::True; + } else { + addNewTenants = AddNewTenants::False; + } + } + } + + RemoveMissingTenants removeMissingTenants(RemoveMissingTenants::True); + if (tokens.size() > 5) { + optVal = parseToken(tokens[4], "remove_missing_tenants"); + if (optVal.present()) { + if (optVal.get() == "true") { + removeMissingTenants = RemoveMissingTenants::True; + } else { + removeMissingTenants = RemoveMissingTenants::False; + } + } + } + + DataClusterEntry defaultEntry; + wait(MetaclusterAPI::restoreCluster( + db, clusterName, connectionString, defaultEntry, addNewTenants, removeMissingTenants)); + + fmt::print("The cluster `{}' has been restored\n", printable(clusterName).c_str()); + return true; +} + // metacluster configure command ACTOR Future metaclusterConfigureCommand(Reference db, std::vector tokens) { if (tokens.size() < 4) { @@ -383,6 +452,8 @@ Future metaclusterCommand(Reference db, 
std::vector return metaclusterRegisterCommand(db, tokens); } else if (tokencmp(tokens[1], "remove")) { return metaclusterRemoveCommand(db, tokens); + } else if (tokencmp(tokens[1], "restore")) { + return metaclusterRestoreCommand(db, tokens); } else if (tokencmp(tokens[1], "configure")) { return metaclusterConfigureCommand(db, tokens); } else if (tokencmp(tokens[1], "list")) { @@ -402,9 +473,8 @@ void metaclusterGenerator(const char* text, std::vector& lc, std::vector const& tokens) { if (tokens.size() == 1) { - const char* opts[] = { - "create_experimental", "decommission", "register", "remove", "configure", "list", "get", "status", nullptr - }; + const char* opts[] = { "create_experimental", "decommission", "register", "remove", "restore", + "configure", "list", "get", "status", nullptr }; arrayGenerator(text, line, opts, lc); } else if (tokens.size() > 1 && (tokencmp(tokens[1], "register") || tokencmp(tokens[1], "configure"))) { const char* opts[] = { "max_tenant_groups=", "connection_string=", nullptr }; @@ -418,7 +488,7 @@ void metaclusterGenerator(const char* text, std::vector metaclusterHintGenerator(std::vector const& tokens, bool inArgument) { if (tokens.size() == 1) { - return { "", "[ARGS]" }; + return { "", "[ARGS]" }; } else if (tokencmp(tokens[1], "create_experimental")) { return { "" }; } else if (tokencmp(tokens[1], "decommission")) { @@ -438,6 +508,12 @@ std::vector metaclusterHintGenerator(std::vector const& } else { return {}; } + } else if (tokencmp(tokens[1], "restore") && tokens.size() < 4) { + static std::vector opts = { "", + "connection_string= ", + "", + "" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokencmp(tokens[1], "configure")) { static std::vector opts = { "", "|connection_string=>" diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index c745916f9a..35aa3315b1 100644 --- 
a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -19,6 +19,7 @@ */ #pragma once +#include "fdbclient/CoordinationInterface.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/Tenant.h" #include "flow/IRandom.h" @@ -710,18 +711,6 @@ Future registerCluster(Reference db, return Void(); } -ACTOR template -Future restoreCluster(Reference db, - ClusterName name, - std::string connectionString, - DataClusterEntry entry, - AddNewTenants addNewTenants, - RemoveMissingTenants removeMissingTenants) { - // TODO: add implementation - wait(delay(0.0)); - return Void(); -} - template struct RemoveClusterImpl { MetaclusterOperationContext ctx; @@ -1080,6 +1069,144 @@ Future managementClusterRemoveTenantFromGroup(Transaction tr, return Void(); } +ACTOR template +Future> restoreClusterTransaction(Transaction tr, + ClusterName name, + ClusterConnectionString connectionString, + DataClusterEntry entry, + AddNewTenants addNewTenants, + RemoveMissingTenants removeMissingTenants) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + // pre-checks + wait(TenantAPI::checkTenantMode(tr, ClusterType::METACLUSTER_MANAGEMENT)); + + state Optional clusterEntry = wait(ManagementClusterMetadata::dataClusters.get(tr, name)); + if (!clusterEntry.present()) { + // end if metacluter does not already know about this data cluster. 
+ } + + Reference dataClusterDb = wait(openDatabase(connectionString)); + state Reference dataClusterTr = dataClusterDb->createTransaction(); + + state Optional existingRegistration = + wait(MetaclusterMetadata::metaclusterRegistration.get(tr)); + + // get all tenants in the mgmt cluster + KeyBackedRangeResult> mgmtCluterTenantsFuture = + wait(ManagementClusterMetadata::tenantMetadata.tenantMap.getRange( + tr, ""_sr, "\xff\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS * CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); + std::vector> mgmtClusterExistingTenants = mgmtCluterTenantsFuture.results; + // convert to map + state std::unordered_map mgmtClusterTenantMap; + for (auto t : mgmtClusterExistingTenants) { + mgmtClusterTenantMap.emplace(t.first, t.second); + } + + // get tenants for the specific data cluster in the mgmt cluster + state std::pair clusterTupleRange = + std::make_pair(Tuple::makeTuple(name), Tuple::makeTuple(keyAfter(name))); + state KeyBackedRangeResult tenantEntries = wait(ManagementClusterMetadata::clusterTenantIndex.getRange( + tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); + state std::vector mgmtTenants = tenantEntries.results; + // convert to set + state std::unordered_set mgmtTenantSet; + for (auto t : mgmtTenants) { + ASSERT(t.getString(0) == name); + mgmtTenantSet.insert(t.getString(1)); + } + + // get tenants from the restoring data cluster + state std::vector> dataClusterExistingTenants = wait( + TenantAPI::listTenantsTransaction(dataClusterTr, ""_sr, "\xff\xff"_sr, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); + // convert to map + state std::unordered_map dataClusterTenantMap; + for (auto t : dataClusterExistingTenants) { + dataClusterTenantMap.emplace(t.first, t.second); + } + + for (std::pair t : dataClusterExistingTenants) { + // Check to ensure that the tenant is not assigned to another data cluster + auto itr = mgmtClusterTenantMap.find(t.first); + if ((itr != mgmtClusterTenantMap.end()) && 
itr->second.assignedCluster.present() && + itr->second.assignedCluster.get() != name) { + // error, fail the restore + // set error + return Optional(); + } + + if (mgmtTenantSet.find(t.first) == mgmtTenantSet.end()) { + if (addNewTenants == AddNewTenants::True) { + // Add the tenant to the tenantmap + ManagementClusterMetadata::tenantMetadata.tenantMap.set(tr, t.first, t.second); + + // Add the tenant to the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.insert(tr, Tuple::makeTuple(name, t.first)); + + // Add to the tenant group + ManagementClusterMetadata::tenantMetadata.tenantGroupTenantIndex.insert( + tr, Tuple::makeTuple(t.second.tenantGroup.get(), t.first)); + } else { + // error + } + } + } + + for (Tuple t : mgmtTenants) { + state TenantNameRef tenantName = t.getString(1); + if (dataClusterTenantMap.find(tenantName) == dataClusterTenantMap.end()) { + if (removeMissingTenants == RemoveMissingTenants::True) { + // Erase the tenant from the tenantmap + ManagementClusterMetadata::tenantMetadata.tenantMap.erase(tr, tenantName); + + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase(tr, Tuple::makeTuple(name, tenantName)); + + // Remove from the tenant group + state TenantMapEntry tenantEntry = wait(getTenantTransaction(tr, tenantName)); + state DataClusterMetadata clusterMetadata1 = wait(getClusterTransaction(tr, name)); + wait(managementClusterRemoveTenantFromGroup(tr, tenantName, tenantEntry, &clusterMetadata1)); + } else { + // error + } + } + } + + return Optional(entry); +} + +ACTOR template +Future restoreCluster(Reference db, + ClusterName name, + ClusterConnectionString connectionString, + DataClusterEntry entry, + AddNewTenants addNewTenants, + RemoveMissingTenants removeMissingTenants) { + if (name.startsWith("\xff"_sr)) { + throw invalid_cluster_name(); + } + + state Reference tr = db->createTransaction(); + + loop { + try { + state Optional newCluster = + 
wait(restoreClusterTransaction(tr, name, connectionString, entry, addNewTenants, removeMissingTenants)); + + wait(buggifiedCommit(tr, BUGGIFY)); + + TraceEvent("RestoredDataCluster") + .detail("ClusterName", name) + .detail("ClusterId", newCluster.present() ? newCluster.get().id : UID()) + .detail("Version", tr->getCommittedVersion()); + + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + template struct CreateTenantImpl { MetaclusterOperationContext ctx; From 8f11e77ec402bd40e78939b0cf1a2357476b15ca Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 11 Aug 2022 18:03:10 -0700 Subject: [PATCH 03/57] Switch to the impl design pattern, and handle multi transactions --- .../fdbclient/MetaclusterManagement.actor.h | 368 ++++++++++++------ flow/include/flow/error_definitions.h | 1 + 2 files changed, 260 insertions(+), 109 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 35aa3315b1..e7412bc7fc 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -21,7 +21,7 @@ #pragma once #include "fdbclient/CoordinationInterface.h" #include "fdbclient/FDBOptions.g.h" -#include "fdbclient/Tenant.h" +#include "fdbrpc/TenantName.h" #include "flow/IRandom.h" #include "flow/Platform.h" #include "flow/ThreadHelper.actor.h" @@ -564,6 +564,8 @@ void updateClusterMetadata(Transaction tr, if (updatedEntry.present()) { if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { throw cluster_removed(); + } else if (previousMetadata.entry.clusterState == DataClusterState::RESTORING) { + throw cluster_restored(); } ManagementClusterMetadata::dataClusters().set(tr, name, updatedEntry.get()); updateClusterCapacityIndex(tr, name, previousMetadata.entry, updatedEntry.get()); @@ -1069,111 +1071,277 @@ Future 
managementClusterRemoveTenantFromGroup(Transaction tr, return Void(); } -ACTOR template -Future> restoreClusterTransaction(Transaction tr, - ClusterName name, - ClusterConnectionString connectionString, - DataClusterEntry entry, - AddNewTenants addNewTenants, - RemoveMissingTenants removeMissingTenants) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); +template +struct RestoreCluterImpl { + MetaclusterOperationContext ctx; - // pre-checks - wait(TenantAPI::checkTenantMode(tr, ClusterType::METACLUSTER_MANAGEMENT)); + // Initialization parameters + ClusterName clusterName; + ClusterConnectionString connectionString; + DataClusterEntry clusterEntry; + AddNewTenants addNewTenants; + RemoveMissingTenants removeMissingTenants; - state Optional clusterEntry = wait(ManagementClusterMetadata::dataClusters.get(tr, name)); - if (!clusterEntry.present()) { - // end if metacluter does not already know about this data cluster. - } + TenantName lastTenantName; + ClusterName lastClusterName; - Reference dataClusterDb = wait(openDatabase(connectionString)); - state Reference dataClusterTr = dataClusterDb->createTransaction(); + std::unordered_map dataClusterTenantMap; + std::unordered_map mgmtClusterTenantMap; + std::unordered_set mgmtTenantSet; - state Optional existingRegistration = - wait(MetaclusterMetadata::metaclusterRegistration.get(tr)); + std::vector> addList; + std::vector removeList; - // get all tenants in the mgmt cluster - KeyBackedRangeResult> mgmtCluterTenantsFuture = - wait(ManagementClusterMetadata::tenantMetadata.tenantMap.getRange( - tr, ""_sr, "\xff\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS * CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); - std::vector> mgmtClusterExistingTenants = mgmtCluterTenantsFuture.results; - // convert to map - state std::unordered_map mgmtClusterTenantMap; - for (auto t : mgmtClusterExistingTenants) { - mgmtClusterTenantMap.emplace(t.first, t.second); - } + RestoreCluterImpl(Reference managementDb, + ClusterName clusterName, + 
ClusterConnectionString connectionString, + DataClusterEntry clusterEntry, + AddNewTenants addNewTenants, + RemoveMissingTenants removeMissingTenants) + : ctx(managementDb, clusterName), clusterName(clusterName), connectionString(connectionString), + clusterEntry(clusterEntry), addNewTenants(addNewTenants), removeMissingTenants(removeMissingTenants) {} - // get tenants for the specific data cluster in the mgmt cluster - state std::pair clusterTupleRange = - std::make_pair(Tuple::makeTuple(name), Tuple::makeTuple(keyAfter(name))); - state KeyBackedRangeResult tenantEntries = wait(ManagementClusterMetadata::clusterTenantIndex.getRange( - tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); - state std::vector mgmtTenants = tenantEntries.results; - // convert to set - state std::unordered_set mgmtTenantSet; - for (auto t : mgmtTenants) { - ASSERT(t.getString(0) == name); - mgmtTenantSet.insert(t.getString(1)); - } + ACTOR static Future markClusterRestoring(RestoreCluterImpl* self, Reference tr) { + if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { + DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; + updatedEntry.clusterState = DataClusterState::RESTORING; - // get tenants from the restoring data cluster - state std::vector> dataClusterExistingTenants = wait( - TenantAPI::listTenantsTransaction(dataClusterTr, ""_sr, "\xff\xff"_sr, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); - // convert to map - state std::unordered_map dataClusterTenantMap; - for (auto t : dataClusterExistingTenants) { - dataClusterTenantMap.emplace(t.first, t.second); - } - - for (std::pair t : dataClusterExistingTenants) { - // Check to ensure that the tenant is not assigned to another data cluster - auto itr = mgmtClusterTenantMap.find(t.first); - if ((itr != mgmtClusterTenantMap.end()) && itr->second.assignedCluster.present() && - itr->second.assignedCluster.get() != name) { - // error, 
fail the restore - // set error - return Optional(); + updateClusterMetadata(tr, + self->ctx.clusterName.get(), + self->ctx.dataClusterMetadata.get(), + Optional(self->connectionString), + updatedEntry); } - if (mgmtTenantSet.find(t.first) == mgmtTenantSet.end()) { - if (addNewTenants == AddNewTenants::True) { - // Add the tenant to the tenantmap - ManagementClusterMetadata::tenantMetadata.tenantMap.set(tr, t.first, t.second); + TraceEvent("MarkedDataClusterRestoring") + .detail("Name", self->ctx.clusterName.get()) + .detail("Version", tr->getCommittedVersion()); - // Add the tenant to the cluster -> tenant index - ManagementClusterMetadata::clusterTenantIndex.insert(tr, Tuple::makeTuple(name, t.first)); + return true; + } - // Add to the tenant group - ManagementClusterMetadata::tenantMetadata.tenantGroupTenantIndex.insert( - tr, Tuple::makeTuple(t.second.tenantGroup.get(), t.first)); - } else { - // error + ACTOR static Future markClusterReady(RestoreCluterImpl* self, Reference tr) { + if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { + DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; + updatedEntry.clusterState = DataClusterState::READY; + + updateClusterMetadata(tr, + self->ctx.clusterName.get(), + self->ctx.dataClusterMetadata.get(), + Optional(self->connectionString), + updatedEntry); + } + + TraceEvent("MarkedDataClusterReady") + .detail("Name", self->ctx.clusterName.get()) + .detail("Version", tr->getCommittedVersion()); + + return Void(); + } + + ACTOR static Future getTenantsFromDataCluster(RestoreCluterImpl* self, Reference tr) { + TenantNameRef begin = self->lastTenantName; + TenantNameRef end = "\xff\xff"_sr; + state Future>> tenantsFuture = + TenantMetadata::tenantMap().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + state KeyBackedRangeResult> tenants = wait(tenantsFuture); + + if (!tenants.results.empty()) { + for (auto t : tenants.results) { + 
self->dataClusterTenantMap.emplace(t.first, t.second); + self->lastTenantName = t.first; } } + + return !tenants.more; } - for (Tuple t : mgmtTenants) { - state TenantNameRef tenantName = t.getString(1); - if (dataClusterTenantMap.find(tenantName) == dataClusterTenantMap.end()) { - if (removeMissingTenants == RemoveMissingTenants::True) { - // Erase the tenant from the tenantmap - ManagementClusterMetadata::tenantMetadata.tenantMap.erase(tr, tenantName); + ACTOR static Future getAllTenantsFromDataCluster(RestoreCluterImpl* self) { + loop { + bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - // Remove the tenant from the cluster -> tenant index - ManagementClusterMetadata::clusterTenantIndex.erase(tr, Tuple::makeTuple(name, tenantName)); - - // Remove from the tenant group - state TenantMapEntry tenantEntry = wait(getTenantTransaction(tr, tenantName)); - state DataClusterMetadata clusterMetadata1 = wait(getClusterTransaction(tr, name)); - wait(managementClusterRemoveTenantFromGroup(tr, tenantName, tenantEntry, &clusterMetadata1)); - } else { - // error + if (gotAllTenants) { + break; } } + + return Void(); } - return Optional(entry); -} + ACTOR static Future getTenantsFromMgmtCluster(RestoreCluterImpl* self, Reference tr) { + TenantNameRef begin = self->lastTenantName; + TenantNameRef end = "\xff\xff"_sr; + state Future>> tenantsFuture = + ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( + tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + state KeyBackedRangeResult> tenants = wait(tenantsFuture); + + if (!tenants.results.empty()) { + for (auto t : tenants.results) { + self->mgmtClusterTenantMap.emplace(t.first, t.second); + self->lastTenantName = t.first; + } + } + + return !tenants.more; + } + + ACTOR static Future getTenantsFromMgmtClusterForCurrentDataCluster(RestoreCluterImpl* self, + Reference tr) { + std::pair clusterTupleRange = + 
std::make_pair(Tuple::makeTuple(self->lastClusterName), Tuple::makeTuple(keyAfter(self->clusterName))); + state Future> tenantEntriesFuture = + ManagementClusterMetadata::clusterTenantIndex.getRange( + tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + state KeyBackedRangeResult tenantEntries = wait(tenantEntriesFuture); + + if (!tenantEntries.results.empty()) { + for (auto t : tenantEntries.results) { + ASSERT(t.getString(0) == self->clusterName); + self->mgmtTenantSet.insert(t.getString(1)); + // fix this ... not correct + self->lastClusterName = t.getString(0); + } + } + + return !tenantEntries.more; + } + + ACTOR static Future getAllTenantsFromMgmtCluster(RestoreCluterImpl* self) { + // get all tenants across all data clusters + self->lastClusterName = self->clusterName; + loop { + bool gotAllTenants = + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return getTenantsFromMgmtCluster(self, tr); + })); + + if (gotAllTenants) { + break; + } + } + + // get tenants for the specific data cluster + loop { + bool gotAllTenants = + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return getTenantsFromMgmtClusterForCurrentDataCluster(self, tr); + })); + + if (gotAllTenants) { + break; + } + } + + return Void(); + } + + ACTOR static Future addTenants(RestoreCluterImpl* self, Reference tr) { + for (auto t : self->addList) { + // Add the tenant to the tenantmap + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, t.first, t.second); + // Add the tenant to the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.insert(tr, Tuple::makeTuple(self->clusterName, t.first)); + // Add to the tenant group + ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.insert( + tr, Tuple::makeTuple(t.second.tenantGroup.get(), t.first)); + } + + return Void(); + } + + ACTOR static Future removeTenants(RestoreCluterImpl* self, Reference tr) { + for (auto t : 
self->removeList) { + // Erase the tenant from the tenantmap + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, t); + // Remove the tenant from the cluster -> tenant index + ManagementClusterMetadata::clusterTenantIndex.erase(tr, Tuple::makeTuple(self->clusterName, t)); + } + + return Void(); + } + + ACTOR static Future run(RestoreCluterImpl* self) { + state bool clusterIsPresent; + // set state to restoring + try { + wait(store(clusterIsPresent, + self->ctx.runManagementTransaction([self = self](Reference tr) { + return markClusterRestoring(self, tr); + }))); + } catch (Error& e) { + // If the transaction retries after success or if we are trying a second time to restore the cluster, it + // will throw an error indicating that the restore has already started + if (e.code() == error_code_cluster_restored) { + clusterIsPresent = true; + } else { + throw; + } + } + + if (clusterIsPresent) { + // get all the tenant information from the new data cluster + wait(getAllTenantsFromDataCluster(self)); + // get all the tenant information for this data cluster from mgmt cluster + wait(getAllTenantsFromMgmtCluster(self)); + // compare and build the add list and remove list + + for (std::pair t : self->dataClusterTenantMap) { + // Check to ensure that the tenant is not assigned to another data cluster + auto itr = self->mgmtClusterTenantMap.find(t.first); + if ((itr != self->mgmtClusterTenantMap.end()) && itr->second.assignedCluster.present() && + itr->second.assignedCluster.get() != self->clusterName) { + // error, fail the restore + // set error + // throw; + } + + // build add list + if (self->mgmtTenantSet.find(t.first) == self->mgmtTenantSet.end()) { + if (self->addNewTenants == AddNewTenants::True) { + self->addList.push_back(t); + } else { + // error + // throw; + } + } + } + + for (TenantNameRef t : self->mgmtTenantSet) { + if (self->dataClusterTenantMap.find(t) == self->dataClusterTenantMap.end()) { + if (self->removeMissingTenants == 
RemoveMissingTenants::True) { + self->removeList.push_back(t); + } else { + // error + // throw; + } + } + } + + try { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return addTenants(self, tr); })); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return removeTenants(self, tr); })); + } catch (Error& e) { + throw; + } + + // set to ready state + try { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markClusterReady(self, tr); })); + } catch (Error& e) { + throw; + } + } + + return Void(); + } + + Future run() { return run(this); } +}; ACTOR template Future restoreCluster(Reference db, @@ -1182,29 +1350,9 @@ Future restoreCluster(Reference db, DataClusterEntry entry, AddNewTenants addNewTenants, RemoveMissingTenants removeMissingTenants) { - if (name.startsWith("\xff"_sr)) { - throw invalid_cluster_name(); - } - - state Reference tr = db->createTransaction(); - - loop { - try { - state Optional newCluster = - wait(restoreClusterTransaction(tr, name, connectionString, entry, addNewTenants, removeMissingTenants)); - - wait(buggifiedCommit(tr, BUGGIFY)); - - TraceEvent("RestoredDataCluster") - .detail("ClusterName", name) - .detail("ClusterId", newCluster.present() ? newCluster.get().id : UID()) - .detail("Version", tr->getCommittedVersion()); - - return Void(); - } catch (Error& e) { - wait(safeThreadFutureToFuture(tr->onError(e))); - } - } + state RestoreCluterImpl impl(db, name, connectionString, entry, addNewTenants, removeMissingTenants); + wait(impl.run()); + return Void(); } template @@ -1426,6 +1574,8 @@ struct CreateTenantImpl { // then we fail with an error. 
if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { throw cluster_removed(); + } else if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { + throw cluster_restored(); } managementClusterAddTenantToGroup( diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h index d7e1badc98..282d57542f 100755 --- a/flow/include/flow/error_definitions.h +++ b/flow/include/flow/error_definitions.h @@ -268,6 +268,7 @@ ERROR( metacluster_no_capacity, 2166, "Metacluster does not have capacity to cre ERROR( management_cluster_invalid_access, 2167, "Standard transactions cannot be run against the management cluster" ) ERROR( tenant_creation_permanently_failed, 2168, "The tenant creation did not complete in a timely manner and has permanently failed" ) ERROR( cluster_removed, 2169, "The cluster is being removed from the metacluster" ) +ERROR( cluster_restored, 2170, "The cluster is being restored to the metacluster" ) // 2200 - errors from bindings and official APIs ERROR( api_version_unset, 2200, "API version is not set" ) From 02d0fd95ceb0b96a8d2b49cfb83407044f5118bd Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 25 Aug 2022 11:22:50 -0700 Subject: [PATCH 04/57] Rename and fix the typo --- .../fdbclient/MetaclusterManagement.actor.h | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index e7412bc7fc..e258d5e004 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1072,7 +1072,7 @@ Future managementClusterRemoveTenantFromGroup(Transaction tr, } template -struct RestoreCluterImpl { +struct RestoreClusterImpl { MetaclusterOperationContext ctx; // Initialization parameters @@ -1092,16 +1092,16 @@ struct RestoreCluterImpl { 
std::vector> addList; std::vector removeList; - RestoreCluterImpl(Reference managementDb, - ClusterName clusterName, - ClusterConnectionString connectionString, - DataClusterEntry clusterEntry, - AddNewTenants addNewTenants, - RemoveMissingTenants removeMissingTenants) + RestoreClusterImpl(Reference managementDb, + ClusterName clusterName, + ClusterConnectionString connectionString, + DataClusterEntry clusterEntry, + AddNewTenants addNewTenants, + RemoveMissingTenants removeMissingTenants) : ctx(managementDb, clusterName), clusterName(clusterName), connectionString(connectionString), clusterEntry(clusterEntry), addNewTenants(addNewTenants), removeMissingTenants(removeMissingTenants) {} - ACTOR static Future markClusterRestoring(RestoreCluterImpl* self, Reference tr) { + ACTOR static Future markClusterRestoring(RestoreClusterImpl* self, Reference tr) { if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::RESTORING; @@ -1120,7 +1120,7 @@ struct RestoreCluterImpl { return true; } - ACTOR static Future markClusterReady(RestoreCluterImpl* self, Reference tr) { + ACTOR static Future markClusterReady(RestoreClusterImpl* self, Reference tr) { if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::READY; @@ -1139,7 +1139,7 @@ struct RestoreCluterImpl { return Void(); } - ACTOR static Future getTenantsFromDataCluster(RestoreCluterImpl* self, Reference tr) { + ACTOR static Future getTenantsFromDataCluster(RestoreClusterImpl* self, Reference tr) { TenantNameRef begin = self->lastTenantName; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = @@ -1156,7 +1156,7 @@ struct RestoreCluterImpl { return !tenants.more; } - ACTOR static Future 
getAllTenantsFromDataCluster(RestoreCluterImpl* self) { + ACTOR static Future getAllTenantsFromDataCluster(RestoreClusterImpl* self) { loop { bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); @@ -1169,7 +1169,7 @@ struct RestoreCluterImpl { return Void(); } - ACTOR static Future getTenantsFromMgmtCluster(RestoreCluterImpl* self, Reference tr) { + ACTOR static Future getTenantsFromMgmtCluster(RestoreClusterImpl* self, Reference tr) { TenantNameRef begin = self->lastTenantName; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = @@ -1187,7 +1187,7 @@ struct RestoreCluterImpl { return !tenants.more; } - ACTOR static Future getTenantsFromMgmtClusterForCurrentDataCluster(RestoreCluterImpl* self, + ACTOR static Future getTenantsFromMgmtClusterForCurrentDataCluster(RestoreClusterImpl* self, Reference tr) { std::pair clusterTupleRange = std::make_pair(Tuple::makeTuple(self->lastClusterName), Tuple::makeTuple(keyAfter(self->clusterName))); @@ -1208,7 +1208,7 @@ struct RestoreCluterImpl { return !tenantEntries.more; } - ACTOR static Future getAllTenantsFromMgmtCluster(RestoreCluterImpl* self) { + ACTOR static Future getAllTenantsFromMgmtCluster(RestoreClusterImpl* self) { // get all tenants across all data clusters self->lastClusterName = self->clusterName; loop { @@ -1237,7 +1237,7 @@ struct RestoreCluterImpl { return Void(); } - ACTOR static Future addTenants(RestoreCluterImpl* self, Reference tr) { + ACTOR static Future addTenants(RestoreClusterImpl* self, Reference tr) { for (auto t : self->addList) { // Add the tenant to the tenantmap ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, t.first, t.second); @@ -1251,7 +1251,7 @@ struct RestoreCluterImpl { return Void(); } - ACTOR static Future removeTenants(RestoreCluterImpl* self, Reference tr) { + ACTOR static Future removeTenants(RestoreClusterImpl* self, Reference tr) { for (auto t : self->removeList) { 
// Erase the tenant from the tenantmap ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, t); @@ -1262,7 +1262,7 @@ struct RestoreCluterImpl { return Void(); } - ACTOR static Future run(RestoreCluterImpl* self) { + ACTOR static Future run(RestoreClusterImpl* self) { state bool clusterIsPresent; // set state to restoring try { @@ -1350,7 +1350,7 @@ Future restoreCluster(Reference db, DataClusterEntry entry, AddNewTenants addNewTenants, RemoveMissingTenants removeMissingTenants) { - state RestoreCluterImpl impl(db, name, connectionString, entry, addNewTenants, removeMissingTenants); + state RestoreClusterImpl impl(db, name, connectionString, entry, addNewTenants, removeMissingTenants); wait(impl.run()); return Void(); } From aa6da92213aea66a5e0e0959799f3146e05e4e16 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 2 Sep 2022 12:53:17 -0700 Subject: [PATCH 05/57] Fix correctness issues and handle some additional cases --- .../fdbclient/MetaclusterManagement.actor.h | 178 ++++++++++-------- 1 file changed, 102 insertions(+), 76 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index e258d5e004..b221d48152 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -22,6 +22,7 @@ #include "fdbclient/CoordinationInterface.h" #include "fdbclient/FDBOptions.g.h" #include "fdbrpc/TenantName.h" +#include "flow/FastRef.h" #include "flow/IRandom.h" #include "flow/Platform.h" #include "flow/ThreadHelper.actor.h" @@ -1082,16 +1083,18 @@ struct RestoreClusterImpl { AddNewTenants addNewTenants; RemoveMissingTenants removeMissingTenants; + // Variables to track state during restore TenantName lastTenantName; ClusterName lastClusterName; + uint64_t tenantId; + TenantName tenantName; + TenantName tenantNewName; + // Tenant list from data and management clusters. 
std::unordered_map dataClusterTenantMap; std::unordered_map mgmtClusterTenantMap; std::unordered_set mgmtTenantSet; - std::vector> addList; - std::vector removeList; - RestoreClusterImpl(Reference managementDb, ClusterName clusterName, ClusterConnectionString connectionString, @@ -1101,7 +1104,8 @@ struct RestoreClusterImpl { : ctx(managementDb, clusterName), clusterName(clusterName), connectionString(connectionString), clusterEntry(clusterEntry), addNewTenants(addNewTenants), removeMissingTenants(removeMissingTenants) {} - ACTOR static Future markClusterRestoring(RestoreClusterImpl* self, Reference tr) { + ACTOR static Future markClusterAsRestoring(RestoreClusterImpl* self, + Reference tr) { if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::RESTORING; @@ -1120,7 +1124,7 @@ struct RestoreClusterImpl { return true; } - ACTOR static Future markClusterReady(RestoreClusterImpl* self, Reference tr) { + ACTOR static Future markClusterAsReady(RestoreClusterImpl* self, Reference tr) { if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::READY; @@ -1139,6 +1143,19 @@ struct RestoreClusterImpl { return Void(); } + ACTOR static Future markManagementTenantAsError(RestoreClusterImpl* self, + Reference tr) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + + if (!tenantEntry.present()) { + return Void(); + } + + tenantEntry.get().tenantState = TenantState::ERROR; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get()); + return Void(); + } + ACTOR static Future getTenantsFromDataCluster(RestoreClusterImpl* self, Reference tr) { TenantNameRef begin = self->lastTenantName; 
TenantNameRef end = "\xff\xff"_sr; @@ -1169,7 +1186,7 @@ struct RestoreClusterImpl { return Void(); } - ACTOR static Future getTenantsFromMgmtCluster(RestoreClusterImpl* self, Reference tr) { + ACTOR static Future getTenantsFromManagementCluster(RestoreClusterImpl* self, Reference tr) { TenantNameRef begin = self->lastTenantName; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = @@ -1187,8 +1204,8 @@ struct RestoreClusterImpl { return !tenants.more; } - ACTOR static Future getTenantsFromMgmtClusterForCurrentDataCluster(RestoreClusterImpl* self, - Reference tr) { + ACTOR static Future getTenantsFromManagementClusterForCurrentDataCluster(RestoreClusterImpl* self, + Reference tr) { std::pair clusterTupleRange = std::make_pair(Tuple::makeTuple(self->lastClusterName), Tuple::makeTuple(keyAfter(self->clusterName))); state Future> tenantEntriesFuture = @@ -1208,13 +1225,13 @@ struct RestoreClusterImpl { return !tenantEntries.more; } - ACTOR static Future getAllTenantsFromMgmtCluster(RestoreClusterImpl* self) { + ACTOR static Future getAllTenantsFromManagementCluster(RestoreClusterImpl* self) { // get all tenants across all data clusters self->lastClusterName = self->clusterName; loop { bool gotAllTenants = wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return getTenantsFromMgmtCluster(self, tr); + return getTenantsFromManagementCluster(self, tr); })); if (gotAllTenants) { @@ -1226,7 +1243,7 @@ struct RestoreClusterImpl { loop { bool gotAllTenants = wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return getTenantsFromMgmtClusterForCurrentDataCluster(self, tr); + return getTenantsFromManagementClusterForCurrentDataCluster(self, tr); })); if (gotAllTenants) { @@ -1237,101 +1254,110 @@ struct RestoreClusterImpl { return Void(); } - ACTOR static Future addTenants(RestoreClusterImpl* self, Reference tr) { - for (auto t : self->addList) { - // Add the tenant to the tenantmap - 
ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, t.first, t.second); - // Add the tenant to the cluster -> tenant index - ManagementClusterMetadata::clusterTenantIndex.insert(tr, Tuple::makeTuple(self->clusterName, t.first)); - // Add to the tenant group - ManagementClusterMetadata::tenantMetadata().tenantGroupTenantIndex.insert( - tr, Tuple::makeTuple(t.second.tenantGroup.get(), t.first)); - } + ACTOR static Future> lookupTenantInManagementClusterById(RestoreClusterImpl* self) { + Optional tenantName = + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return ManagementClusterMetadata::tenantMetadata().tenantIdIndex.get(tr, self->tenantId); + })); + return tenantName; + } + + ACTOR static Future> lookupTenantInDataClusterById(RestoreClusterImpl* self) { + Optional tenantName = + wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { + return TenantMetadata::tenantIdIndex().get(tr, self->tenantId); + })); + + return tenantName; + } + + ACTOR static Future renameTenantOnDataCluster(RestoreClusterImpl* self) { + ASSERT(self->tenantId != -1); + wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { + return TenantAPI::renameTenantTransaction( + tr, self->tenantName, self->tenantNewName, self->tenantId, ClusterType::METACLUSTER_DATA); + })); return Void(); } - ACTOR static Future removeTenants(RestoreClusterImpl* self, Reference tr) { - for (auto t : self->removeList) { - // Erase the tenant from the tenantmap - ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, t); - // Remove the tenant from the cluster -> tenant index - ManagementClusterMetadata::clusterTenantIndex.erase(tr, Tuple::makeTuple(self->clusterName, t)); - } - + ACTOR static Future removeTenantFromDataCluster(RestoreClusterImpl* self) { + wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { + return TenantAPI::deleteTenantTransaction( + tr, self->tenantName, self->tenantId, ClusterType::METACLUSTER_DATA); + 
})); return Void(); } + // This only supports the restore of an already registered data cluster, for now. ACTOR static Future run(RestoreClusterImpl* self) { - state bool clusterIsPresent; + state bool clusterIsRestoring; // set state to restoring try { - wait(store(clusterIsPresent, + wait(store(clusterIsRestoring, self->ctx.runManagementTransaction([self = self](Reference tr) { - return markClusterRestoring(self, tr); + return markClusterAsRestoring(self, tr); }))); } catch (Error& e) { // If the transaction retries after success or if we are trying a second time to restore the cluster, it // will throw an error indicating that the restore has already started if (e.code() == error_code_cluster_restored) { - clusterIsPresent = true; + clusterIsRestoring = true; } else { throw; } } - if (clusterIsPresent) { - // get all the tenant information from the new data cluster + if (!clusterIsRestoring) { + // get all the tenant information from the newly registered data cluster wait(getAllTenantsFromDataCluster(self)); - // get all the tenant information for this data cluster from mgmt cluster - wait(getAllTenantsFromMgmtCluster(self)); - // compare and build the add list and remove list + // get all the tenant information for this data cluster from manangement cluster + wait(getAllTenantsFromManagementCluster(self)); - for (std::pair t : self->dataClusterTenantMap) { - // Check to ensure that the tenant is not assigned to another data cluster - auto itr = self->mgmtClusterTenantMap.find(t.first); - if ((itr != self->mgmtClusterTenantMap.end()) && itr->second.assignedCluster.present() && - itr->second.assignedCluster.get() != self->clusterName) { - // error, fail the restore - // set error - // throw; - } + state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); + while (itr != self->dataClusterTenantMap.end()) { - // build add list - if (self->mgmtTenantSet.find(t.first) == self->mgmtTenantSet.end()) { - if (self->addNewTenants == AddNewTenants::True) 
{ - self->addList.push_back(t); - } else { - // error - // throw; - } - } - } + state TenantName tenantNameOnDataCluster = itr->first; + state uint64_t tenantId = (itr->second).id; + self->tenantId = tenantId; + self->tenantName = tenantNameOnDataCluster; - for (TenantNameRef t : self->mgmtTenantSet) { - if (self->dataClusterTenantMap.find(t) == self->dataClusterTenantMap.end()) { + state Optional tenantNameOnMgmtCluster = wait(lookupTenantInManagementClusterById(self)); + if (!tenantNameOnMgmtCluster.present()) { + // A tenant with this id is not found on the mgmt cluster. if (self->removeMissingTenants == RemoveMissingTenants::True) { - self->removeList.push_back(t); - } else { - // error - // throw; + wait(removeTenantFromDataCluster(self)); + } + } else { + // A tenant with this id is found on the manangement cluster. + if (tenantNameOnDataCluster.compare(tenantNameOnMgmtCluster.get()) != 0) { + // renamed + self->tenantNewName = tenantNameOnMgmtCluster.get(); + wait(renameTenantOnDataCluster(self)); } } + + ++itr; } - try { - wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return addTenants(self, tr); })); - wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return removeTenants(self, tr); })); - } catch (Error& e) { - throw; + state std::unordered_set::iterator setItr = self->mgmtTenantSet.begin(); + while (setItr != self->mgmtTenantSet.end()) { + self->tenantId = self->mgmtClusterTenantMap[*setItr].id; + state Optional tNameOnDataCluster = wait(lookupTenantInDataClusterById(self)); + + if (!tNameOnDataCluster.present()) { + // Set Tenant in ERROR state + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return markManagementTenantAsError(self, tr); + })); + } + ++setItr; } - // set to ready state + // set restored cluster to ready state try { wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return markClusterReady(self, tr); })); + [self = self](Reference tr) { 
return markClusterAsReady(self, tr); })); } catch (Error& e) { throw; } @@ -1413,10 +1439,10 @@ struct CreateTenantImpl { wait(self->ctx.setCluster(tr, existingEntry.get().assignedCluster.get())); return true; } else { - // The previous creation is permanently failed, so cleanup the tenant and create it again from scratch - // We don't need to remove it from the tenantNameIndex because we will overwrite the existing entry - // later in this transaction. - ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, existingEntry.get().id); + // The previous creation is permanently failed, so cleanup the tenant and create it again from + // scratch We don't need to remove it from the tenant map because we will overwrite the existing + // entry later in this transaction. + ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, existingEntry.get().id); ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); ManagementClusterMetadata::clusterTenantCount.atomicOp( tr, existingEntry.get().assignedCluster.get(), -1, MutationRef::AddValue); From 8283ea0c7868e2b6965fa083400c6ca2d3d0e0e0 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 8 Sep 2022 15:25:34 -0700 Subject: [PATCH 06/57] Get the entire TenantIdIndex into memory to avoid doing individual lookups --- .../fdbclient/MetaclusterManagement.actor.h | 102 ++++++++++++------ 1 file changed, 70 insertions(+), 32 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index b221d48152..5f29015078 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -26,6 +26,7 @@ #include "flow/IRandom.h" #include "flow/Platform.h" #include "flow/ThreadHelper.actor.h" +#include #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H) #define 
FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H #include "fdbclient/MetaclusterManagement.actor.g.h" @@ -1087,12 +1088,15 @@ struct RestoreClusterImpl { TenantName lastTenantName; ClusterName lastClusterName; uint64_t tenantId; + uint64_t lastTenantId; TenantName tenantName; TenantName tenantNewName; // Tenant list from data and management clusters. std::unordered_map dataClusterTenantMap; + std::unordered_map dataClusterTenantIdIndex; std::unordered_map mgmtClusterTenantMap; + std::unordered_map mgmtClusterTenantIdIndex; std::unordered_set mgmtTenantSet; RestoreClusterImpl(Reference managementDb, @@ -1173,6 +1177,24 @@ struct RestoreClusterImpl { return !tenants.more; } + ACTOR static Future getTenantIdNameIndexFromDataCluster(RestoreClusterImpl* self, + Reference tr) { + uint64_t begin = self->lastTenantId; + uint64_t end = std::numeric_limits::max(); + state Future>> tenantsFuture = + TenantMetadata::tenantIdIndex().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + state KeyBackedRangeResult> tenants = wait(tenantsFuture); + + if (!tenants.results.empty()) { + for (auto t : tenants.results) { + self->dataClusterTenantIdIndex.emplace(t.first, t.second); + self->lastTenantId = t.first; + } + } + + return !tenants.more; + } + ACTOR static Future getAllTenantsFromDataCluster(RestoreClusterImpl* self) { loop { bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( @@ -1183,6 +1205,15 @@ struct RestoreClusterImpl { } } + loop { + bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return getTenantIdNameIndexFromDataCluster(self, tr); })); + + if (gotAllTenants) { + break; + } + } + return Void(); } @@ -1204,6 +1235,25 @@ struct RestoreClusterImpl { return !tenants.more; } + ACTOR static Future getTenantIdNameIndexFromManagementCluster(RestoreClusterImpl* self, + Reference tr) { + uint64_t begin = self->lastTenantId; + uint64_t end = std::numeric_limits::max(); + state Future>> tenantsFuture = + 
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.getRange( + tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + state KeyBackedRangeResult> tenants = wait(tenantsFuture); + + if (!tenants.results.empty()) { + for (auto t : tenants.results) { + self->mgmtClusterTenantIdIndex.emplace(t.first, t.second); + self->lastTenantId = t.first; + } + } + + return !tenants.more; + } + ACTOR static Future getTenantsFromManagementClusterForCurrentDataCluster(RestoreClusterImpl* self, Reference tr) { std::pair clusterTupleRange = @@ -1239,6 +1289,17 @@ struct RestoreClusterImpl { } } + loop { + bool gotAllTenants = + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return getTenantIdNameIndexFromManagementCluster(self, tr); + })); + + if (gotAllTenants) { + break; + } + } + // get tenants for the specific data cluster loop { bool gotAllTenants = @@ -1254,24 +1315,6 @@ struct RestoreClusterImpl { return Void(); } - ACTOR static Future> lookupTenantInManagementClusterById(RestoreClusterImpl* self) { - Optional tenantName = - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return ManagementClusterMetadata::tenantMetadata().tenantIdIndex.get(tr, self->tenantId); - })); - - return tenantName; - } - - ACTOR static Future> lookupTenantInDataClusterById(RestoreClusterImpl* self) { - Optional tenantName = - wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { - return TenantMetadata::tenantIdIndex().get(tr, self->tenantId); - })); - - return tenantName; - } - ACTOR static Future renameTenantOnDataCluster(RestoreClusterImpl* self) { ASSERT(self->tenantId != -1); wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { @@ -1322,19 +1365,15 @@ struct RestoreClusterImpl { self->tenantId = tenantId; self->tenantName = tenantNameOnDataCluster; - state Optional tenantNameOnMgmtCluster = wait(lookupTenantInManagementClusterById(self)); - if (!tenantNameOnMgmtCluster.present()) { - // A tenant with this 
id is not found on the mgmt cluster. - if (self->removeMissingTenants == RemoveMissingTenants::True) { + auto tenantNameOnMgmtCluster = self->mgmtClusterTenantIdIndex.find(tenantId); + if (tenantNameOnMgmtCluster == self->mgmtClusterTenantIdIndex.end()) { + if (self->removeMissingTenants) { wait(removeTenantFromDataCluster(self)); } - } else { - // A tenant with this id is found on the manangement cluster. - if (tenantNameOnDataCluster.compare(tenantNameOnMgmtCluster.get()) != 0) { - // renamed - self->tenantNewName = tenantNameOnMgmtCluster.get(); - wait(renameTenantOnDataCluster(self)); - } + } else if (tenantNameOnDataCluster.compare(self->mgmtClusterTenantIdIndex[tenantId]) != 0) { + // renamed + self->tenantNewName = self->mgmtClusterTenantIdIndex[tenantId]; + wait(renameTenantOnDataCluster(self)); } ++itr; @@ -1343,9 +1382,8 @@ struct RestoreClusterImpl { state std::unordered_set::iterator setItr = self->mgmtTenantSet.begin(); while (setItr != self->mgmtTenantSet.end()) { self->tenantId = self->mgmtClusterTenantMap[*setItr].id; - state Optional tNameOnDataCluster = wait(lookupTenantInDataClusterById(self)); - if (!tNameOnDataCluster.present()) { + if (self->dataClusterTenantIdIndex.find(self->tenantId) == self->dataClusterTenantIdIndex.end()) { // Set Tenant in ERROR state wait(self->ctx.runManagementTransaction([self = self](Reference tr) { return markManagementTenantAsError(self, tr); @@ -1440,7 +1478,7 @@ struct CreateTenantImpl { return true; } else { // The previous creation is permanently failed, so cleanup the tenant and create it again from - // scratch We don't need to remove it from the tenant map because we will overwrite the existing + // scratch. We don't need to remove it from the tenant map because we will overwrite the existing // entry later in this transaction. 
ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, existingEntry.get().id); ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); From 3f03f2080c1b92265dd9d8ed63b87d2fb52d4169 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 15 Sep 2022 15:07:51 -0700 Subject: [PATCH 07/57] Rename error code --- fdbclient/include/fdbclient/MetaclusterManagement.actor.h | 6 +++--- flow/include/flow/error_definitions.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 5f29015078..612efa4d91 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -567,7 +567,7 @@ void updateClusterMetadata(Transaction tr, if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { throw cluster_removed(); } else if (previousMetadata.entry.clusterState == DataClusterState::RESTORING) { - throw cluster_restored(); + throw cluster_restoring(); } ManagementClusterMetadata::dataClusters().set(tr, name, updatedEntry.get()); updateClusterCapacityIndex(tr, name, previousMetadata.entry, updatedEntry.get()); @@ -1344,7 +1344,7 @@ struct RestoreClusterImpl { } catch (Error& e) { // If the transaction retries after success or if we are trying a second time to restore the cluster, it // will throw an error indicating that the restore has already started - if (e.code() == error_code_cluster_restored) { + if (e.code() == error_code_cluster_restoring) { clusterIsRestoring = true; } else { throw; @@ -1639,7 +1639,7 @@ struct CreateTenantImpl { if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { throw cluster_removed(); } else if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { - throw cluster_restored(); + throw cluster_restoring(); } 
managementClusterAddTenantToGroup( diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h index 282d57542f..bdd627c2f2 100755 --- a/flow/include/flow/error_definitions.h +++ b/flow/include/flow/error_definitions.h @@ -268,7 +268,7 @@ ERROR( metacluster_no_capacity, 2166, "Metacluster does not have capacity to cre ERROR( management_cluster_invalid_access, 2167, "Standard transactions cannot be run against the management cluster" ) ERROR( tenant_creation_permanently_failed, 2168, "The tenant creation did not complete in a timely manner and has permanently failed" ) ERROR( cluster_removed, 2169, "The cluster is being removed from the metacluster" ) -ERROR( cluster_restored, 2170, "The cluster is being restored to the metacluster" ) +ERROR( cluster_restoring, 2170, "The cluster is being restored to the metacluster" ) // 2200 - errors from bindings and official APIs ERROR( api_version_unset, 2200, "API version is not set" ) From 23601dbdd833db819c773061f901127ad127ad3a Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 16 Sep 2022 12:45:42 -0700 Subject: [PATCH 08/57] Track state locally instead of tracking at class level --- .../fdbclient/MetaclusterManagement.actor.h | 158 +++++++----------- 1 file changed, 57 insertions(+), 101 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 612efa4d91..7f501f505e 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1084,14 +1084,6 @@ struct RestoreClusterImpl { AddNewTenants addNewTenants; RemoveMissingTenants removeMissingTenants; - // Variables to track state during restore - TenantName lastTenantName; - ClusterName lastClusterName; - uint64_t tenantId; - uint64_t lastTenantId; - TenantName tenantName; - TenantName tenantNewName; - // Tenant list from data and management clusters. 
std::unordered_map dataClusterTenantMap; std::unordered_map dataClusterTenantIdIndex; @@ -1147,21 +1139,21 @@ struct RestoreClusterImpl { return Void(); } - ACTOR static Future markManagementTenantAsError(RestoreClusterImpl* self, - Reference tr) { - state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); + ACTOR static Future markManagementTenantAsError(Reference tr, + TenantName tenantName) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantName)); if (!tenantEntry.present()) { return Void(); } tenantEntry.get().tenantState = TenantState::ERROR; - ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->tenantName, tenantEntry.get()); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantName, tenantEntry.get()); return Void(); } ACTOR static Future getTenantsFromDataCluster(RestoreClusterImpl* self, Reference tr) { - TenantNameRef begin = self->lastTenantName; + TenantNameRef begin = self->clusterName; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = TenantMetadata::tenantMap().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); @@ -1170,7 +1162,6 @@ struct RestoreClusterImpl { if (!tenants.results.empty()) { for (auto t : tenants.results) { self->dataClusterTenantMap.emplace(t.first, t.second); - self->lastTenantName = t.first; } } @@ -1179,7 +1170,7 @@ struct RestoreClusterImpl { ACTOR static Future getTenantIdNameIndexFromDataCluster(RestoreClusterImpl* self, Reference tr) { - uint64_t begin = self->lastTenantId; + uint64_t begin = 0; uint64_t end = std::numeric_limits::max(); state Future>> tenantsFuture = TenantMetadata::tenantIdIndex().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); @@ -1188,37 +1179,28 @@ struct RestoreClusterImpl { if (!tenants.results.empty()) { for (auto t : tenants.results) { self->dataClusterTenantIdIndex.emplace(t.first, t.second); - self->lastTenantId = t.first; } } return !tenants.more; } - ACTOR static Future 
getAllTenantsFromDataCluster(RestoreClusterImpl* self) { - loop { - bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); + ACTOR static Future getAllTenantsFromDataCluster(RestoreClusterImpl* self) { + bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - if (gotAllTenants) { - break; - } + if (!gotAllTenants) { + return false; } - loop { - bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return getTenantIdNameIndexFromDataCluster(self, tr); })); + bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return getTenantIdNameIndexFromDataCluster(self, tr); })); - if (gotAllTenants) { - break; - } - } - - return Void(); + return gotAllTenants; } ACTOR static Future getTenantsFromManagementCluster(RestoreClusterImpl* self, Reference tr) { - TenantNameRef begin = self->lastTenantName; + TenantNameRef begin = ""_sr; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( @@ -1228,7 +1210,6 @@ struct RestoreClusterImpl { if (!tenants.results.empty()) { for (auto t : tenants.results) { self->mgmtClusterTenantMap.emplace(t.first, t.second); - self->lastTenantName = t.first; } } @@ -1237,7 +1218,7 @@ struct RestoreClusterImpl { ACTOR static Future getTenantIdNameIndexFromManagementCluster(RestoreClusterImpl* self, Reference tr) { - uint64_t begin = self->lastTenantId; + uint64_t begin = 0; uint64_t end = std::numeric_limits::max(); state Future>> tenantsFuture = ManagementClusterMetadata::tenantMetadata().tenantIdIndex.getRange( @@ -1247,7 +1228,6 @@ struct RestoreClusterImpl { if (!tenants.results.empty()) { for (auto t : tenants.results) { self->mgmtClusterTenantIdIndex.emplace(t.first, t.second); - self->lastTenantId = t.first; } } @@ 
-1257,7 +1237,7 @@ struct RestoreClusterImpl { ACTOR static Future getTenantsFromManagementClusterForCurrentDataCluster(RestoreClusterImpl* self, Reference tr) { std::pair clusterTupleRange = - std::make_pair(Tuple::makeTuple(self->lastClusterName), Tuple::makeTuple(keyAfter(self->clusterName))); + std::make_pair(Tuple::makeTuple(self->clusterName), Tuple::makeTuple(keyAfter(self->clusterName))); state Future> tenantEntriesFuture = ManagementClusterMetadata::clusterTenantIndex.getRange( tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); @@ -1267,69 +1247,39 @@ struct RestoreClusterImpl { for (auto t : tenantEntries.results) { ASSERT(t.getString(0) == self->clusterName); self->mgmtTenantSet.insert(t.getString(1)); - // fix this ... not correct - self->lastClusterName = t.getString(0); } } return !tenantEntries.more; } - ACTOR static Future getAllTenantsFromManagementCluster(RestoreClusterImpl* self) { + ACTOR static Future getAllTenantsFromManagementCluster(RestoreClusterImpl* self) { // get all tenants across all data clusters - self->lastClusterName = self->clusterName; - loop { - bool gotAllTenants = - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return getTenantsFromManagementCluster(self, tr); - })); + bool gotAllTenants = + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return getTenantsFromManagementCluster(self, tr); + })); - if (gotAllTenants) { - break; - } + if (!gotAllTenants) { + return false; } - loop { - bool gotAllTenants = - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return getTenantIdNameIndexFromManagementCluster(self, tr); - })); + bool gotAllTenants = + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return getTenantIdNameIndexFromManagementCluster(self, tr); + })); - if (gotAllTenants) { - break; - } + if (!gotAllTenants) { + return false; } // get tenants for the specific data cluster - loop { - 
bool gotAllTenants = - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return getTenantsFromManagementClusterForCurrentDataCluster(self, tr); - })); + bool gotAllTenants = + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return getTenantsFromManagementClusterForCurrentDataCluster(self, tr); + })); - if (gotAllTenants) { - break; - } - } - - return Void(); - } - - ACTOR static Future renameTenantOnDataCluster(RestoreClusterImpl* self) { - ASSERT(self->tenantId != -1); - wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { - return TenantAPI::renameTenantTransaction( - tr, self->tenantName, self->tenantNewName, self->tenantId, ClusterType::METACLUSTER_DATA); - })); - return Void(); - } - - ACTOR static Future removeTenantFromDataCluster(RestoreClusterImpl* self) { - wait(self->ctx.runDataClusterTransaction([self = self](Reference tr) { - return TenantAPI::deleteTenantTransaction( - tr, self->tenantName, self->tenantId, ClusterType::METACLUSTER_DATA); - })); - return Void(); + return gotAllTenants; } // This only supports the restore of an already registered data cluster, for now. 
@@ -1353,27 +1303,32 @@ struct RestoreClusterImpl { if (!clusterIsRestoring) { // get all the tenant information from the newly registered data cluster - wait(getAllTenantsFromDataCluster(self)); + bool gotAllTenants = wait(getAllTenantsFromDataCluster(self)); // get all the tenant information for this data cluster from manangement cluster - wait(getAllTenantsFromManagementCluster(self)); + bool gotAllTenants = wait(getAllTenantsFromManagementCluster(self)); state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); while (itr != self->dataClusterTenantMap.end()) { - - state TenantName tenantNameOnDataCluster = itr->first; - state uint64_t tenantId = (itr->second).id; - self->tenantId = tenantId; - self->tenantName = tenantNameOnDataCluster; + uint64_t tenantId = (itr->second).id; + TenantName tenantName = itr->first; auto tenantNameOnMgmtCluster = self->mgmtClusterTenantIdIndex.find(tenantId); if (tenantNameOnMgmtCluster == self->mgmtClusterTenantIdIndex.end()) { + // Delete if (self->removeMissingTenants) { - wait(removeTenantFromDataCluster(self)); + wait(self->ctx.runDataClusterTransaction([tenantName, tenantId](Reference tr) { + return TenantAPI::deleteTenantTransaction( + tr, tenantName, tenantId, ClusterType::METACLUSTER_DATA); + })); } - } else if (tenantNameOnDataCluster.compare(self->mgmtClusterTenantIdIndex[tenantId]) != 0) { - // renamed - self->tenantNewName = self->mgmtClusterTenantIdIndex[tenantId]; - wait(renameTenantOnDataCluster(self)); + } else if (tenantName.compare(self->mgmtClusterTenantIdIndex[tenantId]) != 0) { + // Rename + TenantName tenantNewName = self->mgmtClusterTenantIdIndex[tenantId]; + wait(self->ctx.runDataClusterTransaction( + [tenantName, tenantNewName, tenantId](Reference tr) { + return TenantAPI::renameTenantTransaction( + tr, tenantName, tenantNewName, tenantId, ClusterType::METACLUSTER_DATA); + })); } ++itr; @@ -1381,12 +1336,13 @@ struct RestoreClusterImpl { state std::unordered_set::iterator setItr = 
self->mgmtTenantSet.begin(); while (setItr != self->mgmtTenantSet.end()) { - self->tenantId = self->mgmtClusterTenantMap[*setItr].id; + TenantName tenantName = *setItr; + uint64_t tenantId = self->mgmtClusterTenantMap[tenantName].id; - if (self->dataClusterTenantIdIndex.find(self->tenantId) == self->dataClusterTenantIdIndex.end()) { + if (self->dataClusterTenantIdIndex.find(tenantId) == self->dataClusterTenantIdIndex.end()) { // Set Tenant in ERROR state - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return markManagementTenantAsError(self, tr); + wait(self->ctx.runManagementTransaction([tenantName](Reference tr) { + return markManagementTenantAsError(tr, tenantName); })); } ++setItr; From 22d5546206781a0d43cdea57f286ba253407df82 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Fri, 16 Sep 2022 13:47:11 -0700 Subject: [PATCH 09/57] Revert to getting the tenants from Management cluster in a loop --- fdbcli/MetaclusterCommands.actor.cpp | 17 ++--- .../fdbclient/MetaclusterManagement.actor.h | 65 ++++++++++++------- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 9f22689f99..cf9ad178de 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -534,14 +534,15 @@ std::vector metaclusterHintGenerator(std::vector const& CommandFactory metaclusterRegisterFactory( "metacluster", - CommandHelp("metacluster [ARGS]", - "view and manage a metacluster", - "`create_experimental' and `decommission' set up or deconfigure a metacluster.\n" - "`register' and `remove' add and remove data clusters from the metacluster.\n" - "`configure' updates the configuration of a data cluster.\n" - "`list' prints a list of data clusters in the metacluster.\n" - "`get' prints the metadata for a particular data cluster.\n" - "`status' prints metacluster metadata.\n"), + CommandHelp( + "metacluster [ARGS]", + "view and manage a 
metacluster", + "`create_experimental' and `decommission' set up or deconfigure a metacluster.\n" + "`register' and `remove' add and remove data clusters from the metacluster.\n" + "`configure' updates the configuration of a data cluster.\n" + "`list' prints a list of data clusters in the metacluster.\n" + "`get' prints the metadata for a particular data cluster.\n" + "`status' prints metacluster metadata.\n"), &metaclusterGenerator, &metaclusterHintGenerator); diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 7f501f505e..0f8b85a827 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1113,9 +1113,7 @@ struct RestoreClusterImpl { updatedEntry); } - TraceEvent("MarkedDataClusterRestoring") - .detail("Name", self->ctx.clusterName.get()) - .detail("Version", tr->getCommittedVersion()); + TraceEvent("MarkedDataClusterRestoring").detail("Name", self->ctx.clusterName.get()); return true; } @@ -1199,39 +1197,46 @@ struct RestoreClusterImpl { return gotAllTenants; } - ACTOR static Future getTenantsFromManagementCluster(RestoreClusterImpl* self, Reference tr) { - TenantNameRef begin = ""_sr; + ACTOR static Future> getTenantsFromManagementCluster(RestoreClusterImpl* self, + Reference tr, + TenantName initialTenantName) { + TenantNameRef begin = initialTenantName; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); state KeyBackedRangeResult> tenants = wait(tenantsFuture); + state TenantName lastTenantNameRetrieved; if (!tenants.results.empty()) { for (auto t : tenants.results) { self->mgmtClusterTenantMap.emplace(t.first, t.second); + lastTenantNameRetrieved = t.first; } } - return !tenants.more; + return std::pair{ !tenants.more, lastTenantNameRetrieved }; } - ACTOR static 
Future getTenantIdNameIndexFromManagementCluster(RestoreClusterImpl* self, - Reference tr) { - uint64_t begin = 0; + ACTOR static Future> getTenantIdNameIndexFromManagementCluster(RestoreClusterImpl* self, + Reference tr, + uint64_t initialTenantId) { + uint64_t begin = initialTenantId; uint64_t end = std::numeric_limits::max(); state Future>> tenantsFuture = ManagementClusterMetadata::tenantMetadata().tenantIdIndex.getRange( tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); state KeyBackedRangeResult> tenants = wait(tenantsFuture); + state uint64_t lastTenantIdRetrieved; if (!tenants.results.empty()) { for (auto t : tenants.results) { self->mgmtClusterTenantIdIndex.emplace(t.first, t.second); + lastTenantIdRetrieved = t.first; } } - return !tenants.more; + return std::pair{ !tenants.more, lastTenantIdRetrieved }; } ACTOR static Future getTenantsFromManagementClusterForCurrentDataCluster(RestoreClusterImpl* self, @@ -1255,22 +1260,38 @@ struct RestoreClusterImpl { ACTOR static Future getAllTenantsFromManagementCluster(RestoreClusterImpl* self) { // get all tenants across all data clusters - bool gotAllTenants = - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return getTenantsFromManagementCluster(self, tr); - })); + state TenantName beginRangeTenantName = ""_sr; + loop { + TenantName initialTenantName = beginRangeTenantName; + std::pair tenantsItr = wait(self->ctx.runManagementTransaction( + [self = self, initialTenantName](Reference tr) { + return getTenantsFromManagementCluster(self, tr, initialTenantName); + })); - if (!gotAllTenants) { - return false; + if (tenantsItr.first) { + // retrieved all the tenants + break; + } else { + // begin range from this tenant + beginRangeTenantName = tenantsItr.second; + } } - bool gotAllTenants = - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return getTenantIdNameIndexFromManagementCluster(self, tr); - })); + state uint64_t beginRangeTenantId = 0; + loop { + uint64_t 
initialTenantId = beginRangeTenantId; + std::pair tenantsItr = wait(self->ctx.runManagementTransaction( + [self = self, initialTenantId](Reference tr) { + return getTenantIdNameIndexFromManagementCluster(self, tr, initialTenantId); + })); - if (!gotAllTenants) { - return false; + if (tenantsItr.first) { + // retrieved all the tenants + break; + } else { + // begin range from this tenantId + beginRangeTenantId = tenantsItr.second; + } } // get tenants for the specific data cluster From 226976f0a7730131848c42d34b7d9c10408cc48b Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Tue, 20 Sep 2022 11:18:18 -0700 Subject: [PATCH 10/57] Change option to repopulate_from_data_cluster --- fdbcli/MetaclusterCommands.actor.cpp | 44 +++++++--------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index cf9ad178de..6598315942 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -181,50 +181,28 @@ Optional parseToken(StringRef token, const char* str) { ACTOR Future metaclusterRestoreCommand(Reference db, std::vector tokens) { if (tokens.size() < 4 || tokens.size() > 6) { fmt::print("Usage: metacluster restore connection_string=\n" - "[|]\n\n"); + "[repopulate_from_data_cluster]\n\n"); fmt::print("Restore a data cluster.\n"); return false; } - state ClusterNameRef clusterName = tokens[3]; - // connection string ClusterConnectionString connectionString; - auto optVal = parseToken(tokens[4], "connection_string"); + auto optVal = parseToken(tokens[3], "connection_string"); if (optVal.present()) { connectionString = ClusterConnectionString(optVal.get()); } - AddNewTenants addNewTenants(AddNewTenants::True); - if (tokens.size() > 4) { - optVal = parseToken(tokens[4], "add_new_tenants"); - if (optVal.present()) { - if (optVal.get() == "true") { - addNewTenants = AddNewTenants::True; - } else { - addNewTenants = AddNewTenants::False; - } - } + 
state bool restore_from_data_cluster = tokens.size() == 5; + if (restore_from_data_cluster) { + DataClusterEntry defaultEntry; + wait(MetaclusterAPI::restoreCluster( + db, tokens[2], connectionString, defaultEntry, AddNewTenants::False, RemoveMissingTenants::True)); + + fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); + return true; } - - RemoveMissingTenants removeMissingTenants(RemoveMissingTenants::True); - if (tokens.size() > 5) { - optVal = parseToken(tokens[4], "remove_missing_tenants"); - if (optVal.present()) { - if (optVal.get() == "true") { - removeMissingTenants = RemoveMissingTenants::True; - } else { - removeMissingTenants = RemoveMissingTenants::False; - } - } - } - - DataClusterEntry defaultEntry; - wait(MetaclusterAPI::restoreCluster( - db, clusterName, connectionString, defaultEntry, addNewTenants, removeMissingTenants)); - - fmt::print("The cluster `{}' has been restored\n", printable(clusterName).c_str()); - return true; + return false; } // metacluster configure command From 8fcc3939949eae12f41ce1d1a35382f324219b34 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Tue, 20 Sep 2022 11:40:55 -0700 Subject: [PATCH 11/57] Misc fixes - Added a blurb for the restore command - Remove unnecessary if checks - Fix the next loop start values. 
--- fdbcli/MetaclusterCommands.actor.cpp | 1 + .../fdbclient/MetaclusterManagement.actor.h | 40 +++++++------------ 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 6598315942..0c229225ff 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -518,6 +518,7 @@ CommandFactory metaclusterRegisterFactory( "`create_experimental' and `decommission' set up or deconfigure a metacluster.\n" "`register' and `remove' add and remove data clusters from the metacluster.\n" "`configure' updates the configuration of a data cluster.\n" + "`restore' restores the specified data cluster." "`list' prints a list of data clusters in the metacluster.\n" "`get' prints the metadata for a particular data cluster.\n" "`status' prints metacluster metadata.\n"), diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 0f8b85a827..77aa476070 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1157,10 +1157,8 @@ struct RestoreClusterImpl { TenantMetadata::tenantMap().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); state KeyBackedRangeResult> tenants = wait(tenantsFuture); - if (!tenants.results.empty()) { - for (auto t : tenants.results) { - self->dataClusterTenantMap.emplace(t.first, t.second); - } + for (auto t : tenants.results) { + self->dataClusterTenantMap.emplace(t.first, t.second); } return !tenants.more; @@ -1174,10 +1172,8 @@ struct RestoreClusterImpl { TenantMetadata::tenantIdIndex().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); state KeyBackedRangeResult> tenants = wait(tenantsFuture); - if (!tenants.results.empty()) { - for (auto t : tenants.results) { - self->dataClusterTenantIdIndex.emplace(t.first, t.second); - } + for (auto t : tenants.results) 
{ + self->dataClusterTenantIdIndex.emplace(t.first, t.second); } return !tenants.more; @@ -1208,11 +1204,9 @@ struct RestoreClusterImpl { state KeyBackedRangeResult> tenants = wait(tenantsFuture); state TenantName lastTenantNameRetrieved; - if (!tenants.results.empty()) { - for (auto t : tenants.results) { - self->mgmtClusterTenantMap.emplace(t.first, t.second); - lastTenantNameRetrieved = t.first; - } + for (auto t : tenants.results) { + self->mgmtClusterTenantMap.emplace(t.first, t.second); + lastTenantNameRetrieved = t.first; } return std::pair{ !tenants.more, lastTenantNameRetrieved }; @@ -1229,11 +1223,9 @@ struct RestoreClusterImpl { state KeyBackedRangeResult> tenants = wait(tenantsFuture); state uint64_t lastTenantIdRetrieved; - if (!tenants.results.empty()) { - for (auto t : tenants.results) { - self->mgmtClusterTenantIdIndex.emplace(t.first, t.second); - lastTenantIdRetrieved = t.first; - } + for (auto t : tenants.results) { + self->mgmtClusterTenantIdIndex.emplace(t.first, t.second); + lastTenantIdRetrieved = t.first; } return std::pair{ !tenants.more, lastTenantIdRetrieved }; @@ -1248,11 +1240,9 @@ struct RestoreClusterImpl { tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); state KeyBackedRangeResult tenantEntries = wait(tenantEntriesFuture); - if (!tenantEntries.results.empty()) { - for (auto t : tenantEntries.results) { - ASSERT(t.getString(0) == self->clusterName); - self->mgmtTenantSet.insert(t.getString(1)); - } + for (auto t : tenantEntries.results) { + ASSERT(t.getString(0) == self->clusterName); + self->mgmtTenantSet.insert(t.getString(1)); } return !tenantEntries.more; @@ -1273,7 +1263,7 @@ struct RestoreClusterImpl { break; } else { // begin range from this tenant - beginRangeTenantName = tenantsItr.second; + beginRangeTenantName = keyAfter(tenantsItr.second); } } @@ -1290,7 +1280,7 @@ struct RestoreClusterImpl { break; } else { // begin range from this tenantId - beginRangeTenantId = 
tenantsItr.second; + beginRangeTenantId = tenantsItr.second + 1; } } From 959c6b9646dfa2936d463afff73c02c6c1f3c4f2 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Tue, 20 Sep 2022 12:10:27 -0700 Subject: [PATCH 12/57] Avoid a range scan by getting the TenantId and TenantName Index information directly from the TenantMap --- .../fdbclient/MetaclusterManagement.actor.h | 113 +++--------------- 1 file changed, 19 insertions(+), 94 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 77aa476070..ffff35c11a 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1086,10 +1086,10 @@ struct RestoreClusterImpl { // Tenant list from data and management clusters. std::unordered_map dataClusterTenantMap; - std::unordered_map dataClusterTenantIdIndex; + std::unordered_set dataClusterTenantIdSet; std::unordered_map mgmtClusterTenantMap; std::unordered_map mgmtClusterTenantIdIndex; - std::unordered_set mgmtTenantSet; + std::unordered_set mgmtClusterTenantSetForCurrentDataCluster; RestoreClusterImpl(Reference managementDb, ClusterName clusterName, @@ -1151,7 +1151,7 @@ struct RestoreClusterImpl { } ACTOR static Future getTenantsFromDataCluster(RestoreClusterImpl* self, Reference tr) { - TenantNameRef begin = self->clusterName; + TenantNameRef begin = ""_sr; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = TenantMetadata::tenantMap().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); @@ -1159,21 +1159,7 @@ struct RestoreClusterImpl { for (auto t : tenants.results) { self->dataClusterTenantMap.emplace(t.first, t.second); - } - - return !tenants.more; - } - - ACTOR static Future getTenantIdNameIndexFromDataCluster(RestoreClusterImpl* self, - Reference tr) { - uint64_t begin = 0; - uint64_t end = std::numeric_limits::max(); - state Future>> tenantsFuture = - 
TenantMetadata::tenantIdIndex().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - state KeyBackedRangeResult> tenants = wait(tenantsFuture); - - for (auto t : tenants.results) { - self->dataClusterTenantIdIndex.emplace(t.first, t.second); + self->dataClusterTenantIdSet.emplace(t.second.id); } return !tenants.more; @@ -1183,13 +1169,6 @@ struct RestoreClusterImpl { bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - if (!gotAllTenants) { - return false; - } - - bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return getTenantIdNameIndexFromDataCluster(self, tr); })); - return gotAllTenants; } @@ -1206,49 +1185,18 @@ struct RestoreClusterImpl { state TenantName lastTenantNameRetrieved; for (auto t : tenants.results) { self->mgmtClusterTenantMap.emplace(t.first, t.second); + self->mgmtClusterTenantIdIndex.emplace(t.second.id, t.first); + if (t.second.assignedCluster.present() && self->clusterName.compare(t.second.assignedCluster.get()) == 0) { + self->mgmtClusterTenantSetForCurrentDataCluster.emplace(t.first); + } + lastTenantNameRetrieved = t.first; } return std::pair{ !tenants.more, lastTenantNameRetrieved }; } - ACTOR static Future> getTenantIdNameIndexFromManagementCluster(RestoreClusterImpl* self, - Reference tr, - uint64_t initialTenantId) { - uint64_t begin = initialTenantId; - uint64_t end = std::numeric_limits::max(); - state Future>> tenantsFuture = - ManagementClusterMetadata::tenantMetadata().tenantIdIndex.getRange( - tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - state KeyBackedRangeResult> tenants = wait(tenantsFuture); - - state uint64_t lastTenantIdRetrieved; - for (auto t : tenants.results) { - self->mgmtClusterTenantIdIndex.emplace(t.first, t.second); - lastTenantIdRetrieved = t.first; - } - - return std::pair{ !tenants.more, lastTenantIdRetrieved }; - } - - ACTOR static Future 
getTenantsFromManagementClusterForCurrentDataCluster(RestoreClusterImpl* self, - Reference tr) { - std::pair clusterTupleRange = - std::make_pair(Tuple::makeTuple(self->clusterName), Tuple::makeTuple(keyAfter(self->clusterName))); - state Future> tenantEntriesFuture = - ManagementClusterMetadata::clusterTenantIndex.getRange( - tr, clusterTupleRange.first, clusterTupleRange.second, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - state KeyBackedRangeResult tenantEntries = wait(tenantEntriesFuture); - - for (auto t : tenantEntries.results) { - ASSERT(t.getString(0) == self->clusterName); - self->mgmtTenantSet.insert(t.getString(1)); - } - - return !tenantEntries.more; - } - - ACTOR static Future getAllTenantsFromManagementCluster(RestoreClusterImpl* self) { + ACTOR static Future getAllTenantsFromManagementCluster(RestoreClusterImpl* self) { // get all tenants across all data clusters state TenantName beginRangeTenantName = ""_sr; loop { @@ -1258,39 +1206,15 @@ struct RestoreClusterImpl { return getTenantsFromManagementCluster(self, tr, initialTenantName); })); - if (tenantsItr.first) { - // retrieved all the tenants - break; - } else { - // begin range from this tenant + if (!tenantsItr.first) { + // Not all tenants retrieved yet. Begin next loop from this tenant. 
beginRangeTenantName = keyAfter(tenantsItr.second); - } - } - - state uint64_t beginRangeTenantId = 0; - loop { - uint64_t initialTenantId = beginRangeTenantId; - std::pair tenantsItr = wait(self->ctx.runManagementTransaction( - [self = self, initialTenantId](Reference tr) { - return getTenantIdNameIndexFromManagementCluster(self, tr, initialTenantId); - })); - - if (tenantsItr.first) { - // retrieved all the tenants - break; } else { - // begin range from this tenantId - beginRangeTenantId = tenantsItr.second + 1; + break; } } - // get tenants for the specific data cluster - bool gotAllTenants = - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return getTenantsFromManagementClusterForCurrentDataCluster(self, tr); - })); - - return gotAllTenants; + return Void(); } // This only supports the restore of an already registered data cluster, for now. @@ -1316,7 +1240,7 @@ struct RestoreClusterImpl { // get all the tenant information from the newly registered data cluster bool gotAllTenants = wait(getAllTenantsFromDataCluster(self)); // get all the tenant information for this data cluster from manangement cluster - bool gotAllTenants = wait(getAllTenantsFromManagementCluster(self)); + wait(getAllTenantsFromManagementCluster(self)); state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); while (itr != self->dataClusterTenantMap.end()) { @@ -1345,12 +1269,13 @@ struct RestoreClusterImpl { ++itr; } - state std::unordered_set::iterator setItr = self->mgmtTenantSet.begin(); - while (setItr != self->mgmtTenantSet.end()) { + state std::unordered_set::iterator setItr = + self->mgmtClusterTenantSetForCurrentDataCluster.begin(); + while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { TenantName tenantName = *setItr; uint64_t tenantId = self->mgmtClusterTenantMap[tenantName].id; - if (self->dataClusterTenantIdIndex.find(tenantId) == self->dataClusterTenantIdIndex.end()) { + if 
(self->dataClusterTenantIdSet.find(tenantId) == self->dataClusterTenantIdSet.end()) { // Set Tenant in ERROR state wait(self->ctx.runManagementTransaction([tenantName](Reference tr) { return markManagementTenantAsError(tr, tenantName); From 7d705ea6cd2485af5d629d407ec6370511dded74 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Tue, 20 Sep 2022 13:04:33 -0700 Subject: [PATCH 13/57] Remove duplicate code for connection string parsing now that a few other restore options are removed --- fdbcli/MetaclusterCommands.actor.cpp | 50 ++++++++++------------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 0c229225ff..aacf57459a 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -33,13 +33,16 @@ namespace fdb_cli { -Optional, Optional>> -parseClusterConfiguration(std::vector const& tokens, DataClusterEntry const& defaults, int startIndex) { +Optional, Optional>> parseClusterConfiguration( + std::vector const& tokens, + DataClusterEntry const& defaults, + int startIndex, + int endIndex) { Optional entry; Optional connectionString; std::set usedParams; - for (int tokenNum = startIndex; tokenNum < tokens.size(); ++tokenNum) { + for (int tokenNum = startIndex; tokenNum < endIndex; ++tokenNum) { StringRef token = tokens[tokenNum]; bool foundEquals; StringRef param = token.eat("=", &foundEquals); @@ -126,7 +129,7 @@ ACTOR Future metaclusterRegisterCommand(Reference db, std::vect } DataClusterEntry defaultEntry; - auto config = parseClusterConfiguration(tokens, defaultEntry, 3); + auto config = parseClusterConfiguration(tokens, defaultEntry, 3, tokens.size()); if (!config.present()) { return false; } else if (!config.get().first.present()) { @@ -158,25 +161,6 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector return true; } -Optional parseToken(StringRef token, const char* str) { - bool foundEquals; - StringRef param = 
token.eat("=", &foundEquals); - if (!foundEquals) { - fmt::print(stderr, - "ERROR: invalid configuration string `{}'. String must specify a value using `='.\n", - param.toString().c_str()); - return Optional(); - } - - if (!tokencmp(param, str)) { - fmt::print( - stderr, "ERROR: invalid configuration string `{}'. Expected: `{}'.\n", param.toString().c_str(), str); - return Optional(); - } - - return Optional(token.toString()); -} - // metacluster restore command ACTOR Future metaclusterRestoreCommand(Reference db, std::vector tokens) { if (tokens.size() < 4 || tokens.size() > 6) { @@ -187,17 +171,20 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto } // connection string - ClusterConnectionString connectionString; - auto optVal = parseToken(tokens[3], "connection_string"); - if (optVal.present()) { - connectionString = ClusterConnectionString(optVal.get()); + DataClusterEntry defaultEntry; + auto config = parseClusterConfiguration(tokens, defaultEntry, 3, 4); + if (!config.present()) { + return false; + } else if (!config.get().first.present()) { + fmt::print(stderr, "ERROR: connection_string must be configured when registering a cluster.\n"); + return false; } state bool restore_from_data_cluster = tokens.size() == 5; if (restore_from_data_cluster) { DataClusterEntry defaultEntry; wait(MetaclusterAPI::restoreCluster( - db, tokens[2], connectionString, defaultEntry, AddNewTenants::False, RemoveMissingTenants::True)); + db, tokens[2], config.get().first.get(), defaultEntry, AddNewTenants::False, RemoveMissingTenants::True)); fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); return true; @@ -226,7 +213,7 @@ ACTOR Future metaclusterConfigureCommand(Reference db, std::vec throw cluster_not_found(); } - auto config = parseClusterConfiguration(tokens, metadata.get().entry, 3); + auto config = parseClusterConfiguration(tokens, metadata.get().entry, 3, tokens.size()); if (!config.present()) { return false; } @@ -486,11 
+473,10 @@ std::vector metaclusterHintGenerator(std::vector const& } else { return {}; } - } else if (tokencmp(tokens[1], "restore") && tokens.size() < 4) { + } else if (tokencmp(tokens[1], "restore") && tokens.size() < 5) { static std::vector opts = { "", "connection_string= ", - "", - "" }; + "[repopulate_from_data_cluster]" }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokencmp(tokens[1], "configure")) { static std::vector opts = { From a57994b767b39043f55059d495a12dd8db8a0b99 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 28 Sep 2022 10:41:27 -0700 Subject: [PATCH 14/57] Fix merge issue --- .../MetaclusterManagementWorkload.actor.cpp | 186 ------------------ 1 file changed, 186 deletions(-) diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 089aa3e621..4fc3b4e4d1 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -446,9 +446,6 @@ struct MetaclusterManagementWorkload : TestWorkload { tenantMapEntry.tenantName = tenant; tenantMapEntry.tenantGroup = tenantGroup; - state TenantMapEntry tenantMapEntry; - tenantMapEntry.tenantGroup = tenantGroup; - try { loop { try { @@ -814,189 +811,6 @@ struct MetaclusterManagementWorkload : TestWorkload { return Void(); } - ACTOR static Future configureTenant(MetaclusterManagementWorkload* self) { - state TenantName tenant = self->chooseTenantName(); - state Optional newTenantGroup = self->chooseTenantGroup(); - - auto itr = self->createdTenants.find(tenant); - state bool exists = itr != self->createdTenants.end(); - state bool tenantGroupExists = - newTenantGroup.present() && self->tenantGroups.find(newTenantGroup.get()) != self->tenantGroups.end(); - - state bool hasCapacity = false; - if (exists) { - auto& dataDb = self->dataDbs[itr->second.cluster]; - hasCapacity = dataDb.ungroupedTenants.size() + 
dataDb.tenantGroups.size() < dataDb.tenantGroupCapacity; - } - - state std::map, Optional> configurationParameters = { { "tenant_group"_sr, - newTenantGroup } }; - - try { - loop { - Future configureFuture = - MetaclusterAPI::configureTenant(self->managementDb, tenant, configurationParameters); - Optional result = wait(timeout(configureFuture, deterministicRandom()->randomInt(1, 30))); - - if (result.present()) { - break; - } - } - - ASSERT(exists); - auto tenantData = self->createdTenants.find(tenant); - ASSERT(tenantData != self->createdTenants.end()); - - auto& dataDb = self->dataDbs[tenantData->second.cluster]; - ASSERT(dataDb.registered); - - bool allocationRemoved = false; - bool allocationAdded = false; - if (tenantData->second.tenantGroup != newTenantGroup) { - if (tenantData->second.tenantGroup.present()) { - auto& tenantGroupData = self->tenantGroups[tenantData->second.tenantGroup.get()]; - tenantGroupData.tenants.erase(tenant); - if (tenantGroupData.tenants.empty()) { - allocationRemoved = true; - self->tenantGroups.erase(tenantData->second.tenantGroup.get()); - dataDb.tenantGroups.erase(tenantData->second.tenantGroup.get()); - } - } else { - allocationRemoved = true; - self->ungroupedTenants.erase(tenant); - dataDb.ungroupedTenants.erase(tenant); - } - - if (newTenantGroup.present()) { - auto [tenantGroupData, inserted] = self->tenantGroups.try_emplace( - newTenantGroup.get(), TenantGroupData(tenantData->second.cluster)); - tenantGroupData->second.tenants.insert(tenant); - if (inserted) { - allocationAdded = true; - dataDb.tenantGroups.insert(newTenantGroup.get()); - } - } else { - allocationAdded = true; - self->ungroupedTenants.insert(tenant); - dataDb.ungroupedTenants.insert(tenant); - } - - tenantData->second.tenantGroup = newTenantGroup; - - if (allocationAdded && !allocationRemoved) { - ASSERT(hasCapacity); - } else if (allocationRemoved && !allocationAdded && - dataDb.ungroupedTenants.size() + dataDb.tenantGroups.size() >= 
dataDb.tenantGroupCapacity) { - --self->totalTenantGroupCapacity; - } - } - - return Void(); - } catch (Error& e) { - if (e.code() == error_code_tenant_not_found) { - ASSERT(!exists); - return Void(); - } else if (e.code() == error_code_cluster_no_capacity) { - ASSERT(exists && !hasCapacity); - return Void(); - } else if (e.code() == error_code_invalid_tenant_configuration) { - ASSERT(exists && tenantGroupExists && - self->createdTenants[tenant].cluster != self->tenantGroups[newTenantGroup.get()].cluster); - return Void(); - } - - TraceEvent(SevError, "ConfigureTenantFailure") - .error(e) - .detail("TenantName", tenant) - .detail("TenantGroup", newTenantGroup); - ASSERT(false); - throw internal_error(); - } - } - - ACTOR static Future renameTenant(MetaclusterManagementWorkload* self) { - state TenantName tenant = self->chooseTenantName(); - state TenantName newTenantName = self->chooseTenantName(); - - auto itr = self->createdTenants.find(tenant); - state bool exists = itr != self->createdTenants.end(); - - itr = self->createdTenants.find(newTenantName); - state bool newTenantExists = itr != self->createdTenants.end(); - - try { - state bool retried = false; - loop { - try { - Future renameFuture = MetaclusterAPI::renameTenant(self->managementDb, tenant, newTenantName); - Optional result = wait(timeout(renameFuture, deterministicRandom()->randomInt(1, 30))); - - if (result.present()) { - break; - } - - retried = true; - } catch (Error& e) { - // If we retry the rename after it had succeeded, we will get an error that we should ignore - if (e.code() == error_code_tenant_not_found && exists && !newTenantExists && retried) { - break; - } - throw e; - } - } - - ASSERT(exists); - ASSERT(!newTenantExists); - - Optional oldEntry = wait(MetaclusterAPI::tryGetTenant(self->managementDb, tenant)); - ASSERT(!oldEntry.present()); - - TenantMapEntry newEntry = wait(MetaclusterAPI::getTenant(self->managementDb, newTenantName)); - - auto tenantData = 
self->createdTenants.find(tenant); - ASSERT(tenantData != self->createdTenants.end()); - ASSERT(tenantData->second.tenantGroup == newEntry.tenantGroup); - ASSERT(newEntry.assignedCluster.present() && tenantData->second.cluster == newEntry.assignedCluster.get()); - - self->createdTenants[newTenantName] = tenantData->second; - self->createdTenants.erase(tenantData); - - auto& dataDb = self->dataDbs[tenantData->second.cluster]; - ASSERT(dataDb.registered); - - dataDb.tenants.erase(tenant); - dataDb.tenants.insert(newTenantName); - - if (tenantData->second.tenantGroup.present()) { - auto& tenantGroup = self->tenantGroups[tenantData->second.tenantGroup.get()]; - tenantGroup.tenants.erase(tenant); - tenantGroup.tenants.insert(newTenantName); - } else { - dataDb.ungroupedTenants.erase(tenant); - dataDb.ungroupedTenants.insert(newTenantName); - self->ungroupedTenants.erase(tenant); - self->ungroupedTenants.insert(newTenantName); - } - - return Void(); - } catch (Error& e) { - if (e.code() == error_code_tenant_not_found) { - ASSERT(!exists); - return Void(); - } else if (e.code() == error_code_tenant_already_exists) { - ASSERT(newTenantExists); - return Void(); - } - - TraceEvent(SevError, "RenameTenantFailure") - .error(e) - .detail("OldTenantName", tenant) - .detail("NewTenantName", newTenantName); - ASSERT(false); - throw internal_error(); - } - } - Future start(Database const& cx) override { if (clientId == 0) { return _start(cx, this); From 91f5347551411c1379c4cb5a0d789aba46def988 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 3 Oct 2022 10:57:54 -0700 Subject: [PATCH 15/57] Add some simulated testing to the metacluster restore process; fix various issues found by the testing. 
--- fdbcli/MetaclusterCommands.actor.cpp | 5 +- .../fdbclient/MetaclusterManagement.actor.h | 299 ++++++----- fdbserver/SimulatedCluster.actor.cpp | 2 +- .../workloads/MetaclusterConsistency.actor.h | 19 +- .../workloads/TenantConsistency.actor.h | 3 + .../MetaclusterManagementWorkload.actor.cpp | 71 ++- .../MetaclusterRestoreWorkload.actor.cpp | 473 ++++++++++++++++++ tests/CMakeLists.txt | 1 + tests/slow/MetaclusterRecovery.toml | 18 + 9 files changed, 740 insertions(+), 151 deletions(-) create mode 100644 fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp create mode 100644 tests/slow/MetaclusterRecovery.toml diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index aacf57459a..f544a34a75 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -155,7 +155,7 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector } state ClusterNameRef clusterName = tokens[tokens.size() - 1]; - wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4)); + wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4, true)); fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); return true; @@ -182,9 +182,8 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto state bool restore_from_data_cluster = tokens.size() == 5; if (restore_from_data_cluster) { - DataClusterEntry defaultEntry; wait(MetaclusterAPI::restoreCluster( - db, tokens[2], config.get().first.get(), defaultEntry, AddNewTenants::False, RemoveMissingTenants::True)); + db, tokens[2], config.get().first.get(), AddNewTenants::False, RemoveMissingTenants::True)); fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); return true; diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index ffff35c11a..4b93c80b0f 100644 --- 
a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -19,20 +19,17 @@ */ #pragma once -#include "fdbclient/CoordinationInterface.h" -#include "fdbclient/FDBOptions.g.h" -#include "fdbrpc/TenantName.h" -#include "flow/FastRef.h" -#include "flow/IRandom.h" -#include "flow/Platform.h" -#include "flow/ThreadHelper.actor.h" -#include #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H) #define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H #include "fdbclient/MetaclusterManagement.actor.g.h" #elif !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H) #define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H +#include + +#include "fdbclient/CoordinationInterface.h" +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/RunTransaction.actor.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/GenericTransactionHelper.h" #include "fdbclient/GenericManagementAPI.actor.h" @@ -42,7 +39,11 @@ #include "fdbclient/SystemData.h" #include "fdbclient/TenantManagement.actor.h" #include "fdbclient/VersionedMap.h" +#include "fdbrpc/TenantName.h" +#include "flow/FastRef.h" #include "flow/flat_buffers.h" +#include "flow/IRandom.h" +#include "flow/ThreadHelper.actor.h" #include "flow/actorcompiler.h" // has to be last include // This file provides the interfaces to manage metacluster metadata. 
@@ -566,7 +567,8 @@ void updateClusterMetadata(Transaction tr, if (updatedEntry.present()) { if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { throw cluster_removed(); - } else if (previousMetadata.entry.clusterState == DataClusterState::RESTORING) { + } else if (previousMetadata.entry.clusterState == DataClusterState::RESTORING && + (!updatedEntry.present() || updatedEntry.get().clusterState != DataClusterState::READY)) { throw cluster_restoring(); } ManagementClusterMetadata::dataClusters().set(tr, name, updatedEntry.get()); @@ -721,12 +723,17 @@ struct RemoveClusterImpl { // Initialization parameters bool forceRemove; + bool cleanupManagementClusterState; // Parameters set in markClusterRemoving Optional lastTenantId; - RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove) - : ctx(managementDb, clusterName), forceRemove(forceRemove) {} + RemoveClusterImpl(Reference managementDb, + ClusterName clusterName, + bool forceRemove, + bool cleanupManagementClusterState) + : ctx(managementDb, clusterName), forceRemove(forceRemove), + cleanupManagementClusterState(cleanupManagementClusterState) {} // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. 
ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { @@ -826,10 +833,10 @@ struct RemoveClusterImpl { return !tenantEntries.more; } - // Returns true if all tenant groups and the data cluster have been purged - ACTOR static Future purgeTenantGroupsAndDataCluster(RemoveClusterImpl* self, - Reference tr, - std::pair clusterTupleRange) { + // Returns true if all tenant groups have been purged + ACTOR static Future purgeTenantGroups(RemoveClusterImpl* self, + Reference tr, + std::pair clusterTupleRange) { ASSERT(self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING); // Get the list of tenant groups @@ -856,14 +863,41 @@ struct RemoveClusterImpl { keyAfter(tenantGroupEntries.results.rbegin()->getString(1)))); } - // Erase the data cluster record from the management cluster if processing our last batch - if (!tenantGroupEntries.more) { - ManagementClusterMetadata::dataClusters().erase(tr, self->ctx.clusterName.get()); - ManagementClusterMetadata::dataClusterConnectionRecords.erase(tr, self->ctx.clusterName.get()); - ManagementClusterMetadata::clusterTenantCount.erase(tr, self->ctx.clusterName.get()); + return !tenantGroupEntries.more; + } + + // Removes the data cluster entry from the management cluster + void removeDataClusterEntry(Reference tr) { + ManagementClusterMetadata::dataClusters().erase(tr, ctx.clusterName.get()); + ManagementClusterMetadata::dataClusterConnectionRecords.erase(tr, ctx.clusterName.get()); + ManagementClusterMetadata::clusterTenantCount.erase(tr, ctx.clusterName.get()); + } + + // Removes the next set of metadata from the management cluster; returns true when all specified + // metadata is removed + ACTOR static Future managementClusterPurgeSome(RemoveClusterImpl* self, + Reference tr, + std::pair clusterTupleRange, + bool* deleteTenants, + bool* deleteTenantGroups) { + if (deleteTenants) { + bool deletedAllTenants = wait(purgeTenants(self, tr, clusterTupleRange)); + if 
(!deletedAllTenants) { + return false; + } + *deleteTenants = false; } - return !tenantGroupEntries.more; + if (deleteTenantGroups) { + bool deletedAllTenantGroups = wait(purgeTenantGroups(self, tr, clusterTupleRange)); + if (!deletedAllTenantGroups) { + return false; + } + *deleteTenantGroups = false; + } + + self->removeDataClusterEntry(tr); + return true; } // Remove all metadata associated with the data cluster from the management cluster @@ -871,11 +905,16 @@ struct RemoveClusterImpl { state std::pair clusterTupleRange = std::make_pair( Tuple::makeTuple(self->ctx.clusterName.get()), Tuple::makeTuple(keyAfter(self->ctx.clusterName.get()))); - // First remove all tenants associated with the data cluster from the management cluster + state bool deleteTenants = self->cleanupManagementClusterState; + state bool deleteTenantGroups = self->cleanupManagementClusterState; + loop { bool clearedAll = wait(self->ctx.runManagementTransaction( - [self = self, clusterTupleRange = clusterTupleRange](Reference tr) { - return purgeTenants(self, tr, clusterTupleRange); + [self = self, + clusterTupleRange = clusterTupleRange, + deleteTenants = &deleteTenants, + deleteTenantGroups = &deleteTenantGroups](Reference tr) { + return managementClusterPurgeSome(self, tr, clusterTupleRange, deleteTenants, deleteTenantGroups); })); if (clearedAll) { @@ -883,17 +922,6 @@ struct RemoveClusterImpl { } } - // Next remove all tenant groups associated with the data cluster from the management cluster - loop { - bool clearedAll = wait(self->ctx.runManagementTransaction( - [self = self, clusterTupleRange = clusterTupleRange](Reference tr) { - return purgeTenantGroupsAndDataCluster(self, tr, clusterTupleRange); - })); - if (clearedAll) { - break; - } - } - TraceEvent("RemovedDataCluster").detail("Name", self->ctx.clusterName.get()); return Void(); } @@ -943,8 +971,8 @@ struct RemoveClusterImpl { }; ACTOR template -Future removeCluster(Reference db, ClusterName name, bool forceRemove) { - state 
RemoveClusterImpl impl(db, name, forceRemove); +Future removeCluster(Reference db, ClusterName name, bool forceRemove, bool cleanupManagementClusterState) { + state RemoveClusterImpl impl(db, name, forceRemove, cleanupManagementClusterState); wait(impl.run()); return Void(); } @@ -1080,11 +1108,10 @@ struct RestoreClusterImpl { // Initialization parameters ClusterName clusterName; ClusterConnectionString connectionString; - DataClusterEntry clusterEntry; AddNewTenants addNewTenants; RemoveMissingTenants removeMissingTenants; - // Tenant list from data and management clusters. + // Tenant list from data and management clusters std::unordered_map dataClusterTenantMap; std::unordered_set dataClusterTenantIdSet; std::unordered_map mgmtClusterTenantMap; @@ -1094,47 +1121,54 @@ struct RestoreClusterImpl { RestoreClusterImpl(Reference managementDb, ClusterName clusterName, ClusterConnectionString connectionString, - DataClusterEntry clusterEntry, AddNewTenants addNewTenants, RemoveMissingTenants removeMissingTenants) : ctx(managementDb, clusterName), clusterName(clusterName), connectionString(connectionString), - clusterEntry(clusterEntry), addNewTenants(addNewTenants), removeMissingTenants(removeMissingTenants) {} + addNewTenants(addNewTenants), removeMissingTenants(removeMissingTenants) {} - ACTOR static Future markClusterAsRestoring(RestoreClusterImpl* self, - Reference tr) { - if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { - DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; - updatedEntry.clusterState = DataClusterState::RESTORING; + // Check that the restored data cluster has a matching metacluster registration entry + ACTOR static Future verifyDataClusterMatch(RestoreClusterImpl* self) { + state Reference db = wait(openDatabase(self->connectionString)); + Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(db)); - updateClusterMetadata(tr, - 
self->ctx.clusterName.get(), - self->ctx.dataClusterMetadata.get(), - Optional(self->connectionString), - updatedEntry); + if (!metaclusterRegistration.present()) { + // TODO: different error + throw invalid_metacluster_operation(); } - TraceEvent("MarkedDataClusterRestoring").detail("Name", self->ctx.clusterName.get()); + if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { + throw cluster_already_exists(); + } - return true; + self->ctx.dataClusterDb = db; + + return Void(); } - ACTOR static Future markClusterAsReady(RestoreClusterImpl* self, Reference tr) { - if (self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { - DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; + void markClusterRestoring(Reference tr) { + if (ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { + DataClusterEntry updatedEntry = ctx.dataClusterMetadata.get().entry; + updatedEntry.clusterState = DataClusterState::RESTORING; + + updateClusterMetadata( + tr, ctx.clusterName.get(), ctx.dataClusterMetadata.get(), connectionString, updatedEntry); + } + + TraceEvent("MarkedDataClusterRestoring").detail("Name", ctx.clusterName.get()); + } + + void markClusterAsReady(Reference tr) { + if (ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { + DataClusterEntry updatedEntry = ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::READY; - updateClusterMetadata(tr, - self->ctx.clusterName.get(), - self->ctx.dataClusterMetadata.get(), - Optional(self->connectionString), - updatedEntry); + updateClusterMetadata(tr, ctx.clusterName.get(), ctx.dataClusterMetadata.get(), {}, updatedEntry); } TraceEvent("MarkedDataClusterReady") - .detail("Name", self->ctx.clusterName.get()) + .detail("Name", ctx.clusterName.get()) .detail("Version", tr->getCommittedVersion()); - - return Void(); } ACTOR static Future 
markManagementTenantAsError(Reference tr, @@ -1146,11 +1180,12 @@ struct RestoreClusterImpl { } tenantEntry.get().tenantState = TenantState::ERROR; + tenantEntry.get().error = "The tenant is missing after restoring its data cluster"; ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantName, tenantEntry.get()); return Void(); } - ACTOR static Future getTenantsFromDataCluster(RestoreClusterImpl* self, Reference tr) { + ACTOR static Future getTenantsFromDataCluster(RestoreClusterImpl* self, Reference tr) { TenantNameRef begin = ""_sr; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = @@ -1162,19 +1197,13 @@ struct RestoreClusterImpl { self->dataClusterTenantIdSet.emplace(t.second.id); } - return !tenants.more; + return Void(); } - ACTOR static Future getAllTenantsFromDataCluster(RestoreClusterImpl* self) { - bool gotAllTenants = wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - - return gotAllTenants; - } - - ACTOR static Future> getTenantsFromManagementCluster(RestoreClusterImpl* self, - Reference tr, - TenantName initialTenantName) { + ACTOR static Future> getTenantsFromManagementCluster( + RestoreClusterImpl* self, + Reference tr, + TenantName initialTenantName) { TenantNameRef begin = initialTenantName; TenantNameRef end = "\xff\xff"_sr; state Future>> tenantsFuture = @@ -1219,78 +1248,83 @@ struct RestoreClusterImpl { // This only supports the restore of an already registered data cluster, for now. 
ACTOR static Future run(RestoreClusterImpl* self) { - state bool clusterIsRestoring; + // Run a management transaction to populate the data cluster metadata + wait(self->ctx.runManagementTransaction( + [](Reference tr) { return Future(Void()); })); + + // Check that the data cluster being restored has the appropriate metacluster registration entry and name + wait(verifyDataClusterMatch(self)); + // set state to restoring try { - wait(store(clusterIsRestoring, - self->ctx.runManagementTransaction([self = self](Reference tr) { - return markClusterAsRestoring(self, tr); - }))); + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + self->markClusterRestoring(tr); + return Future(Void()); + })); } catch (Error& e) { // If the transaction retries after success or if we are trying a second time to restore the cluster, it // will throw an error indicating that the restore has already started - if (e.code() == error_code_cluster_restoring) { - clusterIsRestoring = true; - } else { + if (e.code() != error_code_cluster_restoring) { throw; } } - if (!clusterIsRestoring) { - // get all the tenant information from the newly registered data cluster - bool gotAllTenants = wait(getAllTenantsFromDataCluster(self)); - // get all the tenant information for this data cluster from manangement cluster - wait(getAllTenantsFromManagementCluster(self)); + // get all the tenant information from the newly registered data cluster + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); - while (itr != self->dataClusterTenantMap.end()) { - uint64_t tenantId = (itr->second).id; - TenantName tenantName = itr->first; + // get all the tenant information for this data cluster from manangement cluster + wait(getAllTenantsFromManagementCluster(self)); - auto tenantNameOnMgmtCluster = self->mgmtClusterTenantIdIndex.find(tenantId); - if 
(tenantNameOnMgmtCluster == self->mgmtClusterTenantIdIndex.end()) { - // Delete - if (self->removeMissingTenants) { - wait(self->ctx.runDataClusterTransaction([tenantName, tenantId](Reference tr) { - return TenantAPI::deleteTenantTransaction( - tr, tenantName, tenantId, ClusterType::METACLUSTER_DATA); - })); - } - } else if (tenantName.compare(self->mgmtClusterTenantIdIndex[tenantId]) != 0) { - // Rename - TenantName tenantNewName = self->mgmtClusterTenantIdIndex[tenantId]; - wait(self->ctx.runDataClusterTransaction( - [tenantName, tenantNewName, tenantId](Reference tr) { - return TenantAPI::renameTenantTransaction( - tr, tenantName, tenantNewName, tenantId, ClusterType::METACLUSTER_DATA); - })); - } + state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); + while (itr != self->dataClusterTenantMap.end()) { + uint64_t tenantId = (itr->second).id; + TenantName tenantName = itr->first; - ++itr; - } - - state std::unordered_set::iterator setItr = - self->mgmtClusterTenantSetForCurrentDataCluster.begin(); - while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { - TenantName tenantName = *setItr; - uint64_t tenantId = self->mgmtClusterTenantMap[tenantName].id; - - if (self->dataClusterTenantIdSet.find(tenantId) == self->dataClusterTenantIdSet.end()) { - // Set Tenant in ERROR state - wait(self->ctx.runManagementTransaction([tenantName](Reference tr) { - return markManagementTenantAsError(tr, tenantName); + auto tenantNameOnMgmtCluster = self->mgmtClusterTenantIdIndex.find(tenantId); + if (tenantNameOnMgmtCluster == self->mgmtClusterTenantIdIndex.end()) { + // Delete + if (self->removeMissingTenants) { + wait(self->ctx.runDataClusterTransaction([tenantName, tenantId](Reference tr) { + return TenantAPI::deleteTenantTransaction( + tr, tenantName, tenantId, ClusterType::METACLUSTER_DATA); })); } - ++setItr; + } else if (tenantName.compare(self->mgmtClusterTenantIdIndex[tenantId]) != 0) { + // Rename + TenantName tenantNewName = 
self->mgmtClusterTenantIdIndex[tenantId]; + wait(self->ctx.runDataClusterTransaction( + [tenantName, tenantNewName, tenantId](Reference tr) { + return TenantAPI::renameTenantTransaction( + tr, tenantName, tenantNewName, tenantId, ClusterType::METACLUSTER_DATA); + })); } - // set restored cluster to ready state - try { - wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return markClusterAsReady(self, tr); })); - } catch (Error& e) { - throw; + ++itr; + } + + state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); + while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { + TenantName tenantName = *setItr; + uint64_t tenantId = self->mgmtClusterTenantMap[tenantName].id; + + if (self->dataClusterTenantIdSet.find(tenantId) == self->dataClusterTenantIdSet.end()) { + // Set Tenant in ERROR state + wait(self->ctx.runManagementTransaction([tenantName](Reference tr) { + return markManagementTenantAsError(tr, tenantName); + })); } + ++setItr; + } + + // set restored cluster to ready state + try { + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + self->markClusterAsReady(tr); + return Future(Void()); + })); + } catch (Error& e) { + throw; } return Void(); @@ -1303,10 +1337,9 @@ ACTOR template Future restoreCluster(Reference db, ClusterName name, ClusterConnectionString connectionString, - DataClusterEntry entry, AddNewTenants addNewTenants, RemoveMissingTenants removeMissingTenants) { - state RestoreClusterImpl impl(db, name, connectionString, entry, addNewTenants, removeMissingTenants); + state RestoreClusterImpl impl(db, name, connectionString, addNewTenants, removeMissingTenants); wait(impl.run()); return Void(); } diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 8736471f1d..b96d95e740 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2519,7 +2519,7 @@ void 
setupSimulatedSystem(std::vector>* systemActors, baseFolder, false, machine == useSeedForMachine, - AgentNone, + AgentAddition, sslOnly, whitelistBinPaths, protocolVersion, diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h index f7dcfd964d..d03fbf782e 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h @@ -288,7 +288,7 @@ private: auto& expectedTenants = self->managementMetadata.clusterTenantMap[clusterName]; - std::map groupExpectedTenantCounts; + std::set tenantGroupsWithCompletedTenants; if (!self->allowPartialMetaclusterOperations) { ASSERT_EQ(dataClusterTenantMap.size(), expectedTenants.size()); } else { @@ -297,12 +297,13 @@ private: TenantMapEntry const& metaclusterEntry = self->managementMetadata.tenantMap[tenantName]; if (!dataClusterTenantMap.count(tenantName)) { if (metaclusterEntry.tenantGroup.present()) { - groupExpectedTenantCounts.try_emplace(metaclusterEntry.tenantGroup.get(), 0); + tenantGroupsWithCompletedTenants.insert(metaclusterEntry.tenantGroup.get()); } ASSERT(metaclusterEntry.tenantState == TenantState::REGISTERING || - metaclusterEntry.tenantState == TenantState::REMOVING); + metaclusterEntry.tenantState == TenantState::REMOVING || + metaclusterEntry.tenantState == TenantState::ERROR); } else if (metaclusterEntry.tenantGroup.present()) { - ++groupExpectedTenantCounts[metaclusterEntry.tenantGroup.get()]; + tenantGroupsWithCompletedTenants.insert(metaclusterEntry.tenantGroup.get()); } } } @@ -337,15 +338,19 @@ private: ASSERT_LE(dataClusterTenantGroupMap.size(), expectedTenantGroups.size()); for (auto const& name : expectedTenantGroups) { if (!dataClusterTenantGroupMap.count(name)) { - auto itr = groupExpectedTenantCounts.find(name); - ASSERT(itr != groupExpectedTenantCounts.end()); - ASSERT_EQ(itr->second, 0); + auto itr = 
tenantGroupsWithCompletedTenants.find(name); + ASSERT(itr == tenantGroupsWithCompletedTenants.end()); } } } for (auto const& [name, entry] : dataClusterTenantGroupMap) { ASSERT(expectedTenantGroups.count(name)); ASSERT(!entry.assignedCluster.present()); + expectedTenantGroups.erase(name); + } + + for (auto name : expectedTenantGroups) { + ASSERT(tenantGroupsWithCompletedTenants.count(name) == 0); } return Void(); diff --git a/fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h b/fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h index e1b838f239..c8414ac387 100644 --- a/fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h @@ -184,6 +184,9 @@ private: ASSERT(!tenantMapEntry.assignedCluster.present()); ASSERT(!tenantMapEntry.renameDestination.present()); } + + // An error string should be set if and only if the tenant state is an error + ASSERT((tenantMapEntry.tenantState == TenantState::ERROR) != tenantMapEntry.error.empty()); } ASSERT_EQ(metadata.tenantMap.size() + renameCount, metadata.tenantNameIndex.size()); diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 4fc3b4e4d1..f2cbf3830b 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -48,6 +48,7 @@ struct MetaclusterManagementWorkload : TestWorkload { struct DataClusterData { Database db; bool registered = false; + bool detached = false; int tenantGroupCapacity = 0; std::set tenants; @@ -191,6 +192,9 @@ struct MetaclusterManagementWorkload : TestWorkload { if (e.code() == error_code_cluster_already_exists) { ASSERT(dataDb->registered); return Void(); + } else if (e.code() == error_code_cluster_not_empty) { + ASSERT(dataDb->registered && dataDb->detached && !dataDb->tenants.empty()); + return Void(); } TraceEvent(SevError, 
"RegisterClusterFailure").error(e).detail("ClusterName", clusterName); @@ -204,11 +208,13 @@ struct MetaclusterManagementWorkload : TestWorkload { state ClusterName clusterName = self->chooseClusterName(); state DataClusterData* dataDb = &self->dataDbs[clusterName]; state bool retried = false; + state bool detachCluster = deterministicRandom()->coinflip(); try { loop { // TODO: check force removal - Future removeFuture = MetaclusterAPI::removeCluster(self->managementDb, clusterName, false); + Future removeFuture = + MetaclusterAPI::removeCluster(self->managementDb, clusterName, detachCluster, !detachCluster); try { Optional result = wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { @@ -230,11 +236,15 @@ struct MetaclusterManagementWorkload : TestWorkload { } ASSERT(dataDb->registered); - ASSERT(dataDb->tenants.empty()); + ASSERT(detachCluster || dataDb->tenants.empty()); self->totalTenantGroupCapacity -= dataDb->tenantGroupCapacity; - dataDb->tenantGroupCapacity = 0; - dataDb->registered = false; + if (!detachCluster) { + dataDb->tenantGroupCapacity = 0; + dataDb->registered = false; + } else { + dataDb->detached = true; + } // Get a version to know that the cluster has recovered wait(success(runTransaction(dataDb->db.getReference(), @@ -255,6 +265,49 @@ struct MetaclusterManagementWorkload : TestWorkload { return Void(); } + ACTOR static Future restoreCluster(MetaclusterManagementWorkload* self) { + state ClusterName clusterName = self->chooseClusterName(); + state DataClusterData* dataDb = &self->dataDbs[clusterName]; + state bool retried = false; + + try { + loop { + Future restoreFuture = + MetaclusterAPI::restoreCluster(self->managementDb, + clusterName, + dataDb->db->getConnectionRecord()->getConnectionString(), + AddNewTenants::False, + RemoveMissingTenants::True); + try { + Optional result = wait(timeout(restoreFuture, deterministicRandom()->randomInt(1, 30))); + if (result.present()) { + break; + } else { + 
retried = true; + } + } catch (Error& e) { + if (false) { + break; + } else { + throw; + } + } + } + + ASSERT(dataDb->registered); + dataDb->detached = false; + } catch (Error& e) { + if (e.code() == error_code_cluster_not_found) { + ASSERT(!dataDb->registered); + return Void(); + } + TraceEvent(SevError, "RestoreClusterFailure").error(e).detail("ClusterName", clusterName); + ASSERT(false); + } + + return Void(); + } + ACTOR static Future listClusters(MetaclusterManagementWorkload* self) { state ClusterName clusterName1 = self->chooseClusterName(); state ClusterName clusterName2 = self->chooseClusterName(); @@ -823,7 +876,7 @@ struct MetaclusterManagementWorkload : TestWorkload { // Run a random sequence of operations for the duration of the test while (now() < start + self->testDuration) { - state int operation = deterministicRandom()->randomInt(0, 9); + state int operation = deterministicRandom()->randomInt(0, 10); if (operation == 0) { wait(registerCluster(self)); } else if (operation == 1) { @@ -842,6 +895,8 @@ struct MetaclusterManagementWorkload : TestWorkload { wait(configureTenant(self)); } else if (operation == 8) { wait(renameTenant(self)); + } else if (operation == 9) { + wait(restoreCluster(self)); } } @@ -880,7 +935,9 @@ struct MetaclusterManagementWorkload : TestWorkload { ASSERT(tenants.size() == clusterData.tenants.size()); for (auto [tenantName, tenantEntry] : tenants) { ASSERT(clusterData.tenants.count(tenantName)); - ASSERT(self->createdTenants[tenantName].cluster == clusterName); + auto tenantData = self->createdTenants[tenantName]; + ASSERT(tenantData.cluster == clusterName); + ASSERT(tenantData.tenantGroup == tenantEntry.tenantGroup); } return Void(); @@ -909,7 +966,7 @@ struct MetaclusterManagementWorkload : TestWorkload { std::vector> removeClusterFutures; for (auto [clusterName, clusterMetadata] : dataClusters) { removeClusterFutures.push_back( - MetaclusterAPI::removeCluster(self->managementDb, clusterName, !deleteTenants)); + 
MetaclusterAPI::removeCluster(self->managementDb, clusterName, !deleteTenants, true)); } wait(waitForAll(removeClusterFutures)); diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp new file mode 100644 index 0000000000..dc03679c7c --- /dev/null +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -0,0 +1,473 @@ +/* + * MetaclusterRestoreWorkload.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "fdbclient/BackupAgent.actor.h" +#include "fdbclient/ClusterConnectionMemoryRecord.h" +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/Metacluster.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/ReadYourWrites.h" +#include "fdbclient/RunTransaction.actor.h" +#include "fdbclient/ThreadSafeTransaction.h" +#include "fdbrpc/simulator.h" +#include "fdbserver/workloads/MetaclusterConsistency.actor.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/Knobs.h" +#include "flow/Error.h" +#include "flow/IRandom.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/flow.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +struct MetaclusterRestoreWorkload : TestWorkload { + + struct DataClusterData { + Database db; + std::set tenants; + bool restored = false; + + DataClusterData() {} + DataClusterData(Database db) : db(db) {} + }; + + struct TenantData { + ClusterName cluster; + Optional tenantGroup; + bool beforeBackup = true; + + TenantData() {} + TenantData(ClusterName cluster, Optional tenantGroup, bool beforeBackup) + : cluster(cluster), tenantGroup(tenantGroup), beforeBackup(beforeBackup) {} + }; + + Reference managementDb; + std::map dataDbs; + std::vector dataDbIndex; + + std::map createdTenants; + + int initialTenants; + int maxTenants; + int maxTenantGroups; + int tenantGroupCapacity; + + MetaclusterRestoreWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + maxTenants = std::min(1e8 - 1, getOption(options, "maxTenants"_sr, 1000)); + initialTenants = std::min(maxTenants, getOption(options, "initialTenants"_sr, 100)); + maxTenantGroups = std::min(2 * maxTenants, getOption(options, "maxTenantGroups"_sr, 20)); + + tenantGroupCapacity = (initialTenants / 2 + maxTenantGroups - 1) / g_simulator->extraDatabases.size(); + } + + std::string description() const override { return "MetaclusterRestore"; } + + void disableFailureInjectionWorkloads(std::set& out) const override { + out.insert("MachineAttritionWorkload"); + } + + ClusterName chooseClusterName() { return dataDbIndex[deterministicRandom()->randomInt(0, dataDbIndex.size())]; } + + TenantName chooseTenantName() { + TenantName tenant(format("tenant%08d", deterministicRandom()->randomInt(0, maxTenants))); + return tenant; + } + + Optional chooseTenantGroup() { + Optional tenantGroup; + if (deterministicRandom()->coinflip()) { + tenantGroup = + TenantGroupNameRef(format("tenantgroup%08d", deterministicRandom()->randomInt(0, maxTenantGroups))); + } + + return tenantGroup; + } + + // Used to gradually increase capacity so that the tenants are somewhat evenly distributed across the clusters + ACTOR static Future 
increaseMetaclusterCapacity(MetaclusterRestoreWorkload* self) { + self->tenantGroupCapacity = ceil(self->tenantGroupCapacity * 1.2); + state Reference tr = self->managementDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state int dbIndex; + for (dbIndex = 0; dbIndex < self->dataDbIndex.size(); ++dbIndex) { + DataClusterMetadata clusterMetadata = + wait(MetaclusterAPI::getClusterTransaction(tr, self->dataDbIndex[dbIndex])); + DataClusterEntry updatedEntry = clusterMetadata.entry; + updatedEntry.capacity.numTenantGroups = self->tenantGroupCapacity; + MetaclusterAPI::updateClusterMetadata( + tr, self->dataDbIndex[dbIndex], clusterMetadata, {}, updatedEntry); + } + wait(safeThreadFutureToFuture(tr->commit())); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + return Void(); + } + + Future setup(Database const& cx) override { + if (clientId == 0) { + return _setup(cx, this); + } else { + return Void(); + } + } + ACTOR static Future _setup(Database cx, MetaclusterRestoreWorkload* self) { + fmt::print("Setup start\n"); + Reference threadSafeHandle = + wait(unsafeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(cx))); + fmt::print("Create thread safe handle\n"); + + MultiVersionApi::api->selectApiVersion(cx->apiVersion.version()); + self->managementDb = MultiVersionDatabase::debugCreateFromExistingDatabase(threadSafeHandle); + wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr))); + fmt::print("Create metacluster\n"); + + ASSERT(g_simulator->extraDatabases.size() > 0); + state std::vector::iterator extraDatabasesItr; + for (extraDatabasesItr = g_simulator->extraDatabases.begin(); + extraDatabasesItr != g_simulator->extraDatabases.end(); + ++extraDatabasesItr) { + ClusterConnectionString ccs(*extraDatabasesItr); + auto extraFile = makeReference(ccs); + state ClusterName clusterName = ClusterName(format("cluster_%08d", 
self->dataDbs.size())); + Database db = Database::createDatabase(extraFile, ApiVersion::LATEST_VERSION); + self->dataDbIndex.push_back(clusterName); + self->dataDbs[clusterName] = DataClusterData(db); + + DataClusterEntry clusterEntry; + clusterEntry.capacity.numTenantGroups = self->tenantGroupCapacity; + + wait(MetaclusterAPI::registerCluster(self->managementDb, clusterName, ccs, clusterEntry)); + fmt::print("Register cluster {}\n", printable(clusterName)); + } + + while (self->createdTenants.size() < self->initialTenants) { + wait(createTenant(self, true)); + } + + fmt::print("Setup complete\n"); + return Void(); + } + + ACTOR static Future backupCluster(ClusterName clusterName, + Database dataDb, + MetaclusterRestoreWorkload* self) { + state FileBackupAgent backupAgent; + state Standalone backupContainer = "file://simfdb/backups/"_sr.withSuffix(clusterName); + state Standalone> backupRanges; + + addDefaultBackupRanges(backupRanges); + + fmt::print("Backup cluster start {}\n", printable(clusterName)); + + try { + wait(backupAgent.submitBackup( + dataDb, backupContainer, {}, 0, 0, clusterName.toString(), backupRanges, StopWhenDone::True)); + } catch (Error& e) { + if (e.code() != error_code_backup_unneeded && e.code() != error_code_backup_duplicate) + throw; + } + + fmt::print("Backup submitted {}\n", printable(clusterName)); + + state Reference container; + wait(success(backupAgent.waitBackup(dataDb, clusterName.toString(), StopWhenDone::True, &container))); + fmt::print("Backup completed {} {}\n", printable(clusterName), container->getURL()); + return container->getURL(); + } + + ACTOR static Future restoreCluster(ClusterName clusterName, + Database dataDb, + std::string backupUrl, + MetaclusterRestoreWorkload* self) { + state FileBackupAgent backupAgent; + state Standalone> backupRanges; + addDefaultBackupRanges(backupRanges); + + fmt::print("Restore cluster start {}\n", printable(clusterName)); + + wait(runTransaction(dataDb.getReference(), + [backupRanges = 
backupRanges](Reference tr) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + for (auto range : backupRanges) { + tr->clear(range); + } + return Future(Void()); + })); + + fmt::print("Restore cleared data {}\n", printable(clusterName)); + + wait(success(backupAgent.restore(dataDb, dataDb, clusterName, StringRef(backupUrl), {}, backupRanges))); + + fmt::print("Restore cluster {}\n", printable(clusterName)); + + wait(MetaclusterAPI::restoreCluster(self->managementDb, + clusterName, + dataDb->getConnectionRecord()->getConnectionString(), + AddNewTenants::False, + RemoveMissingTenants::True)); + + self->dataDbs[clusterName].restored = true; + + fmt::print("Restore added back to metacluster {}\n", printable(clusterName)); + + return Void(); + } + + ACTOR static Future createTenant(MetaclusterRestoreWorkload* self, bool beforeBackup) { + state TenantName tenantName; + for (int i = 0; i < 10; ++i) { + tenantName = self->chooseTenantName(); + if (self->createdTenants.count(tenantName) == 0) { + break; + } + } + + if (self->createdTenants.count(tenantName)) { + return Void(); + } + + fmt::print("Create tenant {}\n", printable(tenantName)); + + loop { + try { + TenantMapEntry tenantEntry; + tenantEntry.tenantGroup = self->chooseTenantGroup(); + wait(MetaclusterAPI::createTenant(self->managementDb, tenantName, tenantEntry)); + fmt::print("Created tenant {}\n", printable(tenantName)); + TenantMapEntry createdEntry = wait(MetaclusterAPI::getTenant(self->managementDb, tenantName)); + self->createdTenants[tenantName] = + TenantData(createdEntry.assignedCluster.get(), createdEntry.tenantGroup, beforeBackup); + self->dataDbs[createdEntry.assignedCluster.get()].tenants.insert(tenantName); + return Void(); + } catch (Error& e) { + fmt::print("Tenant create error {} {}\n", e.what(), printable(tenantName)); + if (e.code() != error_code_metacluster_no_capacity) { + throw; + } + + wait(increaseMetaclusterCapacity(self)); + fmt::print("Increased metacluster capacity {}\n", 
printable(tenantName)); + } + } + } + + ACTOR static Future deleteTenant(MetaclusterRestoreWorkload* self) { + state TenantName tenantName; + for (int i = 0; i < 10; ++i) { + tenantName = self->chooseTenantName(); + if (self->createdTenants.count(tenantName) != 0) { + break; + } + } + + if (self->createdTenants.count(tenantName) == 0) { + return Void(); + } + + fmt::print("Delete tenant {}\n", printable(tenantName)); + + loop { + try { + wait(MetaclusterAPI::deleteTenant(self->managementDb, tenantName)); + auto const& tenantData = self->createdTenants[tenantName]; + self->dataDbs[tenantData.cluster].tenants.erase(tenantName); + self->createdTenants.erase(tenantName); + return Void(); + } catch (Error& e) { + fmt::print("Tenant create error {} {}\n", e.what(), printable(tenantName)); + if (e.code() != error_code_metacluster_no_capacity) { + throw; + } + + wait(increaseMetaclusterCapacity(self)); + fmt::print("Increased metacluster capacity {}\n", printable(tenantName)); + } + } + } + + Future start(Database const& cx) override { + if (clientId == 0) { + return _start(cx, this); + } else { + return Void(); + } + } + ACTOR static Future _start(Database cx, MetaclusterRestoreWorkload* self) { + state std::set clustersToRestore; + + for (auto db : self->dataDbIndex) { + if (deterministicRandom()->random01() < 0.1) { + clustersToRestore.insert(db); + } + } + + if (clustersToRestore.empty()) { + clustersToRestore.insert(deterministicRandom()->randomChoice(self->dataDbIndex)); + } + + // TODO: partially completed operations before backup + + state std::map> backups; + for (auto cluster : clustersToRestore) { + backups[cluster] = backupCluster(cluster, self->dataDbs[cluster].db, self); + } + + for (auto [_, f] : backups) { + wait(success(f)); + } + + // Make random tenant mutations + state int tenantMutationNum; + for (tenantMutationNum = 0; tenantMutationNum < 100; ++tenantMutationNum) { + state int operation = deterministicRandom()->randomInt(0, 2); + if (operation == 0) 
{ + wait(createTenant(self, false)); + } else if (operation == 1) { + wait(deleteTenant(self)); + /*} else if (operation == 2) { + wait(configureTenant(self)); + } else if (operation == 3) { + wait(renameTenant(self));*/ + } + } + + std::vector> restores; + for (auto [cluster, backupUrl] : backups) { + restores.push_back(restoreCluster(cluster, self->dataDbs[cluster].db, backupUrl.get(), self)); + } + + wait(waitForAll(restores)); + + return Void(); + } + + // Checks that the data cluster state matches our local state + ACTOR static Future checkDataCluster(MetaclusterRestoreWorkload* self, + ClusterName clusterName, + DataClusterData clusterData) { + state Optional metaclusterRegistration; + state std::vector> tenants; + state Reference tr = clusterData.db->createTransaction(); + + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + wait( + store(metaclusterRegistration, + MetaclusterMetadata::metaclusterRegistration().get(clusterData.db.getReference())) && + store(tenants, + TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, clusterData.tenants.size() + 1))); + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + + ASSERT(metaclusterRegistration.present() && + metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA); + + if (!clusterData.restored) { + ASSERT(tenants.size() == clusterData.tenants.size()); + for (auto [tenantName, tenantEntry] : tenants) { + ASSERT(clusterData.tenants.count(tenantName)); + auto tenantData = self->createdTenants[tenantName]; + ASSERT(tenantData.cluster == clusterName); + ASSERT(tenantData.tenantGroup == tenantEntry.tenantGroup); + } + } else { + int expectedTenantCount = 0; + std::map tenantMap(tenants.begin(), tenants.end()); + for (auto tenantName : clusterData.tenants) { + TenantData tenantData = self->createdTenants[tenantName]; + if (tenantData.beforeBackup) { + fmt::print("Expected tenant: {}\n", printable(tenantName)); + 
++expectedTenantCount; + auto tenantItr = tenantMap.find(tenantName); + ASSERT(tenantItr != tenantMap.end()); + ASSERT(tenantData.cluster == clusterName); + ASSERT(tenantItr->second.tenantGroup == tenantData.tenantGroup); + } else { + ASSERT(tenantMap.count(tenantName) == 0); + } + } + + fmt::print("Size check: {} {}\n", tenants.size(), expectedTenantCount); + for (auto tenant : tenants) { + fmt::print("Has tenant {}, {}\n", + printable(tenant.first), + clusterData.tenants.find(tenant.first) != clusterData.tenants.end()); + } + ASSERT(tenants.size() == expectedTenantCount); + } + + return Void(); + } + + ACTOR static Future checkTenants(MetaclusterRestoreWorkload* self) { + state std::vector> tenants = wait(MetaclusterAPI::listTenants( + self->managementDb, ""_sr, "\xff\xff"_sr, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); + ASSERT(tenants.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + + std::map tenantMap(tenants.begin(), tenants.end()); + for (auto& [tenantName, tenantData] : self->createdTenants) { + TenantMapEntry const& entry = tenantMap[tenantName]; + if (!tenantData.beforeBackup && self->dataDbs[tenantData.cluster].restored) { + ASSERT(entry.tenantState == TenantState::ERROR); + } else { + ASSERT(entry.tenantState == TenantState::READY); + } + } + return Void(); + } + + Future check(Database const& cx) override { + if (clientId == 0) { + return _check(this); + } else { + return true; + } + } + ACTOR static Future _check(MetaclusterRestoreWorkload* self) { + // The metacluster consistency check runs the tenant consistency check for each cluster + state MetaclusterConsistencyCheck metaclusterConsistencyCheck( + self->managementDb, AllowPartialMetaclusterOperations::False); + + wait(metaclusterConsistencyCheck.run()); + + std::vector> dataClusterChecks; + for (auto [clusterName, dataClusterData] : self->dataDbs) { + dataClusterChecks.push_back(checkDataCluster(self, clusterName, dataClusterData)); + } + wait(waitForAll(dataClusterChecks)); + 
wait(checkTenants(self)); + return true; + } + + void getMetrics(std::vector& m) override {} +}; + +WorkloadFactory MetaclusterRestoreWorkloadFactory("MetaclusterRestore"); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 725af89c18..a6c375ed6a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -383,6 +383,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES slow/LongRunning.toml LONG_RUNNING) add_fdb_test(TEST_FILES slow/LowLatencyWithFailures.toml) add_fdb_test(TEST_FILES slow/MetaclusterManagement.toml) + add_fdb_test(TEST_FILES slow/MetaclusterRecovery.toml) add_fdb_test(TEST_FILES slow/MoveKeysClean.toml) add_fdb_test(TEST_FILES slow/MoveKeysSideband.toml) add_fdb_test(TEST_FILES slow/RyowCorrectness.toml) diff --git a/tests/slow/MetaclusterRecovery.toml b/tests/slow/MetaclusterRecovery.toml new file mode 100644 index 0000000000..e41bc1bc1d --- /dev/null +++ b/tests/slow/MetaclusterRecovery.toml @@ -0,0 +1,18 @@ +[configuration] +allowDefaultTenant = false +allowDisablingTenants = false +allowCreatingTenants = false +extraDatabaseMode = 'Multiple' +extraDatabaseCount = 5 + +[[test]] +testTitle = 'MetaclusterRestoreTest' +clearAfterTest = true +timeout = 2100 +runSetup = true +simBackupAgents = 'BackupToFile' + + [[test.workload]] + testName = 'MetaclusterRestore' + maxTenants = 1000 + maxTenantGroups = 20 From af55feb493ab0b46cf2b12de1780af353e4c442c Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Mon, 3 Oct 2022 14:18:26 -0700 Subject: [PATCH 16/57] Fix some test issues --- fdbcli/MetaclusterCommands.actor.cpp | 2 +- .../fdbclient/MetaclusterManagement.actor.h | 17 ++++++----------- fdbserver/SimulatedCluster.actor.cpp | 4 +++- .../MetaclusterManagementWorkload.actor.cpp | 10 ++++++---- tests/slow/MetaclusterRecovery.toml | 1 + 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index f544a34a75..9de5dc8fc2 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -155,7 +155,7 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector } state ClusterNameRef clusterName = tokens[tokens.size() - 1]; - wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4, true)); + wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4)); fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); return true; diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 4b93c80b0f..5c4600d14b 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -723,17 +723,12 @@ struct RemoveClusterImpl { // Initialization parameters bool forceRemove; - bool cleanupManagementClusterState; // Parameters set in markClusterRemoving Optional lastTenantId; - RemoveClusterImpl(Reference managementDb, - ClusterName clusterName, - bool forceRemove, - bool cleanupManagementClusterState) - : ctx(managementDb, clusterName), forceRemove(forceRemove), - cleanupManagementClusterState(cleanupManagementClusterState) {} + RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove) + : ctx(managementDb, clusterName), forceRemove(forceRemove) {} // Returns false if the cluster is no longer present, or true if it is present and 
the removal should proceed. ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { @@ -905,8 +900,8 @@ struct RemoveClusterImpl { state std::pair clusterTupleRange = std::make_pair( Tuple::makeTuple(self->ctx.clusterName.get()), Tuple::makeTuple(keyAfter(self->ctx.clusterName.get()))); - state bool deleteTenants = self->cleanupManagementClusterState; - state bool deleteTenantGroups = self->cleanupManagementClusterState; + state bool deleteTenants = true; + state bool deleteTenantGroups = true; loop { bool clearedAll = wait(self->ctx.runManagementTransaction( @@ -971,8 +966,8 @@ struct RemoveClusterImpl { }; ACTOR template -Future removeCluster(Reference db, ClusterName name, bool forceRemove, bool cleanupManagementClusterState) { - state RemoveClusterImpl impl(db, name, forceRemove, cleanupManagementClusterState); +Future removeCluster(Reference db, ClusterName name, bool forceRemove) { + state RemoveClusterImpl impl(db, name, forceRemove); wait(impl.run()); return Void(); } diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index b96d95e740..99d0a426a3 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -398,6 +398,7 @@ public: ISimulator::ExtraDatabaseMode extraDatabaseMode = ISimulator::ExtraDatabaseMode::Disabled; // The number of extra database used if the database mode is MULTIPLE int extraDatabaseCount = 1; + bool extraDatabaseBackupAgents = false; int minimumReplication = 0; int minimumRegions = 0; bool configureLocked = false; @@ -481,6 +482,7 @@ public: .add("testPriority", &testPriority) .add("extraDatabaseMode", &extraDatabaseModeStr) .add("extraDatabaseCount", &extraDatabaseCount) + .add("extraDatabaseBackupAgents", &extraDatabaseBackupAgents) .add("minimumReplication", &minimumReplication) .add("minimumRegions", &minimumRegions) .add("configureLocked", &configureLocked) @@ -2519,7 +2521,7 @@ void setupSimulatedSystem(std::vector>* systemActors, 
baseFolder, false, machine == useSeedForMachine, - AgentAddition, + testConfig.extraDatabaseBackupAgents ? AgentAddition : AgentNone, sslOnly, whitelistBinPaths, protocolVersion, diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index f2cbf3830b..29d01fd2fa 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -180,6 +180,7 @@ struct MetaclusterManagementWorkload : TestWorkload { } ASSERT(!dataDb->registered); + ASSERT(!dataDb->detached || dataDb->tenants.empty()); dataDb->tenantGroupCapacity = entry.capacity.numTenantGroups; self->totalTenantGroupCapacity += entry.capacity.numTenantGroups; @@ -193,7 +194,7 @@ struct MetaclusterManagementWorkload : TestWorkload { ASSERT(dataDb->registered); return Void(); } else if (e.code() == error_code_cluster_not_empty) { - ASSERT(dataDb->registered && dataDb->detached && !dataDb->tenants.empty()); + ASSERT(dataDb->detached && !dataDb->tenants.empty()); return Void(); } @@ -208,13 +209,13 @@ struct MetaclusterManagementWorkload : TestWorkload { state ClusterName clusterName = self->chooseClusterName(); state DataClusterData* dataDb = &self->dataDbs[clusterName]; state bool retried = false; - state bool detachCluster = deterministicRandom()->coinflip(); + state bool detachCluster = false; try { loop { // TODO: check force removal Future removeFuture = - MetaclusterAPI::removeCluster(self->managementDb, clusterName, detachCluster, !detachCluster); + MetaclusterAPI::removeCluster(self->managementDb, clusterName, detachCluster); try { Optional result = wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { @@ -243,6 +244,7 @@ struct MetaclusterManagementWorkload : TestWorkload { dataDb->tenantGroupCapacity = 0; dataDb->registered = false; } else { + dataDb->registered = false; dataDb->detached = true; } @@ -966,7 +968,7 @@ 
struct MetaclusterManagementWorkload : TestWorkload { std::vector> removeClusterFutures; for (auto [clusterName, clusterMetadata] : dataClusters) { removeClusterFutures.push_back( - MetaclusterAPI::removeCluster(self->managementDb, clusterName, !deleteTenants, true)); + MetaclusterAPI::removeCluster(self->managementDb, clusterName, !deleteTenants)); } wait(waitForAll(removeClusterFutures)); diff --git a/tests/slow/MetaclusterRecovery.toml b/tests/slow/MetaclusterRecovery.toml index e41bc1bc1d..98d5b5fc71 100644 --- a/tests/slow/MetaclusterRecovery.toml +++ b/tests/slow/MetaclusterRecovery.toml @@ -4,6 +4,7 @@ allowDisablingTenants = false allowCreatingTenants = false extraDatabaseMode = 'Multiple' extraDatabaseCount = 5 +extraDatabaseBackupAgents = true [[test]] testTitle = 'MetaclusterRestoreTest' From be5b0f451d95e8dd451e9df73c8db970ad1e7849 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 5 Oct 2022 16:23:11 -0700 Subject: [PATCH 17/57] Expand the restore test; fix some bugs mainly around renaming and changing tenant configuration during a restore --- fdbcli/MetaclusterCommands.actor.cpp | 2 +- fdbclient/Metacluster.cpp | 2 +- .../fdbclient/MetaclusterManagement.actor.h | 187 ++++++++++++++---- .../fdbclient/TenantManagement.actor.h | 3 +- .../MetaclusterManagementWorkload.actor.cpp | 20 +- .../MetaclusterRestoreWorkload.actor.cpp | 147 ++++++++++++-- 6 files changed, 288 insertions(+), 73 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 9de5dc8fc2..af538afe05 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -183,7 +183,7 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto state bool restore_from_data_cluster = tokens.size() == 5; if (restore_from_data_cluster) { wait(MetaclusterAPI::restoreCluster( - db, tokens[2], config.get().first.get(), AddNewTenants::False, RemoveMissingTenants::True)); + db, tokens[2], config.get().first.get(), 
ApplyManagementClusterUpdates::True)); fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); return true; diff --git a/fdbclient/Metacluster.cpp b/fdbclient/Metacluster.cpp index b298762f11..4d4efcda56 100644 --- a/fdbclient/Metacluster.cpp +++ b/fdbclient/Metacluster.cpp @@ -21,7 +21,7 @@ #include "fdbclient/Metacluster.h" #include "fdbclient/MetaclusterManagement.actor.h" -FDB_DEFINE_BOOLEAN_PARAM(AddNewTenants); +FDB_DEFINE_BOOLEAN_PARAM(ApplyManagementClusterUpdates); FDB_DEFINE_BOOLEAN_PARAM(RemoveMissingTenants); FDB_DEFINE_BOOLEAN_PARAM(AssignClusterAutomatically); diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 5c4600d14b..723c57462a 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -84,12 +84,17 @@ struct DataClusterMetadata { } }; -FDB_DECLARE_BOOLEAN_PARAM(AddNewTenants); +FDB_DECLARE_BOOLEAN_PARAM(ApplyManagementClusterUpdates); FDB_DECLARE_BOOLEAN_PARAM(RemoveMissingTenants); FDB_DECLARE_BOOLEAN_PARAM(AssignClusterAutomatically); namespace MetaclusterAPI { +// This prefix is used during a cluster restore if the desired name is in use +// TODO: this should probably live in the `\xff` tenant namespace, but other parts +// of the code are not able to work with `\xff` tenants yet. 
+const StringRef metaclusterTemporaryRenamePrefix = "\xfe/restoreTenant/"_sr; + struct ManagementClusterMetadata { struct ConnectionStringCodec { static inline Standalone pack(ClusterConnectionString const& val) { @@ -1103,8 +1108,7 @@ struct RestoreClusterImpl { // Initialization parameters ClusterName clusterName; ClusterConnectionString connectionString; - AddNewTenants addNewTenants; - RemoveMissingTenants removeMissingTenants; + ApplyManagementClusterUpdates applyManagementClusterUpdates; // Tenant list from data and management clusters std::unordered_map dataClusterTenantMap; @@ -1116,10 +1120,9 @@ struct RestoreClusterImpl { RestoreClusterImpl(Reference managementDb, ClusterName clusterName, ClusterConnectionString connectionString, - AddNewTenants addNewTenants, - RemoveMissingTenants removeMissingTenants) + ApplyManagementClusterUpdates applyManagementClusterUpdates) : ctx(managementDb, clusterName), clusterName(clusterName), connectionString(connectionString), - addNewTenants(addNewTenants), removeMissingTenants(removeMissingTenants) {} + applyManagementClusterUpdates(applyManagementClusterUpdates) {} // Check that the restored data cluster has a matching metacluster registration entry ACTOR static Future verifyDataClusterMatch(RestoreClusterImpl* self) { @@ -1241,6 +1244,112 @@ struct RestoreClusterImpl { return Void(); } + ACTOR static Future renameTenant(RestoreClusterImpl* self, + Reference tr, + int64_t tenantId, + TenantName oldTenantName, + TenantName newTenantName, + int configurationSequenceNum) { + state Optional oldEntry; + state Optional newEntry; + + wait(store(oldEntry, TenantAPI::tryGetTenantTransaction(tr, oldTenantName)) && + store(newEntry, TenantAPI::tryGetTenantTransaction(tr, newTenantName))); + + if (oldEntry.present() && oldEntry.get().id == tenantId && !newEntry.present()) { + wait(TenantAPI::renameTenantTransaction( + tr, oldTenantName, newTenantName, tenantId, ClusterType::METACLUSTER_DATA, configurationSequenceNum)); + return 
Void(); + } else if (newEntry.present() && newEntry.get().id == tenantId) { + // Rename already succeeded + return Void(); + } + + TraceEvent(SevWarnAlways, "RestoreDataClusterRenameError") + .detail("OldName", oldTenantName) + .detail("NewName", newTenantName) + .detail("TenantID", tenantId) + .detail("OldEntryPresent", oldEntry.present()) + .detail("NewEntryPresent", newEntry.present()); + + if (newEntry.present()) { + throw tenant_already_exists(); + } else { + throw tenant_not_found(); + } + } + + ACTOR static Future updateTenantConfiguration(RestoreClusterImpl* self, + Reference tr, + TenantName tenantName, + TenantMapEntry updatedEntry) { + TenantMapEntry existingEntry = wait(TenantAPI::getTenantTransaction(tr, tenantName)); + updatedEntry.assignedCluster = Optional(); + if (existingEntry.configurationSequenceNum <= updatedEntry.configurationSequenceNum) { + wait(TenantAPI::configureTenantTransaction(tr, tenantName, existingEntry, updatedEntry)); + } + + return Void(); + } + + // Updates a tenant to match the management cluster state + // Returns the name of the tenant after it has been reconciled + ACTOR static Future>> reconcileTenant(RestoreClusterImpl* self, + TenantName tenantName, + TenantMapEntry tenantEntry) { + + auto tenantNameOnMgmtCluster = self->mgmtClusterTenantIdIndex.find(tenantEntry.id); + if (tenantNameOnMgmtCluster == self->mgmtClusterTenantIdIndex.end()) { + // Delete + if (self->applyManagementClusterUpdates) { + wait(self->ctx.runDataClusterTransaction( + [tenantName = tenantName, tenantEntry = tenantEntry](Reference tr) { + return TenantAPI::deleteTenantTransaction( + tr, tenantName, tenantEntry.id, ClusterType::METACLUSTER_DATA); + })); + } + + return Optional>(); + } else { + state TenantName managementName = self->mgmtClusterTenantIdIndex[tenantEntry.id]; + state TenantMapEntry managementTenant = self->mgmtClusterTenantMap[managementName]; + if (tenantName.compare(managementName) != 0) { + state TenantName temporaryName = 
metaclusterTemporaryRenamePrefix.withSuffix(managementName); + // Rename + if (self->applyManagementClusterUpdates) { + wait(self->ctx.runDataClusterTransaction( + [self = self, + tenantName = tenantName, + temporaryName = temporaryName, + tenantEntry = tenantEntry, + managementTenant = managementTenant](Reference tr) { + return renameTenant(self, + tr, + tenantEntry.id, + tenantName, + temporaryName, + managementTenant.configurationSequenceNum); + })); + } + tenantName = temporaryName; + } + + if (!managementTenant.matchesConfiguration(tenantEntry) || + managementTenant.configurationSequenceNum != tenantEntry.configurationSequenceNum) { + if (self->applyManagementClusterUpdates) { + ASSERT(managementTenant.configurationSequenceNum >= tenantEntry.configurationSequenceNum); + wait(self->ctx.runDataClusterTransaction( + [self = self, tenantName = tenantName, managementTenant = managementTenant]( + Reference tr) { + return updateTenantConfiguration(self, tr, tenantName, managementTenant); + })); + } + } + + return std::make_pair(tenantName, managementTenant); + } + } + // This only supports the restore of an already registered data cluster, for now. 
ACTOR static Future run(RestoreClusterImpl* self) { // Run a management transaction to populate the data cluster metadata @@ -1271,33 +1380,30 @@ struct RestoreClusterImpl { // get all the tenant information for this data cluster from manangement cluster wait(getAllTenantsFromManagementCluster(self)); + state std::unordered_map partiallyRenamedTenants; state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); while (itr != self->dataClusterTenantMap.end()) { - uint64_t tenantId = (itr->second).id; - TenantName tenantName = itr->first; - - auto tenantNameOnMgmtCluster = self->mgmtClusterTenantIdIndex.find(tenantId); - if (tenantNameOnMgmtCluster == self->mgmtClusterTenantIdIndex.end()) { - // Delete - if (self->removeMissingTenants) { - wait(self->ctx.runDataClusterTransaction([tenantName, tenantId](Reference tr) { - return TenantAPI::deleteTenantTransaction( - tr, tenantName, tenantId, ClusterType::METACLUSTER_DATA); - })); - } - } else if (tenantName.compare(self->mgmtClusterTenantIdIndex[tenantId]) != 0) { - // Rename - TenantName tenantNewName = self->mgmtClusterTenantIdIndex[tenantId]; - wait(self->ctx.runDataClusterTransaction( - [tenantName, tenantNewName, tenantId](Reference tr) { - return TenantAPI::renameTenantTransaction( - tr, tenantName, tenantNewName, tenantId, ClusterType::METACLUSTER_DATA); - })); + Optional> result = + wait(reconcileTenant(self, itr->first, itr->second)); + if (result.present() && result.get().first.startsWith(metaclusterTemporaryRenamePrefix)) { + partiallyRenamedTenants[result.get().first] = result.get().second; } - ++itr; } + state std::unordered_map::iterator renameItr = partiallyRenamedTenants.begin(); + while (renameItr != partiallyRenamedTenants.end()) { + wait(self->ctx.runDataClusterTransaction([self = self, renameItr = renameItr](Reference tr) { + return renameTenant(self, + tr, + renameItr->second.id, + renameItr->first, + renameItr->first.removePrefix(metaclusterTemporaryRenamePrefix), + 
renameItr->second.configurationSequenceNum); + })); + ++renameItr; + } + state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { TenantName tenantName = *setItr; @@ -1332,9 +1438,8 @@ ACTOR template Future restoreCluster(Reference db, ClusterName name, ClusterConnectionString connectionString, - AddNewTenants addNewTenants, - RemoveMissingTenants removeMissingTenants) { - state RestoreClusterImpl impl(db, name, connectionString, addNewTenants, removeMissingTenants); + ApplyManagementClusterUpdates applyManagementClusterUpdates) { + state RestoreClusterImpl impl(db, name, connectionString, applyManagementClusterUpdates); wait(impl.run()); return Void(); } @@ -1965,7 +2070,7 @@ struct ConfigureTenantImpl { } // Updates the configuration in the management cluster and marks it as being in the UPDATING_CONFIGURATION state - ACTOR static Future updateManagementCluster(ConfigureTenantImpl* self, + ACTOR static Future updateManagementCluster(ConfigureTenantImpl* self, Reference tr) { state Optional tenantEntry = wait(tryGetTenantTransaction(tr, self->tenantName)); @@ -1992,11 +2097,15 @@ struct ConfigureTenantImpl { self->updatedEntry.configure(configItr->first, configItr->second); } + if (self->updatedEntry.matchesConfiguration(tenantEntry.get())) { + return false; + } + ++self->updatedEntry.configurationSequenceNum; ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, self->updatedEntry.id, self->updatedEntry); ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); - return Void(); + return true; } // Updates the configuration in the data cluster @@ -2035,12 +2144,16 @@ struct ConfigureTenantImpl { } ACTOR static Future run(ConfigureTenantImpl* self) { - wait(self->ctx.runManagementTransaction( + bool configUpdated = wait(self->ctx.runManagementTransaction( [self = self](Reference tr) { return 
updateManagementCluster(self, tr); })); - wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return updateDataCluster(self, tr); })); - wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return markManagementTenantAsReady(self, tr); })); + + if (configUpdated) { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); })); + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return markManagementTenantAsReady(self, tr); + })); + } return Void(); } diff --git a/fdbclient/include/fdbclient/TenantManagement.actor.h b/fdbclient/include/fdbclient/TenantManagement.actor.h index 44f61c1a75..f584743947 100644 --- a/fdbclient/include/fdbclient/TenantManagement.actor.h +++ b/fdbclient/include/fdbclient/TenantManagement.actor.h @@ -430,6 +430,7 @@ Future configureTenantTransaction(Transaction tr, TenantMapEntry originalEntry, TenantMapEntry updatedTenantEntry) { ASSERT(updatedTenantEntry.id == originalEntry.id); + ASSERT(!updatedTenantEntry.assignedCluster.present()); tr->setOption(FDBTransactionOptions::RAW_ACCESS); TenantMetadata::tenantMap().set(tr, updatedTenantEntry.id, updatedTenantEntry); @@ -555,7 +556,7 @@ Future renameTenantTransaction(Transaction tr, } if (configureSequenceNum.present()) { - if (entry.configurationSequenceNum >= configureSequenceNum.get()) { + if (entry.configurationSequenceNum > configureSequenceNum.get()) { return Void(); } entry.configurationSequenceNum = configureSequenceNum.get(); diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 29d01fd2fa..a34a6799d6 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -270,7 +270,6 @@ struct MetaclusterManagementWorkload : TestWorkload { ACTOR static Future restoreCluster(MetaclusterManagementWorkload* self) { 
state ClusterName clusterName = self->chooseClusterName(); state DataClusterData* dataDb = &self->dataDbs[clusterName]; - state bool retried = false; try { loop { @@ -278,21 +277,10 @@ struct MetaclusterManagementWorkload : TestWorkload { MetaclusterAPI::restoreCluster(self->managementDb, clusterName, dataDb->db->getConnectionRecord()->getConnectionString(), - AddNewTenants::False, - RemoveMissingTenants::True); - try { - Optional result = wait(timeout(restoreFuture, deterministicRandom()->randomInt(1, 30))); - if (result.present()) { - break; - } else { - retried = true; - } - } catch (Error& e) { - if (false) { - break; - } else { - throw; - } + ApplyManagementClusterUpdates::True); + Optional result = wait(timeout(restoreFuture, deterministicRandom()->randomInt(1, 30))); + if (result.present()) { + break; } } diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index dc03679c7c..c31f98039e 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -43,6 +43,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { struct DataClusterData { Database db; std::set tenants; + std::set tenantGroups; bool restored = false; DataClusterData() {} @@ -64,6 +65,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { std::vector dataDbIndex; std::map createdTenants; + std::map> tenantGroups; int initialTenants; int maxTenants; @@ -91,11 +93,24 @@ struct MetaclusterRestoreWorkload : TestWorkload { return tenant; } - Optional chooseTenantGroup() { + Optional chooseTenantGroup(Optional cluster = Optional()) { Optional tenantGroup; if (deterministicRandom()->coinflip()) { - tenantGroup = - TenantGroupNameRef(format("tenantgroup%08d", deterministicRandom()->randomInt(0, maxTenantGroups))); + if (!cluster.present()) { + tenantGroup = + TenantGroupNameRef(format("tenantgroup%08d", deterministicRandom()->randomInt(0, 
maxTenantGroups))); + } else { + auto const& existingGroups = dataDbs[cluster.get()].tenantGroups; + if (deterministicRandom()->coinflip() && !existingGroups.empty()) { + tenantGroup = deterministicRandom()->randomChoice( + std::vector(existingGroups.begin(), existingGroups.end())); + } else if (tenantGroups.size() < maxTenantGroups) { + do { + tenantGroup = TenantGroupNameRef( + format("tenantgroup%08d", deterministicRandom()->randomInt(0, maxTenantGroups))); + } while (tenantGroups.count(tenantGroup.get()) > 0); + } + } } return tenantGroup; @@ -227,8 +242,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { wait(MetaclusterAPI::restoreCluster(self->managementDb, clusterName, dataDb->getConnectionRecord()->getConnectionString(), - AddNewTenants::False, - RemoveMissingTenants::True)); + ApplyManagementClusterUpdates::True)); self->dataDbs[clusterName].restored = true; @@ -261,7 +275,12 @@ struct MetaclusterRestoreWorkload : TestWorkload { TenantMapEntry createdEntry = wait(MetaclusterAPI::getTenant(self->managementDb, tenantName)); self->createdTenants[tenantName] = TenantData(createdEntry.assignedCluster.get(), createdEntry.tenantGroup, beforeBackup); - self->dataDbs[createdEntry.assignedCluster.get()].tenants.insert(tenantName); + auto& dataDb = self->dataDbs[createdEntry.assignedCluster.get()]; + dataDb.tenants.insert(tenantName); + if (createdEntry.tenantGroup.present()) { + self->tenantGroups[createdEntry.tenantGroup.get()].insert(tenantName); + dataDb.tenantGroups.insert(createdEntry.tenantGroup.get()); + } return Void(); } catch (Error& e) { fmt::print("Tenant create error {} {}\n", e.what(), printable(tenantName)); @@ -290,16 +309,70 @@ struct MetaclusterRestoreWorkload : TestWorkload { fmt::print("Delete tenant {}\n", printable(tenantName)); + wait(MetaclusterAPI::deleteTenant(self->managementDb, tenantName)); + auto const& tenantData = self->createdTenants[tenantName]; + + auto& dataDb = self->dataDbs[tenantData.cluster]; + 
dataDb.tenants.erase(tenantName); + if (tenantData.tenantGroup.present()) { + auto groupItr = self->tenantGroups.find(tenantData.tenantGroup.get()); + groupItr->second.erase(tenantName); + if (groupItr->second.empty()) { + self->tenantGroups.erase(groupItr); + dataDb.tenantGroups.erase(tenantData.tenantGroup.get()); + } + } + + self->createdTenants.erase(tenantName); + + return Void(); + } + + ACTOR static Future configureTenant(MetaclusterRestoreWorkload* self) { + state TenantName tenantName; + for (int i = 0; i < 10; ++i) { + tenantName = self->chooseTenantName(); + if (self->createdTenants.count(tenantName) != 0) { + break; + } + } + + if (self->createdTenants.count(tenantName) == 0) { + return Void(); + } + + + state Optional tenantGroup = self->chooseTenantGroup(self->createdTenants[tenantName].cluster); + state std::map, Optional> configurationParams = { { "tenant_group"_sr, + tenantGroup } }; + loop { try { - wait(MetaclusterAPI::deleteTenant(self->managementDb, tenantName)); - auto const& tenantData = self->createdTenants[tenantName]; - self->dataDbs[tenantData.cluster].tenants.erase(tenantName); - self->createdTenants.erase(tenantName); + wait(MetaclusterAPI::configureTenant(self->managementDb, tenantName, configurationParams)); + + auto& tenantData = self->createdTenants[tenantName]; + if (tenantData.tenantGroup != tenantGroup) { + auto& dataDb = self->dataDbs[tenantData.cluster]; + if (tenantData.tenantGroup.present()) { + auto groupItr = self->tenantGroups.find(tenantData.tenantGroup.get()); + groupItr->second.erase(tenantName); + if (groupItr->second.empty()) { + self->tenantGroups.erase(groupItr); + dataDb.tenantGroups.erase(tenantData.tenantGroup.get()); + } + self->tenantGroups[tenantData.tenantGroup.get()].erase(tenantName); + } + + if (tenantGroup.present()) { + self->tenantGroups[tenantGroup.get()].insert(tenantName); + dataDb.tenantGroups.insert(tenantGroup.get()); + } + + tenantData.tenantGroup = tenantGroup; + } return Void(); } catch (Error& 
e) { - fmt::print("Tenant create error {} {}\n", e.what(), printable(tenantName)); - if (e.code() != error_code_metacluster_no_capacity) { + if (e.code() != error_code_cluster_no_capacity) { throw; } @@ -309,6 +382,46 @@ struct MetaclusterRestoreWorkload : TestWorkload { } } + ACTOR static Future renameTenant(MetaclusterRestoreWorkload* self) { + state TenantName oldTenantName; + state TenantName newTenantName; + for (int i = 0; i < 10; ++i) { + oldTenantName = self->chooseTenantName(); + if (self->createdTenants.count(oldTenantName) != 0) { + break; + } + } + for (int i = 0; i < 10; ++i) { + newTenantName = self->chooseTenantName(); + if (self->createdTenants.count(newTenantName) == 0) { + break; + } + } + + if (self->createdTenants.count(oldTenantName) == 0 || self->createdTenants.count(newTenantName) != 0) { + return Void(); + } + + + wait(MetaclusterAPI::renameTenant(self->managementDb, oldTenantName, newTenantName)); + + auto const& tenantData = self->createdTenants[oldTenantName]; + if (tenantData.tenantGroup.present()) { + auto& tenantGroup = self->tenantGroups[tenantData.tenantGroup.get()]; + tenantGroup.erase(oldTenantName); + tenantGroup.insert(newTenantName); + } + + auto& dataDb = self->dataDbs[tenantData.cluster]; + dataDb.tenants.erase(oldTenantName); + dataDb.tenants.insert(newTenantName); + + self->createdTenants[newTenantName] = tenantData; + self->createdTenants.erase(oldTenantName); + + return Void(); + } + Future start(Database const& cx) override { if (clientId == 0) { return _start(cx, this); @@ -343,15 +456,15 @@ struct MetaclusterRestoreWorkload : TestWorkload { // Make random tenant mutations state int tenantMutationNum; for (tenantMutationNum = 0; tenantMutationNum < 100; ++tenantMutationNum) { - state int operation = deterministicRandom()->randomInt(0, 2); + state int operation = deterministicRandom()->randomInt(0, 4); if (operation == 0) { wait(createTenant(self, false)); } else if (operation == 1) { wait(deleteTenant(self)); - /*} else 
if (operation == 2) { - wait(configureTenant(self)); - } else if (operation == 3) { - wait(renameTenant(self));*/ + } else if (operation == 2) { + wait(configureTenant(self)); + } else if (operation == 3) { + wait(renameTenant(self)); } } From 54eb847dcb3c6a852d000e55949017a7818efae8 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 20 Oct 2022 14:41:21 -0700 Subject: [PATCH 18/57] Fix a bug in the tenant configuration function where the tenant configuration wouldn't get applied if it retried after a successful commit. Fix some formatting and and merge issues. --- .../fdbclient/MetaclusterManagement.actor.h | 3 +- fdbserver/SimulatedCluster.actor.cpp | 35 ++++++++++--------- .../MetaclusterRestoreWorkload.actor.cpp | 5 ++- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 723c57462a..2469fed360 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -2097,7 +2097,8 @@ struct ConfigureTenantImpl { self->updatedEntry.configure(configItr->first, configItr->second); } - if (self->updatedEntry.matchesConfiguration(tenantEntry.get())) { + if (self->updatedEntry.matchesConfiguration(tenantEntry.get()) && + tenantEntry.get().tenantState == TenantState::READY) { return false; } diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 99d0a426a3..ed1afd0981 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -482,7 +482,7 @@ public: .add("testPriority", &testPriority) .add("extraDatabaseMode", &extraDatabaseModeStr) .add("extraDatabaseCount", &extraDatabaseCount) - .add("extraDatabaseBackupAgents", &extraDatabaseBackupAgents) + .add("extraDatabaseBackupAgents", &extraDatabaseBackupAgents) .add("minimumReplication", &minimumReplication) .add("minimumRegions", 
&minimumRegions) .add("configureLocked", &configureLocked) @@ -2512,22 +2512,23 @@ void setupSimulatedSystem(std::vector>* systemActors, LocalityData localities(Optional>(), newZoneId, newMachineId, dcUID); localities.set("data_hall"_sr, dcUID); - systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), - conn, - extraIps, - sslEnabled, - localities, - processClass, - baseFolder, - false, - machine == useSeedForMachine, - testConfig.extraDatabaseBackupAgents ? AgentAddition : AgentNone, - sslOnly, - whitelistBinPaths, - protocolVersion, - configDBType, - true), - "SimulatedMachine")); + systemActors->push_back( + reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), + conn, + extraIps, + sslEnabled, + localities, + processClass, + baseFolder, + false, + machine == useSeedForMachine, + testConfig.extraDatabaseBackupAgents ? AgentAddition : AgentNone, + sslOnly, + whitelistBinPaths, + protocolVersion, + configDBType, + true), + "SimulatedMachine")); ++cluster; } } diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index c31f98039e..8ad2293390 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -39,6 +39,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. 
struct MetaclusterRestoreWorkload : TestWorkload { + static constexpr auto NAME = "MetaclusterRestore"; struct DataClusterData { Database db; @@ -80,8 +81,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { tenantGroupCapacity = (initialTenants / 2 + maxTenantGroups - 1) / g_simulator->extraDatabases.size(); } - std::string description() const override { return "MetaclusterRestore"; } - void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("MachineAttritionWorkload"); } @@ -583,4 +582,4 @@ struct MetaclusterRestoreWorkload : TestWorkload { void getMetrics(std::vector& m) override {} }; -WorkloadFactory MetaclusterRestoreWorkloadFactory("MetaclusterRestore"); +WorkloadFactory MetaclusterRestoreWorkloadFactory; From 22d907651b95bc92241f04337e136a4e116c6339 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 21 Oct 2022 16:16:28 -0700 Subject: [PATCH 19/57] Add a couple someday comments; reduce the number of cases where we use temporary names; make the restore behavior when a rename is partially complete more defined --- .../fdbclient/MetaclusterManagement.actor.h | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 2469fed360..fadff4a917 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1212,7 +1212,9 @@ struct RestoreClusterImpl { state TenantName lastTenantNameRetrieved; for (auto t : tenants.results) { self->mgmtClusterTenantMap.emplace(t.first, t.second); - self->mgmtClusterTenantIdIndex.emplace(t.second.id, t.first); + if (t.second.tenantState != TenantState::RENAMING_FROM) { + self->mgmtClusterTenantIdIndex.emplace(t.second.id, t.first); + } if (t.second.assignedCluster.present() && self->clusterName.compare(t.second.assignedCluster.get()) == 0) { 
self->mgmtClusterTenantSetForCurrentDataCluster.emplace(t.first); } @@ -1314,7 +1316,14 @@ struct RestoreClusterImpl { state TenantName managementName = self->mgmtClusterTenantIdIndex[tenantEntry.id]; state TenantMapEntry managementTenant = self->mgmtClusterTenantMap[managementName]; if (tenantName.compare(managementName) != 0) { - state TenantName temporaryName = metaclusterTemporaryRenamePrefix.withSuffix(managementName); + ASSERT(managementTenant.tenantState != TenantState::RENAMING_FROM); + state TenantName temporaryName; + if (self->dataClusterTenantMap.count(managementName) > 0) { + temporaryName = metaclusterTemporaryRenamePrefix.withSuffix(managementName); + } else { + temporaryName = managementName; + } + // Rename if (self->applyManagementClusterUpdates) { wait(self->ctx.runDataClusterTransaction( @@ -1332,6 +1341,7 @@ struct RestoreClusterImpl { })); } tenantName = temporaryName; + // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the RENAMING state } if (!managementTenant.matchesConfiguration(tenantEntry) || @@ -1344,6 +1354,8 @@ struct RestoreClusterImpl { return updateTenantConfiguration(self, tr, tenantName, managementTenant); })); } + // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the + // UPDATING_CONFIGURATION state } return std::make_pair(tenantName, managementTenant); @@ -1407,10 +1419,18 @@ struct RestoreClusterImpl { state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { TenantName tenantName = *setItr; - uint64_t tenantId = self->mgmtClusterTenantMap[tenantName].id; + TenantMapEntry const& managementTenant = self->mgmtClusterTenantMap[tenantName]; - if (self->dataClusterTenantIdSet.find(tenantId) == self->dataClusterTenantIdSet.end()) { - // Set Tenant in ERROR state + // If a tenant is present on the management cluster and not on the data cluster, mark 
it in an error state + // unless it is already in certain states (e.g. REGISTERING, REMOVING) that allow the tenant to be missing + // on the data cluster + // + // SOMEDAY: this could optionally complete the partial operations (e.g. finish creating or removing the + // tenant) + if (self->dataClusterTenantIdSet.find(managementTenant.id) == self->dataClusterTenantIdSet.end() && + managementTenant.tenantState != TenantState::REGISTERING && + managementTenant.tenantState != TenantState::REMOVING && + managementTenant.tenantState != TenantState::ERROR) { wait(self->ctx.runManagementTransaction([tenantName](Reference tr) { return markManagementTenantAsError(tr, tenantName); })); From f84dd5ac2b98b219fef2d2659cc68a98a5b4b669 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 21 Oct 2022 16:24:05 -0700 Subject: [PATCH 20/57] Formatting fix --- fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index 8ad2293390..b843185ca5 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -340,7 +340,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Void(); } - state Optional tenantGroup = self->chooseTenantGroup(self->createdTenants[tenantName].cluster); state std::map, Optional> configurationParams = { { "tenant_group"_sr, tenantGroup } }; @@ -401,7 +400,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Void(); } - wait(MetaclusterAPI::renameTenant(self->managementDb, oldTenantName, newTenantName)); auto const& tenantData = self->createdTenants[oldTenantName]; From 56b716040f404c148ce316f85ede6a64fc24f93f Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Mon, 24 Oct 2022 12:52:07 -0700 Subject: [PATCH 21/57] Various fixes and cleanup --- fdbcli/MetaclusterCommands.actor.cpp | 30 +++- .../fdbclient/MetaclusterManagement.actor.h | 139 ++++++++++-------- .../MetaclusterRestoreWorkload.actor.cpp | 33 ----- flow/include/flow/error_definitions.h | 1 + 4 files changed, 100 insertions(+), 103 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index af538afe05..2b3c508b41 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -163,10 +163,20 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector // metacluster restore command ACTOR Future metaclusterRestoreCommand(Reference db, std::vector tokens) { - if (tokens.size() < 4 || tokens.size() > 6) { + if (tokens.size() != 5) { fmt::print("Usage: metacluster restore connection_string=\n" - "[repopulate_from_data_cluster]\n\n"); - fmt::print("Restore a data cluster.\n"); + "\n\n"); + fmt::print("Add a restored data cluster back to a metacluster.\n"); + fmt::print("Use `restore_known_data_cluster' to add back a restored copy of a data cluster\n"); + fmt::print("that the metacluster is already tracking. This mode should be used if only data\n"); + fmt::print("clusters are being restored, and any discrepancies between the management and\n"); + fmt::print("data clusters will be resolved using the management cluster metadata.\n"); + fmt::print("Use `repopulate_from_data_cluster' to rebuild a lost management cluster from the\n"); + fmt::print("data clusters in a metacluster. This mode should be used if the management\n"); + fmt::print("cluster is being restored. If any data clusters are also being restored, the\n"); + fmt::print("oldest data clusters should be added first before any non-recovered data\n"); + fmt::print("clusters. 
Any discrepancies arising between the data clusters will be resolved\n"); + fmt::print("using the data cluster that was added last."); return false; } @@ -180,15 +190,19 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto return false; } - state bool restore_from_data_cluster = tokens.size() == 5; - if (restore_from_data_cluster) { + if (tokens[4] == "restore_known_data_cluster"_sr) { wait(MetaclusterAPI::restoreCluster( db, tokens[2], config.get().first.get(), ApplyManagementClusterUpdates::True)); fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); return true; + } else if (tokens[4] == "repopulate_from_data_cluster"_sr) { + fmt::print(stderr, "ERROR: the `repopulate_from_data_cluster' restore mode is not currently supported\n"); + return false; + } else { + fmt::print(stderr, "ERROR: unrecognized restore mode `{}'\n", printable(tokens[4])); + return false; } - return false; } // metacluster configure command @@ -475,7 +489,7 @@ std::vector metaclusterHintGenerator(std::vector const& } else if (tokencmp(tokens[1], "restore") && tokens.size() < 5) { static std::vector opts = { "", "connection_string= ", - "[repopulate_from_data_cluster]" }; + "" }; return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokencmp(tokens[1], "configure")) { static std::vector opts = { @@ -503,7 +517,7 @@ CommandFactory metaclusterRegisterFactory( "`create_experimental' and `decommission' set up or deconfigure a metacluster.\n" "`register' and `remove' add and remove data clusters from the metacluster.\n" "`configure' updates the configuration of a data cluster.\n" - "`restore' restores the specified data cluster." 
+ "`restore' is used to recover from lost management or data clusters.\n" "`list' prints a list of data clusters in the metacluster.\n" "`get' prints the metadata for a particular data cluster.\n" "`status' prints metacluster metadata.\n"), diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index fadff4a917..d809434307 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1112,9 +1112,9 @@ struct RestoreClusterImpl { // Tenant list from data and management clusters std::unordered_map dataClusterTenantMap; - std::unordered_set dataClusterTenantIdSet; + std::unordered_set dataClusterTenantIdSet; std::unordered_map mgmtClusterTenantMap; - std::unordered_map mgmtClusterTenantIdIndex; + std::unordered_map mgmtClusterTenantIdIndex; std::unordered_set mgmtClusterTenantSetForCurrentDataCluster; RestoreClusterImpl(Reference managementDb, @@ -1131,8 +1131,7 @@ struct RestoreClusterImpl { wait(MetaclusterMetadata::metaclusterRegistration().get(db)); if (!metaclusterRegistration.present()) { - // TODO: different error - throw invalid_metacluster_operation(); + throw invalid_data_cluster(); } if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { @@ -1209,35 +1208,32 @@ struct RestoreClusterImpl { tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); state KeyBackedRangeResult> tenants = wait(tenantsFuture); - state TenantName lastTenantNameRetrieved; for (auto t : tenants.results) { self->mgmtClusterTenantMap.emplace(t.first, t.second); if (t.second.tenantState != TenantState::RENAMING_FROM) { self->mgmtClusterTenantIdIndex.emplace(t.second.id, t.first); } - if (t.second.assignedCluster.present() && self->clusterName.compare(t.second.assignedCluster.get()) == 0) { + if (t.second.assignedCluster.present() && self->clusterName == t.second.assignedCluster.get()) { 
self->mgmtClusterTenantSetForCurrentDataCluster.emplace(t.first); } - - lastTenantNameRetrieved = t.first; } - return std::pair{ !tenants.more, lastTenantNameRetrieved }; + return std::pair{ !tenants.more, + tenants.results.empty() ? ""_sr : tenants.results.rbegin()->first }; } ACTOR static Future getAllTenantsFromManagementCluster(RestoreClusterImpl* self) { // get all tenants across all data clusters - state TenantName beginRangeTenantName = ""_sr; + state TenantName beginTenant = ""_sr; loop { - TenantName initialTenantName = beginRangeTenantName; std::pair tenantsItr = wait(self->ctx.runManagementTransaction( - [self = self, initialTenantName](Reference tr) { - return getTenantsFromManagementCluster(self, tr, initialTenantName); + [self = self, beginTenant = beginTenant](Reference tr) { + return getTenantsFromManagementCluster(self, tr, beginTenant); })); if (!tenantsItr.first) { // Not all tenants retrieved yet. Begin next loop from this tenant. - beginRangeTenantName = keyAfter(tenantsItr.second); + beginTenant = keyAfter(tenantsItr.second); } else { break; } @@ -1301,8 +1297,8 @@ struct RestoreClusterImpl { TenantMapEntry tenantEntry) { auto tenantNameOnMgmtCluster = self->mgmtClusterTenantIdIndex.find(tenantEntry.id); + // Delete if (tenantNameOnMgmtCluster == self->mgmtClusterTenantIdIndex.end()) { - // Delete if (self->applyManagementClusterUpdates) { wait(self->ctx.runDataClusterTransaction( [tenantName = tenantName, tenantEntry = tenantEntry](Reference tr) { @@ -1315,7 +1311,12 @@ struct RestoreClusterImpl { } else { state TenantName managementName = self->mgmtClusterTenantIdIndex[tenantEntry.id]; state TenantMapEntry managementTenant = self->mgmtClusterTenantMap[managementName]; - if (tenantName.compare(managementName) != 0) { + + ASSERT(managementTenant.assignedCluster.present() && + managementTenant.assignedCluster.get() == self->clusterName); + + // Rename + if (tenantName != managementName) { ASSERT(managementTenant.tenantState != 
TenantState::RENAMING_FROM); state TenantName temporaryName; if (self->dataClusterTenantMap.count(managementName) > 0) { @@ -1324,7 +1325,6 @@ struct RestoreClusterImpl { temporaryName = managementName; } - // Rename if (self->applyManagementClusterUpdates) { wait(self->ctx.runDataClusterTransaction( [self = self, @@ -1344,6 +1344,7 @@ struct RestoreClusterImpl { // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the RENAMING state } + // Update configuration if (!managementTenant.matchesConfiguration(tenantEntry) || managementTenant.configurationSequenceNum != tenantEntry.configurationSequenceNum) { if (self->applyManagementClusterUpdates) { @@ -1362,6 +1363,60 @@ struct RestoreClusterImpl { } } + ACTOR static Future reconcileTenants(RestoreClusterImpl* self) { + state std::unordered_map partiallyRenamedTenants; + state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); + while (itr != self->dataClusterTenantMap.end()) { + Optional> result = + wait(reconcileTenant(self, itr->first, itr->second)); + if (result.present() && result.get().first.startsWith(metaclusterTemporaryRenamePrefix)) { + partiallyRenamedTenants[result.get().first] = result.get().second; + } + ++itr; + } + + state std::unordered_map::iterator renameItr = partiallyRenamedTenants.begin(); + while (renameItr != partiallyRenamedTenants.end()) { + wait(self->ctx.runDataClusterTransaction([self = self, renameItr = renameItr](Reference tr) { + return renameTenant(self, + tr, + renameItr->second.id, + renameItr->first, + renameItr->first.removePrefix(metaclusterTemporaryRenamePrefix), + renameItr->second.configurationSequenceNum); + })); + ++renameItr; + } + + return Void(); + } + + ACTOR static Future processMissingTenants(RestoreClusterImpl* self) { + state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); + while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { + TenantName tenantName = 
*setItr; + TenantMapEntry const& managementTenant = self->mgmtClusterTenantMap[tenantName]; + + // If a tenant is present on the management cluster and not on the data cluster, mark it in an error state + // unless it is already in certain states (e.g. REGISTERING, REMOVING) that allow the tenant to be missing + // on the data cluster + // + // SOMEDAY: this could optionally complete the partial operations (e.g. finish creating or removing the + // tenant) + if (self->dataClusterTenantIdSet.find(managementTenant.id) == self->dataClusterTenantIdSet.end() && + managementTenant.tenantState != TenantState::REGISTERING && + managementTenant.tenantState != TenantState::REMOVING && + managementTenant.tenantState != TenantState::ERROR) { + wait(self->ctx.runManagementTransaction([tenantName](Reference tr) { + return markManagementTenantAsError(tr, tenantName); + })); + } + ++setItr; + } + + return Void(); + } + // This only supports the restore of an already registered data cluster, for now. ACTOR static Future run(RestoreClusterImpl* self) { // Run a management transaction to populate the data cluster metadata @@ -1389,54 +1444,14 @@ struct RestoreClusterImpl { wait(self->ctx.runDataClusterTransaction( [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - // get all the tenant information for this data cluster from manangement cluster + // get all the tenant in the metacluster wait(getAllTenantsFromManagementCluster(self)); - state std::unordered_map partiallyRenamedTenants; - state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); - while (itr != self->dataClusterTenantMap.end()) { - Optional> result = - wait(reconcileTenant(self, itr->first, itr->second)); - if (result.present() && result.get().first.startsWith(metaclusterTemporaryRenamePrefix)) { - partiallyRenamedTenants[result.get().first] = result.get().second; - } - ++itr; - } + // Fix any differences between the data cluster and the management cluster + 
wait(reconcileTenants(self)); - state std::unordered_map::iterator renameItr = partiallyRenamedTenants.begin(); - while (renameItr != partiallyRenamedTenants.end()) { - wait(self->ctx.runDataClusterTransaction([self = self, renameItr = renameItr](Reference tr) { - return renameTenant(self, - tr, - renameItr->second.id, - renameItr->first, - renameItr->first.removePrefix(metaclusterTemporaryRenamePrefix), - renameItr->second.configurationSequenceNum); - })); - ++renameItr; - } - - state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); - while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { - TenantName tenantName = *setItr; - TenantMapEntry const& managementTenant = self->mgmtClusterTenantMap[tenantName]; - - // If a tenant is present on the management cluster and not on the data cluster, mark it in an error state - // unless it is already in certain states (e.g. REGISTERING, REMOVING) that allow the tenant to be missing - // on the data cluster - // - // SOMEDAY: this could optionally complete the partial operations (e.g. 
finish creating or removing the - // tenant) - if (self->dataClusterTenantIdSet.find(managementTenant.id) == self->dataClusterTenantIdSet.end() && - managementTenant.tenantState != TenantState::REGISTERING && - managementTenant.tenantState != TenantState::REMOVING && - managementTenant.tenantState != TenantState::ERROR) { - wait(self->ctx.runManagementTransaction([tenantName](Reference tr) { - return markManagementTenantAsError(tr, tenantName); - })); - } - ++setItr; - } + // Mark tenants that are missing from the data cluster in an error state on the management cluster + wait(processMissingTenants(self)); // set restored cluster to ready state try { diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index b843185ca5..bea63ad232 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -149,15 +149,12 @@ struct MetaclusterRestoreWorkload : TestWorkload { } } ACTOR static Future _setup(Database cx, MetaclusterRestoreWorkload* self) { - fmt::print("Setup start\n"); Reference threadSafeHandle = wait(unsafeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(cx))); - fmt::print("Create thread safe handle\n"); MultiVersionApi::api->selectApiVersion(cx->apiVersion.version()); self->managementDb = MultiVersionDatabase::debugCreateFromExistingDatabase(threadSafeHandle); wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr))); - fmt::print("Create metacluster\n"); ASSERT(g_simulator->extraDatabases.size() > 0); state std::vector::iterator extraDatabasesItr; @@ -175,14 +172,12 @@ struct MetaclusterRestoreWorkload : TestWorkload { clusterEntry.capacity.numTenantGroups = self->tenantGroupCapacity; wait(MetaclusterAPI::registerCluster(self->managementDb, clusterName, ccs, clusterEntry)); - fmt::print("Register cluster {}\n", printable(clusterName)); } while 
(self->createdTenants.size() < self->initialTenants) { wait(createTenant(self, true)); } - fmt::print("Setup complete\n"); return Void(); } @@ -195,8 +190,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { addDefaultBackupRanges(backupRanges); - fmt::print("Backup cluster start {}\n", printable(clusterName)); - try { wait(backupAgent.submitBackup( dataDb, backupContainer, {}, 0, 0, clusterName.toString(), backupRanges, StopWhenDone::True)); @@ -205,11 +198,8 @@ struct MetaclusterRestoreWorkload : TestWorkload { throw; } - fmt::print("Backup submitted {}\n", printable(clusterName)); - state Reference container; wait(success(backupAgent.waitBackup(dataDb, clusterName.toString(), StopWhenDone::True, &container))); - fmt::print("Backup completed {} {}\n", printable(clusterName), container->getURL()); return container->getURL(); } @@ -221,8 +211,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { state Standalone> backupRanges; addDefaultBackupRanges(backupRanges); - fmt::print("Restore cluster start {}\n", printable(clusterName)); - wait(runTransaction(dataDb.getReference(), [backupRanges = backupRanges](Reference tr) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -232,12 +220,8 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Future(Void()); })); - fmt::print("Restore cleared data {}\n", printable(clusterName)); - wait(success(backupAgent.restore(dataDb, dataDb, clusterName, StringRef(backupUrl), {}, backupRanges))); - fmt::print("Restore cluster {}\n", printable(clusterName)); - wait(MetaclusterAPI::restoreCluster(self->managementDb, clusterName, dataDb->getConnectionRecord()->getConnectionString(), @@ -245,8 +229,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { self->dataDbs[clusterName].restored = true; - fmt::print("Restore added back to metacluster {}\n", printable(clusterName)); - return Void(); } @@ -263,14 +245,11 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Void(); } - fmt::print("Create tenant 
{}\n", printable(tenantName)); - loop { try { TenantMapEntry tenantEntry; tenantEntry.tenantGroup = self->chooseTenantGroup(); wait(MetaclusterAPI::createTenant(self->managementDb, tenantName, tenantEntry)); - fmt::print("Created tenant {}\n", printable(tenantName)); TenantMapEntry createdEntry = wait(MetaclusterAPI::getTenant(self->managementDb, tenantName)); self->createdTenants[tenantName] = TenantData(createdEntry.assignedCluster.get(), createdEntry.tenantGroup, beforeBackup); @@ -282,13 +261,11 @@ struct MetaclusterRestoreWorkload : TestWorkload { } return Void(); } catch (Error& e) { - fmt::print("Tenant create error {} {}\n", e.what(), printable(tenantName)); if (e.code() != error_code_metacluster_no_capacity) { throw; } wait(increaseMetaclusterCapacity(self)); - fmt::print("Increased metacluster capacity {}\n", printable(tenantName)); } } } @@ -306,8 +283,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Void(); } - fmt::print("Delete tenant {}\n", printable(tenantName)); - wait(MetaclusterAPI::deleteTenant(self->managementDb, tenantName)); auto const& tenantData = self->createdTenants[tenantName]; @@ -375,7 +350,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { } wait(increaseMetaclusterCapacity(self)); - fmt::print("Increased metacluster capacity {}\n", printable(tenantName)); } } } @@ -514,7 +488,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { for (auto tenantName : clusterData.tenants) { TenantData tenantData = self->createdTenants[tenantName]; if (tenantData.beforeBackup) { - fmt::print("Expected tenant: {}\n", printable(tenantName)); ++expectedTenantCount; auto tenantItr = tenantMap.find(tenantName); ASSERT(tenantItr != tenantMap.end()); @@ -525,12 +498,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { } } - fmt::print("Size check: {} {}\n", tenants.size(), expectedTenantCount); - for (auto tenant : tenants) { - fmt::print("Has tenant {}, {}\n", - printable(tenant.first), - clusterData.tenants.find(tenant.first) != 
clusterData.tenants.end()); - } ASSERT(tenants.size() == expectedTenantCount); } diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h index bdd627c2f2..05548883c8 100755 --- a/flow/include/flow/error_definitions.h +++ b/flow/include/flow/error_definitions.h @@ -269,6 +269,7 @@ ERROR( management_cluster_invalid_access, 2167, "Standard transactions cannot be ERROR( tenant_creation_permanently_failed, 2168, "The tenant creation did not complete in a timely manner and has permanently failed" ) ERROR( cluster_removed, 2169, "The cluster is being removed from the metacluster" ) ERROR( cluster_restoring, 2170, "The cluster is being restored to the metacluster" ) +ERROR( invalid_data_cluster, 2171, "The data cluster being restored has no record of its metacluster" ) // 2200 - errors from bindings and official APIs ERROR( api_version_unset, 2200, "API version is not set" ) From 5c7329570926a9db211764dcdb090f4cedbc320d Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 24 Oct 2022 14:51:20 -0700 Subject: [PATCH 22/57] Better enforcement that a restoring data cluster won't be modified --- .../fdbclient/MetaclusterManagement.actor.h | 64 +++++++++++++++---- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index d809434307..13855f432d 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -209,8 +209,27 @@ struct MetaclusterOperationContext { Optional metaclusterRegistration; Optional dataClusterMetadata; - MetaclusterOperationContext(Reference managementDb, Optional clusterName = {}) - : managementDb(managementDb), clusterName(clusterName) {} + std::set extraSupportedDataClusterStates; + + MetaclusterOperationContext(Reference managementDb, + Optional clusterName = {}, + std::set extraSupportedDataClusterStates = {}) + : 
managementDb(managementDb), clusterName(clusterName), + extraSupportedDataClusterStates(extraSupportedDataClusterStates) {} + + void checkClusterState() { + DataClusterState clusterState = + dataClusterMetadata.present() ? dataClusterMetadata.get().entry.clusterState : DataClusterState::READY; + if (clusterState != DataClusterState::READY && extraSupportedDataClusterStates.count(clusterState) == 0) { + if (clusterState == DataClusterState::RESTORING) { + throw cluster_restoring(); + } else if (clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } + + ASSERT(false); + } + } // Run a transaction on the management cluster. This verifies that the cluster is a management cluster and matches // the same metacluster that we've run any previous transactions on. If a clusterName is set, it also verifies that @@ -285,6 +304,8 @@ struct MetaclusterOperationContext { } } + self->checkClusterState(); + state decltype(std::declval()(Reference()).getValue()) result = wait(func(tr)); @@ -313,6 +334,8 @@ struct MetaclusterOperationContext { ASSERT(self->metaclusterRegistration.present() && self->metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA); + self->checkClusterState(); + state Reference tr = self->dataClusterDb->createTransaction(); loop { try { @@ -358,6 +381,8 @@ struct MetaclusterOperationContext { wait(store(self->dataClusterDb, openDatabase(self->dataClusterMetadata.get().connectionString))); } + self->checkClusterState(); + return Void(); } @@ -733,7 +758,8 @@ struct RemoveClusterImpl { Optional lastTenantId; RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove) - : ctx(managementDb, clusterName), forceRemove(forceRemove) {} + : ctx(managementDb, clusterName, { DataClusterState::REMOVING, DataClusterState::RESTORING }), + forceRemove(forceRemove) {} // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. 
ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { @@ -1106,7 +1132,6 @@ struct RestoreClusterImpl { MetaclusterOperationContext ctx; // Initialization parameters - ClusterName clusterName; ClusterConnectionString connectionString; ApplyManagementClusterUpdates applyManagementClusterUpdates; @@ -1121,7 +1146,7 @@ struct RestoreClusterImpl { ClusterName clusterName, ClusterConnectionString connectionString, ApplyManagementClusterUpdates applyManagementClusterUpdates) - : ctx(managementDb, clusterName), clusterName(clusterName), connectionString(connectionString), + : ctx(managementDb, clusterName, { DataClusterState::RESTORING }), connectionString(connectionString), applyManagementClusterUpdates(applyManagementClusterUpdates) {} // Check that the restored data cluster has a matching metacluster registration entry @@ -1150,6 +1175,13 @@ struct RestoreClusterImpl { updateClusterMetadata( tr, ctx.clusterName.get(), ctx.dataClusterMetadata.get(), connectionString, updatedEntry); + + // Remove this cluster from the cluster capacity index, but leave its configured capacity intact in the + // cluster entry. This allows us to retain the configured capacity while preventing the cluster from + // being used to allocate new tenant groups. + DataClusterEntry noCapacityEntry = updatedEntry; + noCapacityEntry.capacity.numTenantGroups = 0; + updateClusterCapacityIndex(tr, ctx.clusterName.get(), updatedEntry, noCapacityEntry); } TraceEvent("MarkedDataClusterRestoring").detail("Name", ctx.clusterName.get()); @@ -1161,6 +1193,11 @@ struct RestoreClusterImpl { updatedEntry.clusterState = DataClusterState::READY; updateClusterMetadata(tr, ctx.clusterName.get(), ctx.dataClusterMetadata.get(), {}, updatedEntry); + + // Add this cluster back to the cluster capacity index so that it can be assigned to again. 
+ DataClusterEntry noCapacityEntry = updatedEntry; + noCapacityEntry.capacity.numTenantGroups = 0; + updateClusterCapacityIndex(tr, ctx.clusterName.get(), noCapacityEntry, updatedEntry); } TraceEvent("MarkedDataClusterReady") @@ -1169,10 +1206,11 @@ struct RestoreClusterImpl { } ACTOR static Future markManagementTenantAsError(Reference tr, - TenantName tenantName) { + TenantName tenantName, + int64_t tenantId) { state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantName)); - if (!tenantEntry.present()) { + if (!tenantEntry.present() || tenantEntry.get().id != tenantId) { return Void(); } @@ -1213,7 +1251,7 @@ struct RestoreClusterImpl { if (t.second.tenantState != TenantState::RENAMING_FROM) { self->mgmtClusterTenantIdIndex.emplace(t.second.id, t.first); } - if (t.second.assignedCluster.present() && self->clusterName == t.second.assignedCluster.get()) { + if (t.second.assignedCluster.present() && self->ctx.clusterName.get() == t.second.assignedCluster.get()) { self->mgmtClusterTenantSetForCurrentDataCluster.emplace(t.first); } } @@ -1283,6 +1321,10 @@ struct RestoreClusterImpl { TenantMapEntry updatedEntry) { TenantMapEntry existingEntry = wait(TenantAPI::getTenantTransaction(tr, tenantName)); updatedEntry.assignedCluster = Optional(); + + // It should not be possible to modify tenants in the data cluster while it is in the restoring state + ASSERT(existingEntry.id == updatedEntry.id); + if (existingEntry.configurationSequenceNum <= updatedEntry.configurationSequenceNum) { wait(TenantAPI::configureTenantTransaction(tr, tenantName, existingEntry, updatedEntry)); } @@ -1313,7 +1355,7 @@ struct RestoreClusterImpl { state TenantMapEntry managementTenant = self->mgmtClusterTenantMap[managementName]; ASSERT(managementTenant.assignedCluster.present() && - managementTenant.assignedCluster.get() == self->clusterName); + managementTenant.assignedCluster.get() == self->ctx.clusterName.get()); // Rename if (tenantName != managementName) { @@ -1407,8 +1449,8 @@ 
struct RestoreClusterImpl { managementTenant.tenantState != TenantState::REGISTERING && managementTenant.tenantState != TenantState::REMOVING && managementTenant.tenantState != TenantState::ERROR) { - wait(self->ctx.runManagementTransaction([tenantName](Reference tr) { - return markManagementTenantAsError(tr, tenantName); + wait(self->ctx.runManagementTransaction([tenantName, managementTenant](Reference tr) { + return markManagementTenantAsError(tr, tenantName, managementTenant.id); })); } ++setItr; From 7179655b4402d7e3a2470bdb94c994cf47431811 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 25 Oct 2022 09:04:55 -0700 Subject: [PATCH 23/57] Fix merge issue --- fdbserver/SimulatedCluster.actor.cpp | 31 +++++++++++++--------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index ed1afd0981..9e27d5c4c2 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2512,23 +2512,20 @@ void setupSimulatedSystem(std::vector>* systemActors, LocalityData localities(Optional>(), newZoneId, newMachineId, dcUID); localities.set("data_hall"_sr, dcUID); - systemActors->push_back( - reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), - conn, - extraIps, - sslEnabled, - localities, - processClass, - baseFolder, - false, - machine == useSeedForMachine, - testConfig.extraDatabaseBackupAgents ? AgentAddition : AgentNone, - sslOnly, - whitelistBinPaths, - protocolVersion, - configDBType, - true), - "SimulatedMachine")); + systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), + extraIps, + sslEnabled, + localities, + processClass, + baseFolder, + false, + machine == useSeedForMachine, + testConfig.extraDatabaseBackupAgents ? 
AgentAddition : AgentNone, + sslOnly, + whitelistBinPaths, + protocolVersion, + configDBType), + "SimulatedMachine")); ++cluster; } } From 73b4cae1ae1ef0949a4692a08eef8d2880d73b84 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 25 Oct 2022 09:11:29 -0700 Subject: [PATCH 24/57] Fix formatting issues --- .../fdbclient/MetaclusterManagement.actor.h | 7 +++-- fdbserver/SimulatedCluster.actor.cpp | 29 ++++++++++--------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 13855f432d..48f5fe13b6 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1449,9 +1449,10 @@ struct RestoreClusterImpl { managementTenant.tenantState != TenantState::REGISTERING && managementTenant.tenantState != TenantState::REMOVING && managementTenant.tenantState != TenantState::ERROR) { - wait(self->ctx.runManagementTransaction([tenantName, managementTenant](Reference tr) { - return markManagementTenantAsError(tr, tenantName, managementTenant.id); - })); + wait(self->ctx.runManagementTransaction( + [tenantName, managementTenant](Reference tr) { + return markManagementTenantAsError(tr, tenantName, managementTenant.id); + })); } ++setItr; } diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 9e27d5c4c2..78c96e56e2 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2512,20 +2512,21 @@ void setupSimulatedSystem(std::vector>* systemActors, LocalityData localities(Optional>(), newZoneId, newMachineId, dcUID); localities.set("data_hall"_sr, dcUID); - systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), - extraIps, - sslEnabled, - localities, - processClass, - baseFolder, - false, - machine == useSeedForMachine, - testConfig.extraDatabaseBackupAgents 
? AgentAddition : AgentNone, - sslOnly, - whitelistBinPaths, - protocolVersion, - configDBType), - "SimulatedMachine")); + systemActors->push_back( + reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), + extraIps, + sslEnabled, + localities, + processClass, + baseFolder, + false, + machine == useSeedForMachine, + testConfig.extraDatabaseBackupAgents ? AgentAddition : AgentNone, + sslOnly, + whitelistBinPaths, + protocolVersion, + configDBType), + "SimulatedMachine")); ++cluster; } } From 795e8aa63f46e328a2b892bc2515b53d5fba898f Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 28 Oct 2022 15:08:38 -0700 Subject: [PATCH 25/57] Update the metacluster restore test to run operations while the backup is in flight --- .../MetaclusterRestoreWorkload.actor.cpp | 75 ++++++++++++------- tests/slow/MetaclusterRecovery.toml | 2 +- 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index bea63ad232..d29cd6142e 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -52,13 +52,15 @@ struct MetaclusterRestoreWorkload : TestWorkload { }; struct TenantData { + enum class CreateTime { BEFORE_BACKUP, DURING_BACKUP, AFTER_BACKUP }; + ClusterName cluster; Optional tenantGroup; - bool beforeBackup = true; + CreateTime createTime = CreateTime::BEFORE_BACKUP; TenantData() {} - TenantData(ClusterName cluster, Optional tenantGroup, bool beforeBackup) - : cluster(cluster), tenantGroup(tenantGroup), beforeBackup(beforeBackup) {} + TenantData(ClusterName cluster, Optional tenantGroup, CreateTime createTime) + : cluster(cluster), tenantGroup(tenantGroup), createTime(createTime) {} }; Reference managementDb; @@ -73,6 +75,9 @@ struct MetaclusterRestoreWorkload : TestWorkload { int maxTenantGroups; int tenantGroupCapacity; + bool backupComplete = false; + double 
endTime = std::numeric_limits::max(); + MetaclusterRestoreWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { maxTenants = std::min(1e8 - 1, getOption(options, "maxTenants"_sr, 1000)); initialTenants = std::min(maxTenants, getOption(options, "initialTenants"_sr, 100)); @@ -175,7 +180,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { } while (self->createdTenants.size() < self->initialTenants) { - wait(createTenant(self, true)); + wait(createTenant(self, TenantData::CreateTime::BEFORE_BACKUP)); } return Void(); @@ -232,7 +237,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Void(); } - ACTOR static Future createTenant(MetaclusterRestoreWorkload* self, bool beforeBackup) { + ACTOR static Future createTenant(MetaclusterRestoreWorkload* self, TenantData::CreateTime createTime) { state TenantName tenantName; for (int i = 0; i < 10; ++i) { tenantName = self->chooseTenantName(); @@ -252,7 +257,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { wait(MetaclusterAPI::createTenant(self->managementDb, tenantName, tenantEntry)); TenantMapEntry createdEntry = wait(MetaclusterAPI::getTenant(self->managementDb, tenantName)); self->createdTenants[tenantName] = - TenantData(createdEntry.assignedCluster.get(), createdEntry.tenantGroup, beforeBackup); + TenantData(createdEntry.assignedCluster.get(), createdEntry.tenantGroup, createTime); auto& dataDb = self->dataDbs[createdEntry.assignedCluster.get()]; dataDb.tenants.insert(tenantName); if (createdEntry.tenantGroup.present()) { @@ -393,6 +398,25 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Void(); } + ACTOR static Future runOperations(MetaclusterRestoreWorkload* self) { + while (now() < self->endTime) { + state int operation = deterministicRandom()->randomInt(0, 4); + if (operation == 0) { + wait(createTenant(self, + self->backupComplete ? 
TenantData::CreateTime::AFTER_BACKUP + : TenantData::CreateTime::DURING_BACKUP)); + } else if (operation == 1) { + wait(deleteTenant(self)); + } else if (operation == 2) { + wait(configureTenant(self)); + } else if (operation == 3) { + wait(renameTenant(self)); + } + } + + return Void(); + } + Future start(Database const& cx) override { if (clientId == 0) { return _start(cx, this); @@ -413,7 +437,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { clustersToRestore.insert(deterministicRandom()->randomChoice(self->dataDbIndex)); } - // TODO: partially completed operations before backup + state Future opsFuture = runOperations(self); state std::map> backups; for (auto cluster : clustersToRestore) { @@ -424,20 +448,10 @@ struct MetaclusterRestoreWorkload : TestWorkload { wait(success(f)); } - // Make random tenant mutations - state int tenantMutationNum; - for (tenantMutationNum = 0; tenantMutationNum < 100; ++tenantMutationNum) { - state int operation = deterministicRandom()->randomInt(0, 4); - if (operation == 0) { - wait(createTenant(self, false)); - } else if (operation == 1) { - wait(deleteTenant(self)); - } else if (operation == 2) { - wait(configureTenant(self)); - } else if (operation == 3) { - wait(renameTenant(self)); - } - } + self->backupComplete = true; + self->endTime = now() + 30.0; + + wait(opsFuture); std::vector> restores; for (auto [cluster, backupUrl] : backups) { @@ -487,14 +501,16 @@ struct MetaclusterRestoreWorkload : TestWorkload { std::map tenantMap(tenants.begin(), tenants.end()); for (auto tenantName : clusterData.tenants) { TenantData tenantData = self->createdTenants[tenantName]; - if (tenantData.beforeBackup) { + auto tenantItr = tenantMap.find(tenantName); + if (tenantData.createTime == TenantData::CreateTime::BEFORE_BACKUP) { ++expectedTenantCount; - auto tenantItr = tenantMap.find(tenantName); ASSERT(tenantItr != tenantMap.end()); ASSERT(tenantData.cluster == clusterName); ASSERT(tenantItr->second.tenantGroup == 
tenantData.tenantGroup); - } else { - ASSERT(tenantMap.count(tenantName) == 0); + } else if (tenantData.createTime == TenantData::CreateTime::AFTER_BACKUP) { + ASSERT(tenantItr == tenantMap.end()); + } else if (tenantItr != tenantMap.end()) { + ++expectedTenantCount; } } @@ -512,8 +528,11 @@ struct MetaclusterRestoreWorkload : TestWorkload { std::map tenantMap(tenants.begin(), tenants.end()); for (auto& [tenantName, tenantData] : self->createdTenants) { TenantMapEntry const& entry = tenantMap[tenantName]; - if (!tenantData.beforeBackup && self->dataDbs[tenantData.cluster].restored) { - ASSERT(entry.tenantState == TenantState::ERROR); + if (tenantData.createTime != TenantData::CreateTime::BEFORE_BACKUP && + self->dataDbs[tenantData.cluster].restored) { + ASSERT(entry.tenantState == TenantState::ERROR || + (entry.tenantState == TenantState::READY && + tenantData.createTime == TenantData::CreateTime::DURING_BACKUP)); } else { ASSERT(entry.tenantState == TenantState::READY); } @@ -531,7 +550,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { ACTOR static Future _check(MetaclusterRestoreWorkload* self) { // The metacluster consistency check runs the tenant consistency check for each cluster state MetaclusterConsistencyCheck metaclusterConsistencyCheck( - self->managementDb, AllowPartialMetaclusterOperations::False); + self->managementDb, AllowPartialMetaclusterOperations::True); wait(metaclusterConsistencyCheck.run()); diff --git a/tests/slow/MetaclusterRecovery.toml b/tests/slow/MetaclusterRecovery.toml index 98d5b5fc71..304d789bf8 100644 --- a/tests/slow/MetaclusterRecovery.toml +++ b/tests/slow/MetaclusterRecovery.toml @@ -1,10 +1,10 @@ [configuration] allowDefaultTenant = false -allowDisablingTenants = false allowCreatingTenants = false extraDatabaseMode = 'Multiple' extraDatabaseCount = 5 extraDatabaseBackupAgents = true +tenantModes = ['optional', 'required'] [[test]] testTitle = 'MetaclusterRestoreTest' From 3a9b2d072138a14d49519c9d58681963fa2551cf Mon Sep 
17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 3 Nov 2022 15:38:33 -0700 Subject: [PATCH 26/57] Fix merge error --- fdbserver/SimulatedCluster.actor.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 78c96e56e2..ed1afd0981 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2514,6 +2514,7 @@ void setupSimulatedSystem(std::vector>* systemActors, localities.set("data_hall"_sr, dcUID); systemActors->push_back( reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), + conn, extraIps, sslEnabled, localities, @@ -2525,7 +2526,8 @@ void setupSimulatedSystem(std::vector>* systemActors, sslOnly, whitelistBinPaths, protocolVersion, - configDBType), + configDBType, + true), "SimulatedMachine")); ++cluster; } From 6665eb66547172f9b24dbab25cdb36aabb36bdda Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 31 Jan 2023 16:06:45 -0800 Subject: [PATCH 27/57] Update cluster recovery for tenant ID changes --- .../fdbclient/MetaclusterManagement.actor.h | 234 ++++++++---------- .../workloads/MetaclusterConsistency.actor.h | 3 - .../MetaclusterRestoreWorkload.actor.cpp | 3 +- .../TenantManagementWorkload.actor.cpp | 11 +- 4 files changed, 112 insertions(+), 139 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 48f5fe13b6..a6959c09b8 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1136,11 +1136,10 @@ struct RestoreClusterImpl { ApplyManagementClusterUpdates applyManagementClusterUpdates; // Tenant list from data and management clusters - std::unordered_map dataClusterTenantMap; - std::unordered_set dataClusterTenantIdSet; - std::unordered_map mgmtClusterTenantMap; - std::unordered_map mgmtClusterTenantIdIndex; - std::unordered_set 
mgmtClusterTenantSetForCurrentDataCluster; + std::unordered_map dataClusterTenantMap; + std::unordered_set dataClusterTenantNames; + std::unordered_map mgmtClusterTenantMap; + std::unordered_set mgmtClusterTenantSetForCurrentDataCluster; RestoreClusterImpl(Reference managementDb, ClusterName clusterName, @@ -1205,76 +1204,57 @@ struct RestoreClusterImpl { .detail("Version", tr->getCommittedVersion()); } - ACTOR static Future markManagementTenantAsError(Reference tr, - TenantName tenantName, - int64_t tenantId) { - state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantName)); + ACTOR static Future markManagementTenantAsError(Reference tr, int64_t tenantId) { + state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantId)); - if (!tenantEntry.present() || tenantEntry.get().id != tenantId) { + if (!tenantEntry.present()) { return Void(); } tenantEntry.get().tenantState = TenantState::ERROR; tenantEntry.get().error = "The tenant is missing after restoring its data cluster"; - ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantName, tenantEntry.get()); + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantId, tenantEntry.get()); return Void(); } ACTOR static Future getTenantsFromDataCluster(RestoreClusterImpl* self, Reference tr) { - TenantNameRef begin = ""_sr; - TenantNameRef end = "\xff\xff"_sr; - state Future>> tenantsFuture = - TenantMetadata::tenantMap().getRange(tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - state KeyBackedRangeResult> tenants = wait(tenantsFuture); + state KeyBackedRangeResult> tenants = + wait(TenantMetadata::tenantMap().getRange(tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); for (auto t : tenants.results) { self->dataClusterTenantMap.emplace(t.first, t.second); - self->dataClusterTenantIdSet.emplace(t.second.id); + self->dataClusterTenantNames.insert(t.second.tenantName); } return Void(); } - ACTOR static Future> getTenantsFromManagementCluster( - RestoreClusterImpl* 
self, - Reference tr, - TenantName initialTenantName) { - TenantNameRef begin = initialTenantName; - TenantNameRef end = "\xff\xff"_sr; - state Future>> tenantsFuture = - ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( - tr, begin, end, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - state KeyBackedRangeResult> tenants = wait(tenantsFuture); + ACTOR static Future> getTenantsFromManagementCluster(RestoreClusterImpl* self, + Reference tr, + int64_t initialTenantId) { + state KeyBackedRangeResult> tenants = + wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( + tr, initialTenantId, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); for (auto t : tenants.results) { self->mgmtClusterTenantMap.emplace(t.first, t.second); - if (t.second.tenantState != TenantState::RENAMING_FROM) { - self->mgmtClusterTenantIdIndex.emplace(t.second.id, t.first); - } if (t.second.assignedCluster.present() && self->ctx.clusterName.get() == t.second.assignedCluster.get()) { self->mgmtClusterTenantSetForCurrentDataCluster.emplace(t.first); } } - return std::pair{ !tenants.more, - tenants.results.empty() ? ""_sr : tenants.results.rbegin()->first }; + return tenants.more ? Optional(tenants.results.rbegin()->first + 1) : Optional(); } ACTOR static Future getAllTenantsFromManagementCluster(RestoreClusterImpl* self) { // get all tenants across all data clusters - state TenantName beginTenant = ""_sr; - loop { - std::pair tenantsItr = wait(self->ctx.runManagementTransaction( - [self = self, beginTenant = beginTenant](Reference tr) { - return getTenantsFromManagementCluster(self, tr, beginTenant); - })); - - if (!tenantsItr.first) { - // Not all tenants retrieved yet. Begin next loop from this tenant. 
- beginTenant = keyAfter(tenantsItr.second); - } else { - break; - } + state Optional beginTenant = 0; + while (beginTenant.present()) { + wait(store(beginTenant, + self->ctx.runManagementTransaction( + [self = self, beginTenant = beginTenant](Reference tr) { + return getTenantsFromManagementCluster(self, tr, beginTenant.get()); + }))); } return Void(); @@ -1286,29 +1266,26 @@ struct RestoreClusterImpl { TenantName oldTenantName, TenantName newTenantName, int configurationSequenceNum) { - state Optional oldEntry; - state Optional newEntry; + state Optional entry; + state Optional newId; - wait(store(oldEntry, TenantAPI::tryGetTenantTransaction(tr, oldTenantName)) && - store(newEntry, TenantAPI::tryGetTenantTransaction(tr, newTenantName))); + wait(store(entry, TenantAPI::tryGetTenantTransaction(tr, tenantId))); + wait(store(newId, TenantMetadata::tenantNameIndex().get(tr, newTenantName))); - if (oldEntry.present() && oldEntry.get().id == tenantId && !newEntry.present()) { + if (entry.present() && entry.get().tenantName == oldTenantName && !newId.present()) { wait(TenantAPI::renameTenantTransaction( tr, oldTenantName, newTenantName, tenantId, ClusterType::METACLUSTER_DATA, configurationSequenceNum)); return Void(); - } else if (newEntry.present() && newEntry.get().id == tenantId) { - // Rename already succeeded - return Void(); } TraceEvent(SevWarnAlways, "RestoreDataClusterRenameError") .detail("OldName", oldTenantName) .detail("NewName", newTenantName) .detail("TenantID", tenantId) - .detail("OldEntryPresent", oldEntry.present()) - .detail("NewEntryPresent", newEntry.present()); + .detail("OldEntryPresent", entry.present()) + .detail("NewEntryPresent", newId.present()); - if (newEntry.present()) { + if (newId.present()) { throw tenant_already_exists(); } else { throw tenant_not_found(); @@ -1317,16 +1294,16 @@ struct RestoreClusterImpl { ACTOR static Future updateTenantConfiguration(RestoreClusterImpl* self, Reference tr, - TenantName tenantName, + int64_t 
tenantId, TenantMapEntry updatedEntry) { - TenantMapEntry existingEntry = wait(TenantAPI::getTenantTransaction(tr, tenantName)); + TenantMapEntry existingEntry = wait(TenantAPI::getTenantTransaction(tr, tenantId)); updatedEntry.assignedCluster = Optional(); // It should not be possible to modify tenants in the data cluster while it is in the restoring state - ASSERT(existingEntry.id == updatedEntry.id); + ASSERT(existingEntry.tenantName == updatedEntry.tenantName); if (existingEntry.configurationSequenceNum <= updatedEntry.configurationSequenceNum) { - wait(TenantAPI::configureTenantTransaction(tr, tenantName, existingEntry, updatedEntry)); + wait(TenantAPI::configureTenantTransaction(tr, existingEntry, updatedEntry)); } return Void(); @@ -1335,82 +1312,72 @@ struct RestoreClusterImpl { // Updates a tenant to match the management cluster state // Returns the name of the tenant after it has been reconciled ACTOR static Future>> reconcileTenant(RestoreClusterImpl* self, - TenantName tenantName, TenantMapEntry tenantEntry) { - - auto tenantNameOnMgmtCluster = self->mgmtClusterTenantIdIndex.find(tenantEntry.id); + state std::unordered_map::iterator managementEntry = + self->mgmtClusterTenantMap.find(tenantEntry.id); // Delete - if (tenantNameOnMgmtCluster == self->mgmtClusterTenantIdIndex.end()) { + if (managementEntry == self->mgmtClusterTenantMap.end()) { if (self->applyManagementClusterUpdates) { - wait(self->ctx.runDataClusterTransaction( - [tenantName = tenantName, tenantEntry = tenantEntry](Reference tr) { - return TenantAPI::deleteTenantTransaction( - tr, tenantName, tenantEntry.id, ClusterType::METACLUSTER_DATA); - })); + wait(self->ctx.runDataClusterTransaction([tenantEntry = tenantEntry](Reference tr) { + return TenantAPI::deleteTenantTransaction(tr, tenantEntry.id, ClusterType::METACLUSTER_DATA); + })); } return Optional>(); - } else { - state TenantName managementName = self->mgmtClusterTenantIdIndex[tenantEntry.id]; - state TenantMapEntry managementTenant 
= self->mgmtClusterTenantMap[managementName]; - - ASSERT(managementTenant.assignedCluster.present() && - managementTenant.assignedCluster.get() == self->ctx.clusterName.get()); - - // Rename - if (tenantName != managementName) { - ASSERT(managementTenant.tenantState != TenantState::RENAMING_FROM); - state TenantName temporaryName; - if (self->dataClusterTenantMap.count(managementName) > 0) { - temporaryName = metaclusterTemporaryRenamePrefix.withSuffix(managementName); - } else { - temporaryName = managementName; - } - - if (self->applyManagementClusterUpdates) { - wait(self->ctx.runDataClusterTransaction( - [self = self, - tenantName = tenantName, - temporaryName = temporaryName, - tenantEntry = tenantEntry, - managementTenant = managementTenant](Reference tr) { - return renameTenant(self, - tr, - tenantEntry.id, - tenantName, - temporaryName, - managementTenant.configurationSequenceNum); - })); - } - tenantName = temporaryName; - // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the RENAMING state - } - - // Update configuration - if (!managementTenant.matchesConfiguration(tenantEntry) || - managementTenant.configurationSequenceNum != tenantEntry.configurationSequenceNum) { - if (self->applyManagementClusterUpdates) { - ASSERT(managementTenant.configurationSequenceNum >= tenantEntry.configurationSequenceNum); - wait(self->ctx.runDataClusterTransaction( - [self = self, tenantName = tenantName, managementTenant = managementTenant]( - Reference tr) { - return updateTenantConfiguration(self, tr, tenantName, managementTenant); - })); - } - // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the - // UPDATING_CONFIGURATION state - } - - return std::make_pair(tenantName, managementTenant); } + + state TenantName tenantName = tenantEntry.tenantName; + state TenantMapEntry managementTenant = managementEntry->second; + ASSERT(managementTenant.assignedCluster.present() && + 
managementTenant.assignedCluster.get() == self->ctx.clusterName.get()); + + // Rename + if (tenantName != managementTenant.tenantName) { + state TenantName temporaryName; + if (self->dataClusterTenantNames.count(managementTenant.tenantName) > 0) { + temporaryName = metaclusterTemporaryRenamePrefix.withSuffix(managementTenant.tenantName); + } else { + temporaryName = managementTenant.tenantName; + } + + if (self->applyManagementClusterUpdates) { + wait(self->ctx.runDataClusterTransaction([self = self, + tenantName = tenantName, + temporaryName = temporaryName, + tenantEntry = tenantEntry, + managementTenant = + managementTenant](Reference tr) { + return renameTenant( + self, tr, tenantEntry.id, tenantName, temporaryName, managementTenant.configurationSequenceNum); + })); + } + tenantName = temporaryName; + // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the RENAMING state + } + + // Update configuration + if (!managementTenant.matchesConfiguration(tenantEntry) || + managementTenant.configurationSequenceNum != tenantEntry.configurationSequenceNum) { + if (self->applyManagementClusterUpdates) { + ASSERT(managementTenant.configurationSequenceNum >= tenantEntry.configurationSequenceNum); + wait(self->ctx.runDataClusterTransaction( + [self = self, tenantId = tenantEntry.id, managementTenant = managementTenant]( + Reference tr) { + return updateTenantConfiguration(self, tr, tenantId, managementTenant); + })); + } + // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the + // UPDATING_CONFIGURATION state + } + + return std::make_pair(tenantName, managementTenant); } ACTOR static Future reconcileTenants(RestoreClusterImpl* self) { state std::unordered_map partiallyRenamedTenants; - state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); + state std::unordered_map::iterator itr = self->dataClusterTenantMap.begin(); while (itr != self->dataClusterTenantMap.end()) { - Optional> result = - 
wait(reconcileTenant(self, itr->first, itr->second)); + Optional> result = wait(reconcileTenant(self, itr->second)); if (result.present() && result.get().first.startsWith(metaclusterTemporaryRenamePrefix)) { partiallyRenamedTenants[result.get().first] = result.get().second; } @@ -1434,10 +1401,10 @@ struct RestoreClusterImpl { } ACTOR static Future processMissingTenants(RestoreClusterImpl* self) { - state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); + state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { - TenantName tenantName = *setItr; - TenantMapEntry const& managementTenant = self->mgmtClusterTenantMap[tenantName]; + int64_t tenantId = *setItr; + TenantMapEntry const& managementTenant = self->mgmtClusterTenantMap[tenantId]; // If a tenant is present on the management cluster and not on the data cluster, mark it in an error state // unless it is already in certain states (e.g. REGISTERING, REMOVING) that allow the tenant to be missing @@ -1445,14 +1412,13 @@ struct RestoreClusterImpl { // // SOMEDAY: this could optionally complete the partial operations (e.g. 
finish creating or removing the // tenant) - if (self->dataClusterTenantIdSet.find(managementTenant.id) == self->dataClusterTenantIdSet.end() && + if (self->dataClusterTenantMap.find(tenantId) == self->dataClusterTenantMap.end() && managementTenant.tenantState != TenantState::REGISTERING && managementTenant.tenantState != TenantState::REMOVING && managementTenant.tenantState != TenantState::ERROR) { - wait(self->ctx.runManagementTransaction( - [tenantName, managementTenant](Reference tr) { - return markManagementTenantAsError(tr, tenantName, managementTenant.id); - })); + wait(self->ctx.runManagementTransaction([tenantId](Reference tr) { + return markManagementTenantAsError(tr, tenantId); + })); } ++setItr; } @@ -1581,9 +1547,9 @@ struct CreateTenantImpl { return true; } else { // The previous creation is permanently failed, so cleanup the tenant and create it again from - // scratch. We don't need to remove it from the tenant map because we will overwrite the existing + // scratch. We don't need to remove it from the tenant name index because we will overwrite the existing // entry later in this transaction. 
- ManagementClusterMetadata::tenantMetadata().tenantIdIndex.erase(tr, existingEntry.get().id); + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, existingEntry.get().id); ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, -1, MutationRef::AddValue); ManagementClusterMetadata::clusterTenantCount.atomicOp( tr, existingEntry.get().assignedCluster.get(), -1, MutationRef::AddValue); diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h index d03fbf782e..3398363d7b 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h @@ -296,9 +296,6 @@ private: for (auto tenantName : expectedTenants) { TenantMapEntry const& metaclusterEntry = self->managementMetadata.tenantMap[tenantName]; if (!dataClusterTenantMap.count(tenantName)) { - if (metaclusterEntry.tenantGroup.present()) { - tenantGroupsWithCompletedTenants.insert(metaclusterEntry.tenantGroup.get()); - } ASSERT(metaclusterEntry.tenantState == TenantState::REGISTERING || metaclusterEntry.tenantState == TenantState::REMOVING || metaclusterEntry.tenantState == TenantState::ERROR); diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index d29cd6142e..7cbbdec54d 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -253,8 +253,9 @@ struct MetaclusterRestoreWorkload : TestWorkload { loop { try { TenantMapEntry tenantEntry; + tenantEntry.tenantName = tenantName; tenantEntry.tenantGroup = self->chooseTenantGroup(); - wait(MetaclusterAPI::createTenant(self->managementDb, tenantName, tenantEntry)); + wait(MetaclusterAPI::createTenant(self->managementDb, tenantEntry, AssignClusterAutomatically::True)); TenantMapEntry createdEntry = 
wait(MetaclusterAPI::getTenant(self->managementDb, tenantName)); self->createdTenants[tenantName] = TenantData(createdEntry.assignedCluster.get(), createdEntry.tenantGroup, createTime); diff --git a/fdbserver/workloads/TenantManagementWorkload.actor.cpp b/fdbserver/workloads/TenantManagementWorkload.actor.cpp index f60258f3c0..7f7b534ab0 100644 --- a/fdbserver/workloads/TenantManagementWorkload.actor.cpp +++ b/fdbserver/workloads/TenantManagementWorkload.actor.cpp @@ -1496,12 +1496,19 @@ struct TenantManagementWorkload : TestWorkload { state bool specialKeysUseInvalidTuple = operationType == OperationType::SPECIAL_KEYS && deterministicRandom()->random01() < 0.1; + // True if any selected options would change the tenant's configuration and we would expect an update to be + // written + state bool configurationChanging = false; + // Generate a tenant group. Sometimes do this at the same time that we include an invalid option to ensure // that the configure function still fails if (!hasInvalidOption || deterministicRandom()->coinflip()) { newTenantGroup = self->chooseTenantGroup(true); hasSystemTenantGroup = hasSystemTenantGroup || newTenantGroup.orDefault(""_sr).startsWith("\xff"_sr); configuration["tenant_group"_sr] = newTenantGroup; + if (exists && itr->second.tenantGroup != newTenantGroup) { + configurationChanging = true; + } } if (hasInvalidOption) { configuration["invalid_option"_sr] = ""_sr; @@ -1517,7 +1524,9 @@ struct TenantManagementWorkload : TestWorkload { ASSERT(!hasSystemTenantGroup); ASSERT(!specialKeysUseInvalidTuple); Versionstamp currentVersionstamp = wait(getLastTenantModification(self, operationType)); - ASSERT_GT(currentVersionstamp.version, originalReadVersion); + if (configurationChanging) { + ASSERT_GT(currentVersionstamp.version, originalReadVersion); + } auto itr = self->createdTenants.find(tenant); if (itr->second.tenantGroup.present()) { From 0410f7d9a1e59327a7e3242339a6222f93e7acb9 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Tue, 7 Feb 2023 15:08:46 -0800 Subject: [PATCH 28/57] Add intial support for restoring a management cluster in a metacluster --- fdbcli/MetaclusterCommands.actor.cpp | 3 +- fdbclient/Metacluster.cpp | 2 + .../fdbclient/MetaclusterManagement.actor.h | 346 ++++++++---- .../include/fdbclient/RunTransaction.actor.h | 15 + .../MetaclusterManagementWorkload.actor.cpp | 3 +- .../MetaclusterRestoreWorkload.actor.cpp | 524 ++++++++++++++---- 6 files changed, 692 insertions(+), 201 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 2b3c508b41..32acf57988 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -197,7 +197,8 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); return true; } else if (tokens[4] == "repopulate_from_data_cluster"_sr) { - fmt::print(stderr, "ERROR: the `repopulate_from_data_cluster' restore mode is not currently supported\n"); + wait(MetaclusterAPI::restoreCluster( + db, tokens[2], config.get().first.get(), ApplyManagementClusterUpdates::False)); return false; } else { fmt::print(stderr, "ERROR: unrecognized restore mode `{}'\n", printable(tokens[4])); diff --git a/fdbclient/Metacluster.cpp b/fdbclient/Metacluster.cpp index 4d4efcda56..ffc14bd778 100644 --- a/fdbclient/Metacluster.cpp +++ b/fdbclient/Metacluster.cpp @@ -24,6 +24,8 @@ FDB_DEFINE_BOOLEAN_PARAM(ApplyManagementClusterUpdates); FDB_DEFINE_BOOLEAN_PARAM(RemoveMissingTenants); FDB_DEFINE_BOOLEAN_PARAM(AssignClusterAutomatically); +FDB_DEFINE_BOOLEAN_PARAM(GroupAlreadyExists); +FDB_DEFINE_BOOLEAN_PARAM(IsRestoring); std::string clusterTypeToString(const ClusterType& clusterType) { switch (clusterType) { diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index a6959c09b8..7f5f170076 100644 --- 
a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -87,6 +87,8 @@ struct DataClusterMetadata { FDB_DECLARE_BOOLEAN_PARAM(ApplyManagementClusterUpdates); FDB_DECLARE_BOOLEAN_PARAM(RemoveMissingTenants); FDB_DECLARE_BOOLEAN_PARAM(AssignClusterAutomatically); +FDB_DECLARE_BOOLEAN_PARAM(GroupAlreadyExists); +FDB_DECLARE_BOOLEAN_PARAM(IsRestoring); namespace MetaclusterAPI { @@ -592,14 +594,18 @@ void updateClusterMetadata(Transaction tr, ClusterNameRef name, DataClusterMetadata const& previousMetadata, Optional const& updatedConnectionString, - Optional const& updatedEntry) { + Optional const& updatedEntry, + IsRestoring isRestoring = IsRestoring::False) { if (updatedEntry.present()) { if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { throw cluster_removed(); - } else if (previousMetadata.entry.clusterState == DataClusterState::RESTORING && + } else if (!isRestoring && previousMetadata.entry.clusterState == DataClusterState::RESTORING && (!updatedEntry.present() || updatedEntry.get().clusterState != DataClusterState::READY)) { throw cluster_restoring(); + } else if (isRestoring) { + ASSERT(previousMetadata.entry.clusterState == DataClusterState::RESTORING || + updatedEntry.get().clusterState == DataClusterState::RESTORING); } ManagementClusterMetadata::dataClusters().set(tr, name, updatedEntry.get()); updateClusterCapacityIndex(tr, name, previousMetadata.entry, updatedEntry.get()); @@ -609,6 +615,37 @@ void updateClusterMetadata(Transaction tr, } } +// Store the cluster entry for the new cluster +ACTOR template +static Future registerInManagementCluster(Transaction tr, + ClusterName clusterName, + DataClusterEntry clusterEntry, + ClusterConnectionString connectionString) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, clusterName)); + if (dataClusterMetadata.present() && + 
!dataClusterMetadata.get().matchesConfiguration(DataClusterMetadata(clusterEntry, connectionString))) { + throw cluster_already_exists(); + } else if (!dataClusterMetadata.present()) { + clusterEntry.allocated = ClusterUsage(); + + if (clusterEntry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.insert( + tr, Tuple::makeTuple(clusterEntry.allocated.numTenantGroups, clusterName)); + } + ManagementClusterMetadata::dataClusters().set(tr, clusterName, clusterEntry); + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, clusterName, connectionString); + } + + TraceEvent("RegisteredDataCluster") + .detail("ClusterName", clusterName) + .detail("ClusterID", clusterEntry.id) + .detail("Capacity", clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", connectionString.toString()); + + return Void(); +} + template struct RegisterClusterImpl { MetaclusterOperationContext ctx; @@ -696,42 +733,15 @@ struct RegisterClusterImpl { } } - // Store the cluster entry for the new cluster - ACTOR static Future registerInManagementCluster(RegisterClusterImpl* self, - Reference tr) { - state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); - if (dataClusterMetadata.present() && !dataClusterMetadata.get().matchesConfiguration( - DataClusterMetadata(self->clusterEntry, self->connectionString))) { - throw cluster_already_exists(); - } else if (!dataClusterMetadata.present()) { - self->clusterEntry.allocated = ClusterUsage(); - - if (self->clusterEntry.hasCapacity()) { - ManagementClusterMetadata::clusterCapacityIndex.insert( - tr, Tuple::makeTuple(self->clusterEntry.allocated.numTenantGroups, self->clusterName)); - } - ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, self->clusterEntry); - ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); - } - - TraceEvent("RegisteredDataCluster") - 
.detail("ClusterName", self->clusterName) - .detail("ClusterID", self->clusterEntry.id) - .detail("Capacity", self->clusterEntry.capacity) - .detail("Version", tr->getCommittedVersion()) - .detail("ConnectionString", self->connectionString.toString()); - - return Void(); - } - ACTOR static Future run(RegisterClusterImpl* self) { wait(self->ctx.runManagementTransaction( [self = self](Reference tr) { return registrationPrecheck(self, tr); })); // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we don't // have a metacluster registration on the data cluster wait(configureDataCluster(self)); - wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return registerInManagementCluster(self, tr); })); + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return registerInManagementCluster(tr, self->clusterName, self->clusterEntry, self->connectionString); + })); return Void(); } Future run() { return run(this); } @@ -1059,7 +1069,8 @@ template void managementClusterAddTenantToGroup(Transaction tr, TenantMapEntry tenantEntry, DataClusterMetadata* clusterMetadata, - bool groupAlreadyExists) { + GroupAlreadyExists groupAlreadyExists, + IsRestoring isRestoring = IsRestoring::False) { if (tenantEntry.tenantGroup.present()) { if (tenantEntry.tenantGroup.get().startsWith("\xff"_sr)) { throw invalid_tenant_group_name(); @@ -1076,13 +1087,17 @@ void managementClusterAddTenantToGroup(Transaction tr, } if (!groupAlreadyExists) { - ASSERT(clusterMetadata->entry.hasCapacity()); + ASSERT(isRestoring || clusterMetadata->entry.hasCapacity()); DataClusterEntry updatedEntry = clusterMetadata->entry; ++updatedEntry.allocated.numTenantGroups; - updateClusterMetadata( - tr, tenantEntry.assignedCluster.get(), *clusterMetadata, Optional(), updatedEntry); + updateClusterMetadata(tr, + tenantEntry.assignedCluster.get(), + *clusterMetadata, + Optional(), + updatedEntry, + isRestoring); clusterMetadata->entry 
= updatedEntry; } @@ -1132,9 +1147,13 @@ struct RestoreClusterImpl { MetaclusterOperationContext ctx; // Initialization parameters + ClusterName clusterName; ClusterConnectionString connectionString; ApplyManagementClusterUpdates applyManagementClusterUpdates; + // Loaded from the data cluster + UID dataClusterId; + // Tenant list from data and management clusters std::unordered_map dataClusterTenantMap; std::unordered_set dataClusterTenantNames; @@ -1145,45 +1164,78 @@ struct RestoreClusterImpl { ClusterName clusterName, ClusterConnectionString connectionString, ApplyManagementClusterUpdates applyManagementClusterUpdates) - : ctx(managementDb, clusterName, { DataClusterState::RESTORING }), connectionString(connectionString), - applyManagementClusterUpdates(applyManagementClusterUpdates) {} + : ctx(managementDb, {}, { DataClusterState::RESTORING }), clusterName(clusterName), + connectionString(connectionString), applyManagementClusterUpdates(applyManagementClusterUpdates) {} - // Check that the restored data cluster has a matching metacluster registration entry - ACTOR static Future verifyDataClusterMatch(RestoreClusterImpl* self) { + // If restoring a data cluster, verify that it has a matching registration entry + // If adding a data cluster to a restored management cluster, update the data cluster registration entry + // with the new management cluster name/ID + ACTOR static Future processMetaclusterRegistration(RestoreClusterImpl* self) { state Reference db = wait(openDatabase(self->connectionString)); - Optional metaclusterRegistration = - wait(MetaclusterMetadata::metaclusterRegistration().get(db)); + state Reference tr = db->createTransaction(); - if (!metaclusterRegistration.present()) { - throw invalid_data_cluster(); - } + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); - if 
(!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { - throw cluster_already_exists(); + if (!metaclusterRegistration.present()) { + throw invalid_data_cluster(); + } + + if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { + if (self->applyManagementClusterUpdates) { + throw cluster_already_exists(); + } else { + MetaclusterMetadata::metaclusterRegistration().set( + tr, + self->ctx.metaclusterRegistration.get().toDataClusterRegistration( + metaclusterRegistration.get().name, metaclusterRegistration.get().id)); + + wait(safeThreadFutureToFuture(tr->commit())); + } + } + + self->dataClusterId = metaclusterRegistration.get().id; + break; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } } self->ctx.dataClusterDb = db; - return Void(); } - void markClusterRestoring(Reference tr) { - if (ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { - DataClusterEntry updatedEntry = ctx.dataClusterMetadata.get().entry; + ACTOR static Future markClusterRestoring(RestoreClusterImpl* self, Reference tr) { + // If we are attaching a new data cluster, then we need to register it + if (!self->applyManagementClusterUpdates) { + DataClusterEntry entry; + entry.id = self->dataClusterId; + entry.clusterState = DataClusterState::RESTORING; + wait(registerInManagementCluster(tr, self->clusterName, entry, self->connectionString)); + wait(self->ctx.setCluster(tr, self->clusterName)); + } else if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { + DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::RESTORING; - updateClusterMetadata( - tr, ctx.clusterName.get(), ctx.dataClusterMetadata.get(), connectionString, updatedEntry); + updateClusterMetadata(tr, + self->ctx.clusterName.get(), + self->ctx.dataClusterMetadata.get(), + self->connectionString, + updatedEntry); // 
Remove this cluster from the cluster capacity index, but leave its configured capacity intact in the // cluster entry. This allows us to retain the configured capacity while preventing the cluster from // being used to allocate new tenant groups. DataClusterEntry noCapacityEntry = updatedEntry; noCapacityEntry.capacity.numTenantGroups = 0; - updateClusterCapacityIndex(tr, ctx.clusterName.get(), updatedEntry, noCapacityEntry); + updateClusterCapacityIndex(tr, self->ctx.clusterName.get(), updatedEntry, noCapacityEntry); } - TraceEvent("MarkedDataClusterRestoring").detail("Name", ctx.clusterName.get()); + TraceEvent("MarkedDataClusterRestoring").detail("Name", self->ctx.clusterName.get()); + return Void(); } void markClusterAsReady(Reference tr) { @@ -1260,6 +1312,61 @@ struct RestoreClusterImpl { return Void(); } + ACTOR static Future addTenantToManagementCluster(RestoreClusterImpl* self, + Reference tr, + TenantMapEntry tenantEntry) { + state Future> tenantGroupEntry = Optional(); + if (tenantEntry.tenantGroup.present()) { + tenantGroupEntry = + ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get(tr, tenantEntry.tenantGroup.get()); + } + + Optional existingEntry = wait(tryGetTenantTransaction(tr, tenantEntry.tenantName)); + if (existingEntry.present()) { + if (existingEntry.get().assignedCluster == self->ctx.clusterName) { + ASSERT(existingEntry.get().matchesConfiguration(tenantEntry)); + // This is a retry, so return success + return Void(); + } else { + // TODO: Handle this better + throw tenant_already_exists(); + } + } + + int64_t lastTenantId = + wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.getD(tr, Snapshot::False, 0)); + ManagementClusterMetadata::tenantMetadata().lastTenantId.set(tr, std::max(lastTenantId, tenantEntry.id)); + + tenantEntry.tenantState = TenantState::READY; + tenantEntry.assignedCluster = self->ctx.clusterName; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantEntry.id, tenantEntry); + 
ManagementClusterMetadata::tenantMetadata().tenantNameIndex.set(tr, tenantEntry.tenantName, tenantEntry.id); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); + + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantName, tenantEntry.id)); + + wait(success(tenantGroupEntry)); + + if (tenantGroupEntry.get().present() && tenantGroupEntry.get().get().assignedCluster != self->ctx.clusterName) { + // TODO: Handle this better + throw invalid_tenant_configuration(); + } + + managementClusterAddTenantToGroup(tr, + tenantEntry, + &self->ctx.dataClusterMetadata.get(), + GroupAlreadyExists(tenantGroupEntry.get().present()), + IsRestoring::True); + + return Void(); + } + ACTOR static Future renameTenant(RestoreClusterImpl* self, Reference tr, int64_t tenantId, @@ -1272,10 +1379,19 @@ struct RestoreClusterImpl { wait(store(entry, TenantAPI::tryGetTenantTransaction(tr, tenantId))); wait(store(newId, TenantMetadata::tenantNameIndex().get(tr, newTenantName))); - if (entry.present() && entry.get().tenantName == oldTenantName && !newId.present()) { - wait(TenantAPI::renameTenantTransaction( - tr, oldTenantName, newTenantName, tenantId, ClusterType::METACLUSTER_DATA, configurationSequenceNum)); - return Void(); + if (entry.present()) { + if (entry.get().tenantName == oldTenantName && !newId.present()) { + wait(TenantAPI::renameTenantTransaction(tr, + oldTenantName, + newTenantName, + tenantId, + ClusterType::METACLUSTER_DATA, + configurationSequenceNum)); + return Void(); + } else if (entry.get().tenantName == newTenantName && newId.present() && newId.get() == tenantId) { + // The 
tenant has already been renamed + return Void(); + } } TraceEvent(SevWarnAlways, "RestoreDataClusterRenameError") @@ -1299,8 +1415,13 @@ struct RestoreClusterImpl { TenantMapEntry existingEntry = wait(TenantAPI::getTenantTransaction(tr, tenantId)); updatedEntry.assignedCluster = Optional(); - // It should not be possible to modify tenants in the data cluster while it is in the restoring state - ASSERT(existingEntry.tenantName == updatedEntry.tenantName); + // The tenant should have already been renamed, so in most cases its name will match. + // If we had to break a rename cycle using temporary tenant names, use that in the updated + // entry here since the rename will be completed later. + if (existingEntry.tenantName != updatedEntry.tenantName) { + ASSERT(existingEntry.tenantName.startsWith(metaclusterTemporaryRenamePrefix)); + updatedEntry.tenantName = existingEntry.tenantName; + } if (existingEntry.configurationSequenceNum <= updatedEntry.configurationSequenceNum) { wait(TenantAPI::configureTenantTransaction(tr, existingEntry, updatedEntry)); @@ -1315,32 +1436,47 @@ struct RestoreClusterImpl { TenantMapEntry tenantEntry) { state std::unordered_map::iterator managementEntry = self->mgmtClusterTenantMap.find(tenantEntry.id); + // Delete if (managementEntry == self->mgmtClusterTenantMap.end()) { if (self->applyManagementClusterUpdates) { wait(self->ctx.runDataClusterTransaction([tenantEntry = tenantEntry](Reference tr) { return TenantAPI::deleteTenantTransaction(tr, tenantEntry.id, ClusterType::METACLUSTER_DATA); })); + } else { + wait(self->ctx.runManagementTransaction( + [self = self, tenantEntry = tenantEntry](Reference tr) { + return addTenantToManagementCluster(self, tr, tenantEntry); + })); } return Optional>(); - } - - state TenantName tenantName = tenantEntry.tenantName; - state TenantMapEntry managementTenant = managementEntry->second; - ASSERT(managementTenant.assignedCluster.present() && - managementTenant.assignedCluster.get() == 
self->ctx.clusterName.get()); - - // Rename - if (tenantName != managementTenant.tenantName) { - state TenantName temporaryName; - if (self->dataClusterTenantNames.count(managementTenant.tenantName) > 0) { - temporaryName = metaclusterTemporaryRenamePrefix.withSuffix(managementTenant.tenantName); - } else { - temporaryName = managementTenant.tenantName; + } else if (!self->applyManagementClusterUpdates) { + // We have an ID match with an existing tenant. This is only allowed if we are retrying the restore, in + // which case we expect that the tenant will have the same name, ID, and assigned cluster. + if (managementEntry->second.tenantName != tenantEntry.tenantName || + managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get()) { + // TODO: better handling of this error + throw tenant_already_exists(); } - if (self->applyManagementClusterUpdates) { + return Optional>(); + } else { + state TenantName tenantName = tenantEntry.tenantName; + state TenantMapEntry managementTenant = managementEntry->second; + + ASSERT(managementTenant.assignedCluster.present() && + managementTenant.assignedCluster.get() == self->ctx.clusterName.get()); + + // Rename + if (tenantName != managementTenant.tenantName) { + state TenantName temporaryName; + if (self->dataClusterTenantNames.count(managementTenant.tenantName) > 0) { + temporaryName = metaclusterTemporaryRenamePrefix.withSuffix(managementTenant.tenantName); + } else { + temporaryName = managementTenant.tenantName; + } + wait(self->ctx.runDataClusterTransaction([self = self, tenantName = tenantName, temporaryName = temporaryName, @@ -1350,27 +1486,24 @@ struct RestoreClusterImpl { return renameTenant( self, tr, tenantEntry.id, tenantName, temporaryName, managementTenant.configurationSequenceNum); })); + tenantName = temporaryName; + // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the RENAMING state } - tenantName = temporaryName; - // SOMEDAY: we could mark the tenant in the 
management cluster as READY if it is in the RENAMING state - } - // Update configuration - if (!managementTenant.matchesConfiguration(tenantEntry) || - managementTenant.configurationSequenceNum != tenantEntry.configurationSequenceNum) { - if (self->applyManagementClusterUpdates) { + // Update configuration + if (!managementTenant.matchesConfiguration(tenantEntry) || + managementTenant.configurationSequenceNum != tenantEntry.configurationSequenceNum) { ASSERT(managementTenant.configurationSequenceNum >= tenantEntry.configurationSequenceNum); wait(self->ctx.runDataClusterTransaction( - [self = self, tenantId = tenantEntry.id, managementTenant = managementTenant]( - Reference tr) { - return updateTenantConfiguration(self, tr, tenantId, managementTenant); + [self = self, managementTenant = managementTenant](Reference tr) { + return updateTenantConfiguration(self, tr, managementTenant.id, managementTenant); })); + // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the + // UPDATING_CONFIGURATION state } - // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the - // UPDATING_CONFIGURATION state - } - return std::make_pair(tenantName, managementTenant); + return std::make_pair(tenantName, managementTenant); + } } ACTOR static Future reconcileTenants(RestoreClusterImpl* self) { @@ -1428,19 +1561,23 @@ struct RestoreClusterImpl { // This only supports the restore of an already registered data cluster, for now. 
ACTOR static Future run(RestoreClusterImpl* self) { - // Run a management transaction to populate the data cluster metadata - wait(self->ctx.runManagementTransaction( - [](Reference tr) { return Future(Void()); })); + // If we are applying management cluster updates, then we expect the data cluster to already be registered there + // Run a management transaction to load the data cluster metadata + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + if (self->applyManagementClusterUpdates) { + return self->ctx.setCluster(tr, self->clusterName); + } else { + return Future(Void()); + } + })); - // Check that the data cluster being restored has the appropriate metacluster registration entry and name - wait(verifyDataClusterMatch(self)); + // Make sure that the data cluster being restored has the appropriate metacluster registration entry and name + wait(processMetaclusterRegistration(self)); // set state to restoring try { - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - self->markClusterRestoring(tr); - return Future(Void()); - })); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markClusterRestoring(self, tr); })); } catch (Error& e) { // If the transaction retries after success or if we are trying a second time to restore the cluster, it // will throw an error indicating that the restore has already started @@ -1712,7 +1849,7 @@ struct CreateTenantImpl { } managementClusterAddTenantToGroup( - tr, self->tenantEntry, &self->ctx.dataClusterMetadata.get(), assignment.second); + tr, self->tenantEntry, &self->ctx.dataClusterMetadata.get(), GroupAlreadyExists(assignment.second)); return Void(); } @@ -2076,7 +2213,8 @@ struct ConfigureTenantImpl { } wait(managementClusterRemoveTenantFromGroup(tr, tenantEntry, &self->ctx.dataClusterMetadata.get())); - managementClusterAddTenantToGroup(tr, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), false); + managementClusterAddTenantToGroup( + 
tr, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), GroupAlreadyExists::False); return Void(); } @@ -2089,14 +2227,16 @@ struct ConfigureTenantImpl { throw cluster_no_capacity(); } wait(managementClusterRemoveTenantFromGroup(tr, tenantEntry, &self->ctx.dataClusterMetadata.get())); - managementClusterAddTenantToGroup(tr, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), false); + managementClusterAddTenantToGroup( + tr, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), GroupAlreadyExists::False); return Void(); } // Moves between groups in the same cluster are freely allowed else if (tenantGroupEntry.get().assignedCluster == tenantEntry.assignedCluster) { wait(managementClusterRemoveTenantFromGroup(tr, tenantEntry, &self->ctx.dataClusterMetadata.get())); - managementClusterAddTenantToGroup(tr, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), true); + managementClusterAddTenantToGroup( + tr, entryWithUpdatedGroup, &self->ctx.dataClusterMetadata.get(), GroupAlreadyExists::True); return Void(); } diff --git a/fdbclient/include/fdbclient/RunTransaction.actor.h b/fdbclient/include/fdbclient/RunTransaction.actor.h index fce0de83e6..fadbf8d82a 100644 --- a/fdbclient/include/fdbclient/RunTransaction.actor.h +++ b/fdbclient/include/fdbclient/RunTransaction.actor.h @@ -51,5 +51,20 @@ Future()(Reference()) } } +ACTOR template +Future runTransactionVoid(Reference db, Function func) { + state Reference tr = db->createTransaction(); + loop { + try { + // func should be idempotent; otherwise, retry will get undefined result + wait(func(tr)); + wait(safeThreadFutureToFuture(tr->commit())); + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } +} + #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index a34a6799d6..fdb5565d72 100644 --- 
a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -905,8 +905,7 @@ struct MetaclusterManagementWorkload : TestWorkload { try { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); wait( - store(metaclusterRegistration, - MetaclusterMetadata::metaclusterRegistration().get(clusterData.db.getReference())) && + store(metaclusterRegistration, MetaclusterMetadata::metaclusterRegistration().get(tr)) && store(tenants, TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, clusterData.tenants.size() + 1))); break; diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index 7cbbdec54d..30ce110309 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -43,7 +43,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { struct DataClusterData { Database db; - std::set tenants; + std::set tenants; std::set tenantGroups; bool restored = false; @@ -52,29 +52,43 @@ struct MetaclusterRestoreWorkload : TestWorkload { }; struct TenantData { - enum class CreateTime { BEFORE_BACKUP, DURING_BACKUP, AFTER_BACKUP }; + enum class AccessTime { NONE, BEFORE_BACKUP, DURING_BACKUP, AFTER_BACKUP }; + TenantName name; ClusterName cluster; Optional tenantGroup; - CreateTime createTime = CreateTime::BEFORE_BACKUP; + AccessTime createTime = AccessTime::BEFORE_BACKUP; + AccessTime renameTime = AccessTime::NONE; + AccessTime configureTime = AccessTime::NONE; TenantData() {} - TenantData(ClusterName cluster, Optional tenantGroup, CreateTime createTime) - : cluster(cluster), tenantGroup(tenantGroup), createTime(createTime) {} + TenantData(TenantName name, ClusterName cluster, Optional tenantGroup, AccessTime createTime) + : name(name), cluster(cluster), tenantGroup(tenantGroup), createTime(createTime) {} + }; + + struct TenantGroupData { + ClusterName cluster; + 
std::set tenants; }; Reference managementDb; std::map dataDbs; std::vector dataDbIndex; - std::map createdTenants; - std::map> tenantGroups; + std::map createdTenants; + std::map tenantNameIndex; + std::map tenantGroups; + + std::set deletedTenants; int initialTenants; int maxTenants; int maxTenantGroups; int tenantGroupCapacity; + bool recoverManagementCluster; + bool recoverDataClusters; + bool backupComplete = false; double endTime = std::numeric_limits::max(); @@ -84,6 +98,9 @@ struct MetaclusterRestoreWorkload : TestWorkload { maxTenantGroups = std::min(2 * maxTenants, getOption(options, "maxTenantGroups"_sr, 20)); tenantGroupCapacity = (initialTenants / 2 + maxTenantGroups - 1) / g_simulator->extraDatabases.size(); + int mode = deterministicRandom()->randomInt(0, 3); + recoverManagementCluster = (mode != 2); + recoverDataClusters = (mode != 1); } void disableFailureInjectionWorkloads(std::set& out) const override { @@ -180,7 +197,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { } while (self->createdTenants.size() < self->initialTenants) { - wait(createTenant(self, TenantData::CreateTime::BEFORE_BACKUP)); + wait(createTenant(self, TenantData::AccessTime::BEFORE_BACKUP)); } return Void(); @@ -195,6 +212,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { addDefaultBackupRanges(backupRanges); + TraceEvent("MetaclusterRestoreWorkloadSubmitBackup").detail("ClusterName", clusterName); try { wait(backupAgent.submitBackup( dataDb, backupContainer, {}, 0, 0, clusterName.toString(), backupRanges, StopWhenDone::True)); @@ -203,19 +221,23 @@ struct MetaclusterRestoreWorkload : TestWorkload { throw; } + TraceEvent("MetaclusterRestoreWorkloadWaitBackup").detail("ClusterName", clusterName); state Reference container; wait(success(backupAgent.waitBackup(dataDb, clusterName.toString(), StopWhenDone::True, &container))); + TraceEvent("MetaclusterRestoreWorkloadBackupComplete").detail("ClusterName", clusterName); return container->getURL(); } ACTOR static Future 
restoreCluster(ClusterName clusterName, Database dataDb, std::string backupUrl, + bool addToMetacluster, MetaclusterRestoreWorkload* self) { state FileBackupAgent backupAgent; state Standalone> backupRanges; addDefaultBackupRanges(backupRanges); + TraceEvent("MetaclusterRestoreWorkloadClearDatabase").detail("ClusterName", clusterName); wait(runTransaction(dataDb.getReference(), [backupRanges = backupRanges](Reference tr) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); @@ -225,28 +247,264 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Future(Void()); })); + TraceEvent("MetaclusterRestoreWorkloadRestoreCluster").detail("ClusterName", clusterName); wait(success(backupAgent.restore(dataDb, dataDb, clusterName, StringRef(backupUrl), {}, backupRanges))); - wait(MetaclusterAPI::restoreCluster(self->managementDb, - clusterName, - dataDb->getConnectionRecord()->getConnectionString(), - ApplyManagementClusterUpdates::True)); + if (addToMetacluster) { + TraceEvent("MetaclusterRestoreWorkloadAddClusterToMetacluster").detail("ClusterName", clusterName); + wait(MetaclusterAPI::restoreCluster(self->managementDb, + clusterName, + dataDb->getConnectionRecord()->getConnectionString(), + ApplyManagementClusterUpdates::True)); + TraceEvent("MetaclusterRestoreWorkloadRestoreComplete").detail("ClusterName", clusterName); + } self->dataDbs[clusterName].restored = true; return Void(); } - ACTOR static Future createTenant(MetaclusterRestoreWorkload* self, TenantData::CreateTime createTime) { + void removeTrackedTenant(int64_t tenantId) { + auto itr = createdTenants.find(tenantId); + if (itr != createdTenants.end()) { + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadRemoveTrackedTenant") + .detail("TenantId", tenantId) + .detail("TenantName", itr->second.name); + deletedTenants.insert(tenantId); + dataDbs[itr->second.cluster].tenants.erase(tenantId); + if (itr->second.tenantGroup.present()) { + tenantGroups[itr->second.tenantGroup.get()].tenants.erase(tenantId); + 
} + createdTenants.erase(itr); + } + } + + Future resolveTenantCollisions( + MetaclusterRestoreWorkload* self, + ClusterName clusterName, + Database dataDb, + std::unordered_map> const& tenantCollisions) { + TraceEvent("MetaclusterRestoreWorkloadDeleteTenantCollisions") + .detail("FromCluster", clusterName) + .detail("TenantCollisions", tenantCollisions.size()); + std::vector> deleteFutures; + for (auto const& t : tenantCollisions) { + // If the data cluster tenant is expected, then remove the management tenant + // Note that the management tenant may also have been expected + if (self->createdTenants.count(t.second.first)) { + removeTrackedTenant(t.second.second); + deleteFutures.push_back(MetaclusterAPI::deleteTenant(self->managementDb, t.second.second)); + } + // We don't expect the data cluster tenant, so delete it + else { + removeTrackedTenant(t.second.first); + deleteFutures.push_back(TenantAPI::deleteTenant( + dataDb.getReference(), t.first, t.second.first, ClusterType::METACLUSTER_DATA)); + } + } + + return waitForAll(deleteFutures); + } + + ACTOR template + static Future> getTenantsInGroup(Transaction tr, + TenantMetadataSpecification tenantMetadata, + TenantGroupName tenantGroup) { + KeyBackedRangeResult groupTenants = + wait(tenantMetadata.tenantGroupTenantIndex.getRange(tr, + Tuple::makeTuple(tenantGroup), + Tuple::makeTuple(keyAfter(tenantGroup)), + CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); + std::unordered_set tenants; + for (auto const& tuple : groupTenants.results) { + tenants.insert(tuple.getInt(1)); + } + + return tenants; + } + + ACTOR Future resolveGroupCollisions(MetaclusterRestoreWorkload* self, + ClusterName clusterName, + Database dataDb, + std::unordered_map groupCollisions) { + TraceEvent("MetaclusterRestoreWorkloadDeleteTenantGroupCollisions") + .detail("FromCluster", clusterName) + .detail("GroupCollisions", groupCollisions.size()); + + state std::vector> deleteFutures; + + state std::unordered_map::const_iterator collisionItr; + for 
(collisionItr = groupCollisions.begin(); collisionItr != groupCollisions.end(); ++collisionItr) { + // The tenant group from the data cluster is what we expect + auto itr = self->tenantGroups.find(collisionItr->first); + if (itr->second.cluster == clusterName) { + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadDeleteTenantGroupCollision") + .detail("From", "ManagementCluster") + .detail("TenantGroup", collisionItr->first); + std::unordered_set tenantsInGroup = + wait(runTransaction(self->managementDb, [collisionItr = collisionItr](Reference tr) { + return getTenantsInGroup( + tr, MetaclusterAPI::ManagementClusterMetadata::tenantMetadata(), collisionItr->first); + })); + + for (auto const& t : tenantsInGroup) { + self->removeTrackedTenant(t); + deleteFutures.push_back(MetaclusterAPI::deleteTenant(self->managementDb, t)); + } + + } + // The tenant group from the management cluster is what we expect + else { + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadDeleteTenantGroupCollision") + .detail("From", "DataCluster") + .detail("TenantGroup", collisionItr->first); + std::unordered_set tenantsInGroup = wait(runTransaction( + dataDb.getReference(), [collisionItr = collisionItr](Reference tr) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return getTenantsInGroup(tr, TenantMetadata::instance(), collisionItr->first); + })); + + deleteFutures.push_back(runTransactionVoid( + dataDb.getReference(), [self = self, tenantsInGroup](Reference tr) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + std::vector> groupDeletions; + for (auto const& t : tenantsInGroup) { + self->removeTrackedTenant(t); + groupDeletions.push_back( + TenantAPI::deleteTenantTransaction(tr, t, ClusterType::METACLUSTER_DATA)); + } + return waitForAll(groupDeletions); + })); + } + } + + wait(waitForAll(deleteFutures)); + return Void(); + } + + ACTOR static Future restoreManagementCluster(MetaclusterRestoreWorkload* self) { 
wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr))); + state std::map::iterator clusterItr; + for (clusterItr = self->dataDbs.begin(); clusterItr != self->dataDbs.end(); ++clusterItr) { + TraceEvent("MetaclusterRestoreWorkloadRecoverManagementCluster").detail("FromCluster", clusterItr->first); + state KeyBackedRangeResult> managementTenantList; + state KeyBackedRangeResult> managementGroupList; + + wait(runTransactionVoid( + self->managementDb, + [managementTenantList = &managementTenantList, + managementGroupList = &managementGroupList](Reference tr) { + return store(*managementTenantList, + MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantNameIndex.getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)) && + store(*managementGroupList, + MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantGroupMap.getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); + })); + ASSERT(managementTenantList.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + ASSERT(managementGroupList.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + + state std::unordered_map managementTenants(managementTenantList.results.begin(), + managementTenantList.results.end()); + state std::unordered_map managementGroups( + managementGroupList.results.begin(), managementGroupList.results.end()); + + state KeyBackedRangeResult> dataClusterTenants; + state KeyBackedRangeResult> dataClusterGroups; + + wait(runTransaction(clusterItr->second.db.getReference(), + [dataClusterTenants = &dataClusterTenants, + dataClusterGroups = &dataClusterGroups](Reference tr) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return store(*dataClusterTenants, + TenantMetadata::tenantNameIndex().getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)) && + store(*dataClusterGroups, + TenantMetadata::tenantGroupMap().getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); + })); + 
ASSERT(dataClusterTenants.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + ASSERT(dataClusterGroups.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + + state std::unordered_map> tenantCollisions; + state std::unordered_map groupCollisions; + for (auto const& t : dataClusterTenants.results) { + auto itr = managementTenants.find(t.first); + if (itr != managementTenants.end()) { + tenantCollisions[t.first] = std::make_pair(t.second, itr->second); + } + } + for (auto const& g : dataClusterGroups.results) { + if (managementGroups.count(g.first)) { + groupCollisions[g.first] = g.second; + } + } + + loop { + try { + TraceEvent("MetaclusterRestoreWorkloadRecoverManagementCluster") + .detail("FromCluster", clusterItr->first) + .detail("TenantCollisions", tenantCollisions.size()); + wait(MetaclusterAPI::restoreCluster( + self->managementDb, + clusterItr->first, + clusterItr->second.db->getConnectionRecord()->getConnectionString(), + ApplyManagementClusterUpdates::False)); + + ASSERT(tenantCollisions.empty() && groupCollisions.empty()); + break; + } catch (Error& e) { + if ((e.code() == error_code_tenant_already_exists && !tenantCollisions.empty()) || + (e.code() == error_code_invalid_tenant_configuration && !groupCollisions.empty())) { + if (!tenantCollisions.empty()) { + wait(self->resolveTenantCollisions( + self, clusterItr->first, clusterItr->second.db, tenantCollisions)); + tenantCollisions.clear(); + } + if (!groupCollisions.empty()) { + wait(self->resolveGroupCollisions( + self, clusterItr->first, clusterItr->second.db, groupCollisions)); + groupCollisions.clear(); + } + } else { + throw; + } + } + } + TraceEvent("MetaclusterRestoreWorkloadRecoveredManagementCluster").detail("FromCluster", clusterItr->first); + } + + TraceEvent("MetaclusterRestoreWorkloadRestoredManagementCluster"); + return Void(); + } + + ACTOR static Future resetManagementCluster(MetaclusterRestoreWorkload* self) { + state Reference tr = self->managementDb->createTransaction(); 
+ TraceEvent("MetaclusterRestoreWorkloadEraseManagementCluster"); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->clear(""_sr, "\xff"_sr); + MetaclusterMetadata::metaclusterRegistration().clear(tr); + wait(safeThreadFutureToFuture(tr->commit())); + TraceEvent("MetaclusterRestoreWorkloadManagementClusterErased"); + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + ACTOR static Future createTenant(MetaclusterRestoreWorkload* self, TenantData::AccessTime createTime) { state TenantName tenantName; for (int i = 0; i < 10; ++i) { tenantName = self->chooseTenantName(); - if (self->createdTenants.count(tenantName) == 0) { + if (self->tenantNameIndex.count(tenantName) == 0) { break; } } - if (self->createdTenants.count(tenantName)) { + if (self->tenantNameIndex.count(tenantName)) { return Void(); } @@ -257,12 +515,19 @@ struct MetaclusterRestoreWorkload : TestWorkload { tenantEntry.tenantGroup = self->chooseTenantGroup(); wait(MetaclusterAPI::createTenant(self->managementDb, tenantEntry, AssignClusterAutomatically::True)); TenantMapEntry createdEntry = wait(MetaclusterAPI::getTenant(self->managementDb, tenantName)); - self->createdTenants[tenantName] = - TenantData(createdEntry.assignedCluster.get(), createdEntry.tenantGroup, createTime); + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadCreatedTenant") + .detail("Tenant", tenantName) + .detail("TenantId", createdEntry.id) + .detail("AccessTime", createTime); + self->createdTenants[createdEntry.id] = + TenantData(tenantName, createdEntry.assignedCluster.get(), createdEntry.tenantGroup, createTime); + self->tenantNameIndex[tenantName] = createdEntry.id; auto& dataDb = self->dataDbs[createdEntry.assignedCluster.get()]; - dataDb.tenants.insert(tenantName); + dataDb.tenants.insert(createdEntry.id); if (createdEntry.tenantGroup.present()) { - self->tenantGroups[createdEntry.tenantGroup.get()].insert(tenantName); + auto& tenantGroupData = 
self->tenantGroups[createdEntry.tenantGroup.get()]; + tenantGroupData.cluster = createdEntry.assignedCluster.get(); + tenantGroupData.tenants.insert(createdEntry.id); dataDb.tenantGroups.insert(createdEntry.tenantGroup.get()); } return Void(); @@ -276,52 +541,62 @@ struct MetaclusterRestoreWorkload : TestWorkload { } } - ACTOR static Future deleteTenant(MetaclusterRestoreWorkload* self) { + ACTOR static Future deleteTenant(MetaclusterRestoreWorkload* self, TenantData::AccessTime accessTime) { state TenantName tenantName; for (int i = 0; i < 10; ++i) { tenantName = self->chooseTenantName(); - if (self->createdTenants.count(tenantName) != 0) { + if (self->tenantNameIndex.count(tenantName) != 0) { break; } } - if (self->createdTenants.count(tenantName) == 0) { + if (self->tenantNameIndex.count(tenantName) == 0) { return Void(); } + state int64_t tenantId = self->tenantNameIndex[tenantName]; + + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadDeleteTenant") + .detail("Tenant", tenantName) + .detail("TenantId", tenantId) + .detail("AccessTime", accessTime); wait(MetaclusterAPI::deleteTenant(self->managementDb, tenantName)); - auto const& tenantData = self->createdTenants[tenantName]; + auto const& tenantData = self->createdTenants[tenantId]; auto& dataDb = self->dataDbs[tenantData.cluster]; - dataDb.tenants.erase(tenantName); + dataDb.tenants.erase(tenantId); if (tenantData.tenantGroup.present()) { auto groupItr = self->tenantGroups.find(tenantData.tenantGroup.get()); - groupItr->second.erase(tenantName); - if (groupItr->second.empty()) { + groupItr->second.tenants.erase(tenantId); + if (groupItr->second.tenants.empty()) { self->tenantGroups.erase(groupItr); dataDb.tenantGroups.erase(tenantData.tenantGroup.get()); } } - self->createdTenants.erase(tenantName); + self->createdTenants.erase(tenantId); + self->tenantNameIndex.erase(tenantName); + self->deletedTenants.insert(tenantId); return Void(); } - ACTOR static Future configureTenant(MetaclusterRestoreWorkload* self) { 
+ ACTOR static Future configureTenant(MetaclusterRestoreWorkload* self, TenantData::AccessTime accessTime) { state TenantName tenantName; for (int i = 0; i < 10; ++i) { tenantName = self->chooseTenantName(); - if (self->createdTenants.count(tenantName) != 0) { + if (self->tenantNameIndex.count(tenantName) != 0) { break; } } - if (self->createdTenants.count(tenantName) == 0) { + if (self->tenantNameIndex.count(tenantName) == 0) { return Void(); } - state Optional tenantGroup = self->chooseTenantGroup(self->createdTenants[tenantName].cluster); + state int64_t tenantId = self->tenantNameIndex[tenantName]; + + state Optional tenantGroup = self->chooseTenantGroup(self->createdTenants[tenantId].cluster); state std::map, Optional> configurationParams = { { "tenant_group"_sr, tenantGroup } }; @@ -329,25 +604,33 @@ struct MetaclusterRestoreWorkload : TestWorkload { try { wait(MetaclusterAPI::configureTenant(self->managementDb, tenantName, configurationParams)); - auto& tenantData = self->createdTenants[tenantName]; + auto& tenantData = self->createdTenants[tenantId]; + + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadConfigureTenant") + .detail("Tenant", tenantName) + .detail("TenantId", tenantId) + .detail("OldTenantGroup", tenantData.tenantGroup) + .detail("NewTenantGroup", tenantGroup) + .detail("AccessTime", accessTime); + if (tenantData.tenantGroup != tenantGroup) { auto& dataDb = self->dataDbs[tenantData.cluster]; if (tenantData.tenantGroup.present()) { auto groupItr = self->tenantGroups.find(tenantData.tenantGroup.get()); - groupItr->second.erase(tenantName); - if (groupItr->second.empty()) { + groupItr->second.tenants.erase(tenantId); + if (groupItr->second.tenants.empty()) { self->tenantGroups.erase(groupItr); dataDb.tenantGroups.erase(tenantData.tenantGroup.get()); } - self->tenantGroups[tenantData.tenantGroup.get()].erase(tenantName); } if (tenantGroup.present()) { - self->tenantGroups[tenantGroup.get()].insert(tenantName); + 
self->tenantGroups[tenantGroup.get()].tenants.insert(tenantId); dataDb.tenantGroups.insert(tenantGroup.get()); } tenantData.tenantGroup = tenantGroup; + tenantData.configureTime = accessTime; } return Void(); } catch (Error& e) { @@ -360,41 +643,40 @@ struct MetaclusterRestoreWorkload : TestWorkload { } } - ACTOR static Future renameTenant(MetaclusterRestoreWorkload* self) { + ACTOR static Future renameTenant(MetaclusterRestoreWorkload* self, TenantData::AccessTime accessTime) { state TenantName oldTenantName; state TenantName newTenantName; for (int i = 0; i < 10; ++i) { oldTenantName = self->chooseTenantName(); - if (self->createdTenants.count(oldTenantName) != 0) { + if (self->tenantNameIndex.count(oldTenantName) != 0) { break; } } for (int i = 0; i < 10; ++i) { newTenantName = self->chooseTenantName(); - if (self->createdTenants.count(newTenantName) == 0) { + if (self->tenantNameIndex.count(newTenantName) == 0) { break; } } - if (self->createdTenants.count(oldTenantName) == 0 || self->createdTenants.count(newTenantName) != 0) { + if (self->tenantNameIndex.count(oldTenantName) == 0 || self->tenantNameIndex.count(newTenantName) != 0) { return Void(); } + state int64_t tenantId = self->tenantNameIndex[oldTenantName]; + + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadRenameTenant") + .detail("OldTenantName", oldTenantName) + .detail("NewTenantName", newTenantName) + .detail("TenantId", tenantId) + .detail("AccessTime", accessTime); wait(MetaclusterAPI::renameTenant(self->managementDb, oldTenantName, newTenantName)); - auto const& tenantData = self->createdTenants[oldTenantName]; - if (tenantData.tenantGroup.present()) { - auto& tenantGroup = self->tenantGroups[tenantData.tenantGroup.get()]; - tenantGroup.erase(oldTenantName); - tenantGroup.insert(newTenantName); - } - - auto& dataDb = self->dataDbs[tenantData.cluster]; - dataDb.tenants.erase(oldTenantName); - dataDb.tenants.insert(newTenantName); - - self->createdTenants[newTenantName] = tenantData; - 
self->createdTenants.erase(oldTenantName); + TenantData& tenantData = self->createdTenants[tenantId]; + tenantData.name = newTenantName; + tenantData.renameTime = accessTime; + self->tenantNameIndex[newTenantName] = tenantId; + self->tenantNameIndex.erase(oldTenantName); return Void(); } @@ -402,16 +684,16 @@ struct MetaclusterRestoreWorkload : TestWorkload { ACTOR static Future runOperations(MetaclusterRestoreWorkload* self) { while (now() < self->endTime) { state int operation = deterministicRandom()->randomInt(0, 4); + state TenantData::AccessTime accessTime = + self->backupComplete ? TenantData::AccessTime::AFTER_BACKUP : TenantData::AccessTime::DURING_BACKUP; if (operation == 0) { - wait(createTenant(self, - self->backupComplete ? TenantData::CreateTime::AFTER_BACKUP - : TenantData::CreateTime::DURING_BACKUP)); + wait(createTenant(self, accessTime)); } else if (operation == 1) { - wait(deleteTenant(self)); + wait(deleteTenant(self, accessTime)); } else if (operation == 2) { - wait(configureTenant(self)); + wait(configureTenant(self, accessTime)); } else if (operation == 3) { - wait(renameTenant(self)); + wait(renameTenant(self, accessTime)); } } @@ -428,14 +710,24 @@ struct MetaclusterRestoreWorkload : TestWorkload { ACTOR static Future _start(Database cx, MetaclusterRestoreWorkload* self) { state std::set clustersToRestore; - for (auto db : self->dataDbIndex) { - if (deterministicRandom()->random01() < 0.1) { - clustersToRestore.insert(db); - } - } + TraceEvent("MetaclusterRestoreWorkloadStart") + .detail("RecoverManagementCluster", self->recoverManagementCluster) + .detail("RecoverDataClusters", self->recoverDataClusters); - if (clustersToRestore.empty()) { - clustersToRestore.insert(deterministicRandom()->randomChoice(self->dataDbIndex)); + if (self->recoverDataClusters) { + for (auto db : self->dataDbIndex) { + if (deterministicRandom()->random01() < 0.1) { + clustersToRestore.insert(db); + } + } + + if (clustersToRestore.empty()) { + 
clustersToRestore.insert(deterministicRandom()->randomChoice(self->dataDbIndex)); + } + + for (auto c : clustersToRestore) { + TraceEvent("MetaclusterRestoreWorkloadChoseClusterForRestore").detail("ClusterName", c); + } } state Future opsFuture = runOperations(self); @@ -453,14 +745,24 @@ struct MetaclusterRestoreWorkload : TestWorkload { self->endTime = now() + 30.0; wait(opsFuture); + TraceEvent("MetaclusterRestoreWorkloadOperationsComplete"); + + if (self->recoverManagementCluster) { + wait(resetManagementCluster(self)); + } std::vector> restores; for (auto [cluster, backupUrl] : backups) { - restores.push_back(restoreCluster(cluster, self->dataDbs[cluster].db, backupUrl.get(), self)); + restores.push_back(restoreCluster( + cluster, self->dataDbs[cluster].db, backupUrl.get(), !self->recoverManagementCluster, self)); } wait(waitForAll(restores)); + if (self->recoverManagementCluster) { + wait(restoreManagementCluster(self)); + } + return Void(); } @@ -469,75 +771,107 @@ struct MetaclusterRestoreWorkload : TestWorkload { ClusterName clusterName, DataClusterData clusterData) { state Optional metaclusterRegistration; - state std::vector> tenants; + state KeyBackedRangeResult> tenants; state Reference tr = clusterData.db->createTransaction(); loop { try { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); wait( - store(metaclusterRegistration, - MetaclusterMetadata::metaclusterRegistration().get(clusterData.db.getReference())) && + store(metaclusterRegistration, MetaclusterMetadata::metaclusterRegistration().get(tr)) && store(tenants, - TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, clusterData.tenants.size() + 1))); + TenantMetadata::tenantMap().getRange(tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1))); break; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); } } + ASSERT_LE(tenants.results.size(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); ASSERT(metaclusterRegistration.present() && metaclusterRegistration.get().clusterType 
== ClusterType::METACLUSTER_DATA); if (!clusterData.restored) { - ASSERT(tenants.size() == clusterData.tenants.size()); - for (auto [tenantName, tenantEntry] : tenants) { - ASSERT(clusterData.tenants.count(tenantName)); - auto tenantData = self->createdTenants[tenantName]; + ASSERT_EQ(tenants.results.size(), clusterData.tenants.size()); + for (auto [tenantId, tenantEntry] : tenants.results) { + ASSERT(clusterData.tenants.count(tenantId)); + auto tenantData = self->createdTenants[tenantId]; ASSERT(tenantData.cluster == clusterName); ASSERT(tenantData.tenantGroup == tenantEntry.tenantGroup); + ASSERT(tenantData.name == tenantEntry.tenantName); } } else { int expectedTenantCount = 0; - std::map tenantMap(tenants.begin(), tenants.end()); - for (auto tenantName : clusterData.tenants) { - TenantData tenantData = self->createdTenants[tenantName]; - auto tenantItr = tenantMap.find(tenantName); - if (tenantData.createTime == TenantData::CreateTime::BEFORE_BACKUP) { + std::map tenantMap(tenants.results.begin(), tenants.results.end()); + for (auto tenantId : clusterData.tenants) { + TenantData tenantData = self->createdTenants[tenantId]; + auto tenantItr = tenantMap.find(tenantId); + if (tenantData.createTime == TenantData::AccessTime::BEFORE_BACKUP) { ++expectedTenantCount; ASSERT(tenantItr != tenantMap.end()); ASSERT(tenantData.cluster == clusterName); - ASSERT(tenantItr->second.tenantGroup == tenantData.tenantGroup); - } else if (tenantData.createTime == TenantData::CreateTime::AFTER_BACKUP) { + if (!self->recoverManagementCluster || + tenantData.configureTime <= TenantData::AccessTime::BEFORE_BACKUP) { + ASSERT(tenantItr->second.tenantGroup == tenantData.tenantGroup); + } + if (!self->recoverManagementCluster || + tenantData.renameTime <= TenantData::AccessTime::BEFORE_BACKUP) { + ASSERT(tenantItr->second.tenantName == tenantData.name); + } + } else if (tenantData.createTime == TenantData::AccessTime::AFTER_BACKUP) { ASSERT(tenantItr == tenantMap.end()); } else if 
(tenantItr != tenantMap.end()) { ++expectedTenantCount; } } - ASSERT(tenants.size() == expectedTenantCount); + // Check for deleted tenants that reappeared + int unexpectedTenants = 0; + for (auto const& [tenantId, tenantEntry] : tenantMap) { + if (!clusterData.tenants.count(tenantId)) { + ASSERT(self->recoverManagementCluster); + ASSERT(self->deletedTenants.count(tenantId)); + ++unexpectedTenants; + } + } + + ASSERT_EQ(tenantMap.size() - unexpectedTenants, expectedTenantCount); } return Void(); } ACTOR static Future checkTenants(MetaclusterRestoreWorkload* self) { - state std::vector> tenants = wait(MetaclusterAPI::listTenants( - self->managementDb, ""_sr, "\xff\xff"_sr, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); - ASSERT(tenants.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + state KeyBackedRangeResult> tenants = + wait(runTransaction(self->managementDb, [](Reference tr) { + return MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1); + })); - std::map tenantMap(tenants.begin(), tenants.end()); - for (auto& [tenantName, tenantData] : self->createdTenants) { - TenantMapEntry const& entry = tenantMap[tenantName]; - if (tenantData.createTime != TenantData::CreateTime::BEFORE_BACKUP && - self->dataDbs[tenantData.cluster].restored) { - ASSERT(entry.tenantState == TenantState::ERROR || - (entry.tenantState == TenantState::READY && - tenantData.createTime == TenantData::CreateTime::DURING_BACKUP)); + ASSERT_LE(tenants.results.size(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + + std::map tenantMap(tenants.results.begin(), tenants.results.end()); + for (auto& [tenantId, tenantData] : self->createdTenants) { + auto tenantItr = tenantMap.find(tenantId); + if (tenantItr == tenantMap.end()) { + // A tenant that we expected to have been created can only be missing from the management cluster if we + // lost data in the process of recovering both the management and some data clusters + 
ASSERT_NE(tenantData.createTime, TenantData::AccessTime::BEFORE_BACKUP); + ASSERT(self->dataDbs[tenantData.cluster].restored && self->recoverManagementCluster); } else { - ASSERT(entry.tenantState == TenantState::READY); + if (tenantData.createTime != TenantData::AccessTime::BEFORE_BACKUP && + self->dataDbs[tenantData.cluster].restored) { + ASSERT(tenantItr->second.tenantState == TenantState::ERROR || + (tenantItr->second.tenantState == TenantState::READY && + tenantData.createTime == TenantData::AccessTime::DURING_BACKUP)); + } else { + ASSERT_EQ(tenantItr->second.tenantState, TenantState::READY); + } } } + + // TODO: evaluate extra tenants in tenantMap + return Void(); } From 2d59c5681dec9916e34828f967fde58963b4a122 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 9 Feb 2023 08:42:23 -0800 Subject: [PATCH 29/57] Bug fixes and test improvements for management cluster restoration --- fdbcli/MetaclusterCommands.actor.cpp | 50 ++++-- fdbclient/ClientKnobs.cpp | 1 + fdbclient/Tenant.cpp | 7 + fdbclient/include/fdbclient/ClientKnobs.h | 1 + .../fdbclient/MetaclusterManagement.actor.h | 109 ++++++++++--- fdbclient/include/fdbclient/Tenant.h | 2 + .../MetaclusterManagementWorkload.actor.cpp | 4 +- .../MetaclusterRestoreWorkload.actor.cpp | 143 ++++++++++++++---- 8 files changed, 250 insertions(+), 67 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 32acf57988..c2fce540fc 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -190,20 +190,46 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto return false; } - if (tokens[4] == "restore_known_data_cluster"_sr) { - wait(MetaclusterAPI::restoreCluster( - db, tokens[2], config.get().first.get(), ApplyManagementClusterUpdates::True)); + state std::vector messages; + state bool success = true; - fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); - return true; - } else if 
(tokens[4] == "repopulate_from_data_cluster"_sr) { - wait(MetaclusterAPI::restoreCluster( - db, tokens[2], config.get().first.get(), ApplyManagementClusterUpdates::False)); - return false; - } else { - fmt::print(stderr, "ERROR: unrecognized restore mode `{}'\n", printable(tokens[4])); - return false; + try { + if (tokens[4] == "restore_known_data_cluster"_sr) { + wait(MetaclusterAPI::restoreCluster( + db, tokens[2], config.get().first.get(), ApplyManagementClusterUpdates::True, &messages)); + + } else if (tokens[4] == "repopulate_from_data_cluster"_sr) { + wait(MetaclusterAPI::restoreCluster( + db, tokens[2], config.get().first.get(), ApplyManagementClusterUpdates::False, &messages)); + } else { + fmt::print(stderr, "ERROR: unrecognized restore mode `{}'\n", printable(tokens[4])); + success = false; + } + } catch (Error& e) { + success = false; + fmt::print(stderr, "ERROR: {} ({})\n", e.what(), e.code()); } + + if (!messages.empty()) { + if (!success) { + fmt::print("\n"); + } + + fmt::print(success ? stdout : stderr, "The restore reported the following messages:\n"); + for (int i = 0; i < messages.size(); ++i) { + fmt::print(success ? stdout : stderr, " {}. 
{}\n", i + 1, messages[i]); + } + + if (success) { + fmt::print("\n"); + } + } + + if (success) { + fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); + } + + return success; } // metacluster configure command diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 62554b793c..c77559811b 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -297,6 +297,7 @@ void ClientKnobs::initialize(Randomize randomize) { init( METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, 5 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK = 1; init( METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY, 1.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY = deterministicRandom()->random01() * 60; init( METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT = 1 + deterministicRandom()->random01() * 59; + init( METACLUSTER_RESTORE_MISSING_TENANTS_BATCH_SIZE, 1000 ); if ( randomize && BUGGIFY ) METACLUSTER_RESTORE_MISSING_TENANTS_BATCH_SIZE = 1 + deterministicRandom()->randomInt(0, 3); init( TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( CLIENT_ENABLE_USING_CLUSTER_ID_KEY, false ); diff --git a/fdbclient/Tenant.cpp b/fdbclient/Tenant.cpp index 002facf9c0..a68da1b9a9 100644 --- a/fdbclient/Tenant.cpp +++ b/fdbclient/Tenant.cpp @@ -177,6 +177,13 @@ void TenantMapEntry::configure(Standalone parameter, Optional } } +bool TenantMapEntry::operator==(TenantMapEntry const& other) const { + return id == other.id && tenantName == other.tenantName && tenantState == other.tenantState && + tenantLockState == other.tenantLockState && tenantGroup == other.tenantGroup && + assignedCluster == other.assignedCluster && configurationSequenceNum == other.configurationSequenceNum && + renameDestination == other.renameDestination && error 
== other.error; +} + json_spirit::mObject TenantGroupEntry::toJson() const { json_spirit::mObject tenantGroupEntry; if (assignedCluster.present()) { diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index a4d60e6d85..0dbe688193 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -292,6 +292,7 @@ public: int METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK; double METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY; double METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT; + int METACLUSTER_RESTORE_MISSING_TENANTS_BATCH_SIZE; int TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantEntryCache is refreshed bool CLIENT_ENABLE_USING_CLUSTER_ID_KEY; diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 7f5f170076..7b25b14a91 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -93,8 +93,11 @@ FDB_DECLARE_BOOLEAN_PARAM(IsRestoring); namespace MetaclusterAPI { // This prefix is used during a cluster restore if the desired name is in use -// TODO: this should probably live in the `\xff` tenant namespace, but other parts -// of the code are not able to work with `\xff` tenants yet. +// +// SOMEDAY: this should probably live in the `\xff` tenant namespace, but other parts of the code are not able to work +// with `\xff` tenants yet. In the unlikely event that we have regular tenants using this prefix, we have a rename +// cycle, and we have a collision between the names, a restore will fail with an error. This error can be resolved +// manually using tenant rename commands. 
const StringRef metaclusterTemporaryRenamePrefix = "\xfe/restoreTenant/"_sr; struct ManagementClusterMetadata { @@ -505,11 +508,11 @@ Future> createMetacluster(Reference db, ClusterName na if (metaclusterUid.present() && metaclusterUid.get() == existingRegistration.get().metaclusterId) { return Optional(); } else { - return format("cluster is already registered as a %s named `%s'", - existingRegistration.get().clusterType == ClusterType::METACLUSTER_DATA - ? "data cluster" - : "metacluster", - printable(existingRegistration.get().name).c_str()); + return fmt::format("cluster is already registered as a {} named `{}'", + existingRegistration.get().clusterType == ClusterType::METACLUSTER_DATA + ? "data cluster" + : "metacluster", + printable(existingRegistration.get().name)); } } @@ -1150,6 +1153,7 @@ struct RestoreClusterImpl { ClusterName clusterName; ClusterConnectionString connectionString; ApplyManagementClusterUpdates applyManagementClusterUpdates; + std::vector& messages; // Loaded from the data cluster UID dataClusterId; @@ -1163,9 +1167,11 @@ struct RestoreClusterImpl { RestoreClusterImpl(Reference managementDb, ClusterName clusterName, ClusterConnectionString connectionString, - ApplyManagementClusterUpdates applyManagementClusterUpdates) + ApplyManagementClusterUpdates applyManagementClusterUpdates, + std::vector& messages) : ctx(managementDb, {}, { DataClusterState::RESTORING }), clusterName(clusterName), - connectionString(connectionString), applyManagementClusterUpdates(applyManagementClusterUpdates) {} + connectionString(connectionString), applyManagementClusterUpdates(applyManagementClusterUpdates), + messages(messages) {} // If restoring a data cluster, verify that it has a matching registration entry // If adding a data cluster to a restored management cluster, update the data cluster registration entry @@ -1193,7 +1199,7 @@ struct RestoreClusterImpl { self->ctx.metaclusterRegistration.get().toDataClusterRegistration( 
metaclusterRegistration.get().name, metaclusterRegistration.get().id)); - wait(safeThreadFutureToFuture(tr->commit())); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); } } @@ -1256,16 +1262,26 @@ struct RestoreClusterImpl { .detail("Version", tr->getCommittedVersion()); } - ACTOR static Future markManagementTenantAsError(Reference tr, int64_t tenantId) { - state Optional tenantEntry = wait(tryGetTenantTransaction(tr, tenantId)); + ACTOR static Future markManagementTenantsAsError(RestoreClusterImpl* self, + Reference tr, + std::vector tenants) { + state std::vector>> getFutures; + for (auto tenantId : tenants) { + getFutures.push_back(tryGetTenantTransaction(tr, tenantId)); + } + wait(waitForAll(getFutures)); - if (!tenantEntry.present()) { - return Void(); + for (auto const& f : getFutures) { + if (!f.get().present()) { + continue; + } + + TenantMapEntry entry = f.get().get(); + entry.tenantState = TenantState::ERROR; + entry.error = "The tenant is missing after restoring its data cluster"; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, entry.id, entry); } - tenantEntry.get().tenantState = TenantState::ERROR; - tenantEntry.get().error = "The tenant is missing after restoring its data cluster"; - ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantId, tenantEntry.get()); return Void(); } @@ -1328,7 +1344,9 @@ struct RestoreClusterImpl { // This is a retry, so return success return Void(); } else { - // TODO: Handle this better + self->messages.push_back(fmt::format("The tenant `{}' already exists on cluster `{}'", + printable(tenantEntry.tenantName), + printable(existingEntry.get().assignedCluster))); throw tenant_already_exists(); } } @@ -1354,7 +1372,11 @@ struct RestoreClusterImpl { wait(success(tenantGroupEntry)); if (tenantGroupEntry.get().present() && tenantGroupEntry.get().get().assignedCluster != self->ctx.clusterName) { - // TODO: Handle this better + self->messages.push_back( + fmt::format("The tenant `{}' is part of a 
tenant group `{}' that already exists on cluster `{}'", + printable(tenantEntry.tenantName), + printable(tenantEntry.tenantGroup.get()), + printable(tenantGroupEntry.get().get().assignedCluster))); throw invalid_tenant_configuration(); } @@ -1402,8 +1424,17 @@ struct RestoreClusterImpl { .detail("NewEntryPresent", newId.present()); if (newId.present()) { + self->messages.push_back( + fmt::format("Failed to rename the tenant `{}' to `{}' because the new name is already in use", + printable(oldTenantName), + printable(newTenantName))); throw tenant_already_exists(); } else { + self->messages.push_back(fmt::format( + "Failed to rename the tenant `{}' to `{}' because the tenant did not have the expected ID {}", + printable(oldTenantName), + printable(newTenantName), + tenantId)); throw tenant_not_found(); } } @@ -1456,7 +1487,12 @@ struct RestoreClusterImpl { // which case we expect that the tenant will have the same name, ID, and assigned cluster. if (managementEntry->second.tenantName != tenantEntry.tenantName || managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get()) { - // TODO: better handling of this error + self->messages.push_back( + fmt::format("The tenant `{}' has the same ID {} as an existing tenant `{}' on cluster `{}'", + printable(tenantEntry.tenantName), + tenantEntry.id, + printable(managementEntry->second.tenantName), + printable(managementEntry->second.assignedCluster))); throw tenant_already_exists(); } @@ -1535,6 +1571,8 @@ struct RestoreClusterImpl { ACTOR static Future processMissingTenants(RestoreClusterImpl* self) { state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); + state std::vector missingTenants; + state int64_t missingTenantCount = 0; while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { int64_t tenantId = *setItr; TenantMapEntry const& managementTenant = self->mgmtClusterTenantMap[tenantId]; @@ -1549,13 +1587,33 @@ struct RestoreClusterImpl { 
managementTenant.tenantState != TenantState::REGISTERING && managementTenant.tenantState != TenantState::REMOVING && managementTenant.tenantState != TenantState::ERROR) { - wait(self->ctx.runManagementTransaction([tenantId](Reference tr) { - return markManagementTenantAsError(tr, tenantId); - })); + missingTenants.push_back(tenantId); + ++missingTenantCount; + if (missingTenants.size() == CLIENT_KNOBS->METACLUSTER_RESTORE_MISSING_TENANTS_BATCH_SIZE) { + wait(self->ctx.runManagementTransaction( + [self = self, missingTenants = missingTenants](Reference tr) { + return markManagementTenantsAsError(self, tr, missingTenants); + })); + missingTenants.clear(); + } } ++setItr; } + if (missingTenants.size() > 0) { + wait(self->ctx.runManagementTransaction( + [self = self, missingTenants = missingTenants](Reference tr) { + return markManagementTenantsAsError(self, tr, missingTenants); + })); + } + + // This is a best effort attempt to communicate the number of missing tenants. If a restore needs to be run + // twice and is interrupted in the middle of the first attempt to process missing tenants, we may not report a + // full count. 
+ if (missingTenantCount > 0) { + self->messages.push_back(fmt::format( + "The metacluster has {} tenants that are missing in the restored data cluster.", missingTenantCount)); + } return Void(); } @@ -1619,8 +1677,9 @@ ACTOR template Future restoreCluster(Reference db, ClusterName name, ClusterConnectionString connectionString, - ApplyManagementClusterUpdates applyManagementClusterUpdates) { - state RestoreClusterImpl impl(db, name, connectionString, applyManagementClusterUpdates); + ApplyManagementClusterUpdates applyManagementClusterUpdates, + std::vector* messages) { + state RestoreClusterImpl impl(db, name, connectionString, applyManagementClusterUpdates, *messages); wait(impl.run()); return Void(); } diff --git a/fdbclient/include/fdbclient/Tenant.h b/fdbclient/include/fdbclient/Tenant.h index cbd20972d4..2d4149fa36 100644 --- a/fdbclient/include/fdbclient/Tenant.h +++ b/fdbclient/include/fdbclient/Tenant.h @@ -102,6 +102,8 @@ struct TenantMapEntry { return ObjectReader::fromStringRef(value, IncludeVersion()); } + bool operator==(TenantMapEntry const& other) const; + template void serialize(Ar& ar) { serializer(ar, diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index fdb5565d72..a5ec3852bc 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -271,13 +271,15 @@ struct MetaclusterManagementWorkload : TestWorkload { state ClusterName clusterName = self->chooseClusterName(); state DataClusterData* dataDb = &self->dataDbs[clusterName]; + state std::vector messages; try { loop { Future restoreFuture = MetaclusterAPI::restoreCluster(self->managementDb, clusterName, dataDb->db->getConnectionRecord()->getConnectionString(), - ApplyManagementClusterUpdates::True); + ApplyManagementClusterUpdates::True, + &messages); Optional result = wait(timeout(restoreFuture, 
deterministicRandom()->randomInt(1, 30))); if (result.present()) { break; diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index 30ce110309..685a63a4e7 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -46,6 +46,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { std::set tenants; std::set tenantGroups; bool restored = false; + bool restoreHasMessages = false; DataClusterData() {} DataClusterData(Database db) : db(db) {} @@ -80,6 +81,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { std::map tenantGroups; std::set deletedTenants; + std::vector> managementTenantsBeforeRestore; int initialTenants; int maxTenants; @@ -94,7 +96,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { MetaclusterRestoreWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { maxTenants = std::min(1e8 - 1, getOption(options, "maxTenants"_sr, 1000)); - initialTenants = std::min(maxTenants, getOption(options, "initialTenants"_sr, 100)); + initialTenants = std::min(maxTenants, getOption(options, "initialTenants"_sr, 40)); maxTenantGroups = std::min(2 * maxTenants, getOption(options, "maxTenantGroups"_sr, 20)); tenantGroupCapacity = (initialTenants / 2 + maxTenantGroups - 1) / g_simulator->extraDatabases.size(); @@ -196,10 +198,14 @@ struct MetaclusterRestoreWorkload : TestWorkload { wait(MetaclusterAPI::registerCluster(self->managementDb, clusterName, ccs, clusterEntry)); } + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadCreateTenants").detail("NumTenants", self->initialTenants); + while (self->createdTenants.size() < self->initialTenants) { wait(createTenant(self, TenantData::AccessTime::BEFORE_BACKUP)); } + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadCreateTenantsComplete"); + return Void(); } @@ -228,11 +234,11 @@ struct MetaclusterRestoreWorkload : TestWorkload { return container->getURL(); } - ACTOR static 
Future restoreCluster(ClusterName clusterName, - Database dataDb, - std::string backupUrl, - bool addToMetacluster, - MetaclusterRestoreWorkload* self) { + ACTOR static Future restoreDataCluster(ClusterName clusterName, + Database dataDb, + std::string backupUrl, + bool addToMetacluster, + MetaclusterRestoreWorkload* self) { state FileBackupAgent backupAgent; state Standalone> backupRanges; addDefaultBackupRanges(backupRanges); @@ -250,16 +256,19 @@ struct MetaclusterRestoreWorkload : TestWorkload { TraceEvent("MetaclusterRestoreWorkloadRestoreCluster").detail("ClusterName", clusterName); wait(success(backupAgent.restore(dataDb, dataDb, clusterName, StringRef(backupUrl), {}, backupRanges))); + state std::vector messages; if (addToMetacluster) { TraceEvent("MetaclusterRestoreWorkloadAddClusterToMetacluster").detail("ClusterName", clusterName); wait(MetaclusterAPI::restoreCluster(self->managementDb, clusterName, dataDb->getConnectionRecord()->getConnectionString(), - ApplyManagementClusterUpdates::True)); + ApplyManagementClusterUpdates::True, + &messages)); TraceEvent("MetaclusterRestoreWorkloadRestoreComplete").detail("ClusterName", clusterName); } self->dataDbs[clusterName].restored = true; + self->dataDbs[clusterName].restoreHasMessages = !messages.empty(); return Void(); } @@ -338,7 +347,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { // The tenant group from the data cluster is what we expect auto itr = self->tenantGroups.find(collisionItr->first); if (itr->second.cluster == clusterName) { - TraceEvent(SevDebug, "MetaclusterRestoreWorlkoadDeleteTenantGroupCollision") + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadDeleteTenantGroupCollision") .detail("From", "ManagementCluster") .detail("TenantGroup", collisionItr->first); std::unordered_set tenantsInGroup = @@ -355,7 +364,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { } // The tenant group from the management cluster is what we expect else { - TraceEvent(SevDebug, 
"MetaclusterRestoreWorlkoadDeleteTenantGroupCollision") + TraceEvent(SevDebug, "MetaclusterRestoreWorkloadDeleteTenantGroupCollision") .detail("From", "DataCluster") .detail("TenantGroup", collisionItr->first); std::unordered_set tenantsInGroup = wait(runTransaction( @@ -382,6 +391,17 @@ struct MetaclusterRestoreWorkload : TestWorkload { return Void(); } + ACTOR static Future>> getDataClusterTenants(Database db) { + KeyBackedRangeResult> tenants = + wait(runTransaction(db.getReference(), [](Reference tr) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return TenantMetadata::tenantMap().getRange(tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1); + })); + + ASSERT_LE(tenants.results.size(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + return tenants.results; + } + ACTOR static Future restoreManagementCluster(MetaclusterRestoreWorkload* self) { wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr))); state std::map::iterator clusterItr; @@ -440,36 +460,60 @@ struct MetaclusterRestoreWorkload : TestWorkload { } } - loop { + state std::vector messages; + state bool completed = false; + while (!completed) { + state std::vector> dataTenantsBeforeRestore = + wait(getDataClusterTenants(clusterItr->second.db)); + try { TraceEvent("MetaclusterRestoreWorkloadRecoverManagementCluster") .detail("FromCluster", clusterItr->first) .detail("TenantCollisions", tenantCollisions.size()); + wait(MetaclusterAPI::restoreCluster( self->managementDb, clusterItr->first, clusterItr->second.db->getConnectionRecord()->getConnectionString(), - ApplyManagementClusterUpdates::False)); + ApplyManagementClusterUpdates::False, + &messages)); ASSERT(tenantCollisions.empty() && groupCollisions.empty()); - break; + completed = true; } catch (Error& e) { - if ((e.code() == error_code_tenant_already_exists && !tenantCollisions.empty()) || - (e.code() == error_code_invalid_tenant_configuration && !groupCollisions.empty())) { - if 
(!tenantCollisions.empty()) { - wait(self->resolveTenantCollisions( - self, clusterItr->first, clusterItr->second.db, tenantCollisions)); - tenantCollisions.clear(); - } - if (!groupCollisions.empty()) { - wait(self->resolveGroupCollisions( - self, clusterItr->first, clusterItr->second.db, groupCollisions)); - groupCollisions.clear(); - } - } else { + bool failedDueToCollision = + (e.code() == error_code_tenant_already_exists && !tenantCollisions.empty()) || + (e.code() == error_code_invalid_tenant_configuration && !groupCollisions.empty()); + if (!failedDueToCollision) { throw; } } + + std::vector> dataTenantsAfterRestore = + wait(getDataClusterTenants(clusterItr->second.db)); + + // Restoring a management cluster from data clusters should not change the data clusters at all + fmt::print("Checking data clusters: {}\n", completed); + ASSERT_EQ(dataTenantsBeforeRestore.size(), dataTenantsAfterRestore.size()); + for (int i = 0; i < dataTenantsBeforeRestore.size(); ++i) { + ASSERT_EQ(dataTenantsBeforeRestore[i].first, dataTenantsAfterRestore[i].first); + ASSERT(dataTenantsBeforeRestore[i].second == dataTenantsAfterRestore[i].second); + } + + // If we didn't succeed, resolve tenant and group collisions and try again + if (!completed) { + ASSERT(messages.size() > 0); + if (!tenantCollisions.empty()) { + wait(self->resolveTenantCollisions( + self, clusterItr->first, clusterItr->second.db, tenantCollisions)); + tenantCollisions.clear(); + } + if (!groupCollisions.empty()) { + wait(self->resolveGroupCollisions( + self, clusterItr->first, clusterItr->second.db, groupCollisions)); + groupCollisions.clear(); + } + } } TraceEvent("MetaclusterRestoreWorkloadRecoveredManagementCluster").detail("FromCluster", clusterItr->first); } @@ -749,11 +793,19 @@ struct MetaclusterRestoreWorkload : TestWorkload { if (self->recoverManagementCluster) { wait(resetManagementCluster(self)); + } else { + KeyBackedRangeResult> tenants = + wait(runTransaction(self->managementDb, [](Reference tr) { 
+ return MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1); + })); + ASSERT_LE(tenants.results.size(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + self->managementTenantsBeforeRestore = tenants.results; } std::vector> restores; for (auto [cluster, backupUrl] : backups) { - restores.push_back(restoreCluster( + restores.push_back(restoreDataCluster( cluster, self->dataDbs[cluster].db, backupUrl.get(), !self->recoverManagementCluster, self)); } @@ -849,9 +901,31 @@ struct MetaclusterRestoreWorkload : TestWorkload { })); ASSERT_LE(tenants.results.size(), CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - std::map tenantMap(tenants.results.begin(), tenants.results.end()); - for (auto& [tenantId, tenantData] : self->createdTenants) { + + // If we did not restore the management cluster, then every tenant present in the management cluster before the + // restore should be present after the restore. All tenants in the management cluster should be unchanged except + // for those tenants that were created after the backup and lost during the restore, which will be marked in an + // error state. 
+ for (auto const& [tenantId, tenantEntry] : self->managementTenantsBeforeRestore) { + auto itr = tenantMap.find(tenantId); + ASSERT(itr != tenantMap.end()); + + TenantMapEntry postRecoveryEntry = itr->second; + if (postRecoveryEntry.tenantState == TenantState::ERROR) { + ASSERT(self->dataDbs[itr->second.assignedCluster.get()].restored); + postRecoveryEntry.tenantState = tenantEntry.tenantState; + postRecoveryEntry.error.clear(); + } + + ASSERT(tenantEntry == postRecoveryEntry); + } + + if (!self->managementTenantsBeforeRestore.empty()) { + ASSERT_EQ(self->managementTenantsBeforeRestore.size(), tenantMap.size()); + } + + for (auto const& [tenantId, tenantData] : self->createdTenants) { auto tenantItr = tenantMap.find(tenantId); if (tenantItr == tenantMap.end()) { // A tenant that we expected to have been created can only be missing from the management cluster if we @@ -864,13 +938,24 @@ struct MetaclusterRestoreWorkload : TestWorkload { ASSERT(tenantItr->second.tenantState == TenantState::ERROR || (tenantItr->second.tenantState == TenantState::READY && tenantData.createTime == TenantData::AccessTime::DURING_BACKUP)); + if (tenantItr->second.tenantState == TenantState::ERROR) { + ASSERT(self->dataDbs[tenantData.cluster].restoreHasMessages); + } } else { ASSERT_EQ(tenantItr->second.tenantState, TenantState::READY); } } } - // TODO: evaluate extra tenants in tenantMap + // If we recovered both the management and some data clusters, we might undelete a tenant + // Check that any unexpected tenants were deleted and that we had a potentially lossy recovery + for (auto const& [tenantId, tenantEntry] : tenantMap) { + if (!self->createdTenants.count(tenantId)) { + ASSERT(self->deletedTenants.count(tenantId)); + ASSERT(self->recoverManagementCluster); + ASSERT(self->recoverDataClusters); + } + } return Void(); } From f9a68056ac7537ff96fd2370ece429e85945135d Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Thu, 9 Feb 2023 15:33:40 -0800 Subject: [PATCH 30/57] Add support for modifying a data cluster that is being restored so that we can manage conflicts --- fdbcli/MetaclusterCommands.actor.cpp | 42 ++++++- .../fdbclient/MetaclusterManagement.actor.h | 113 ++++++++++++++---- .../MetaclusterManagementWorkload.actor.cpp | 8 +- .../MetaclusterRestoreWorkload.actor.cpp | 30 +++-- 4 files changed, 150 insertions(+), 43 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index c2fce540fc..29bb251421 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -151,13 +151,37 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector fmt::print("Removes the specified data cluster from a metacluster.\n"); fmt::print("If FORCE is specified, then the cluster will be detached even if it has\n" "tenants assigned to it.\n"); + fmt::print("If run on a data cluster, the data cluster will remove its association\n" + "with the metacluster without modifying the management cluster. Doing so\n" + "requires the FORCE option to be set. Use of this mode is required to\n" + "repopulate a management cluster from a data cluster using the\n" + "`metacluster restore' command.\n"); return false; } state ClusterNameRef clusterName = tokens[tokens.size() - 1]; - wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4)); + state bool force = tokens.size() == 4; - fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); + state ClusterType clusterType = wait(runTransaction(db, [](Reference tr) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return TenantAPI::getClusterType(tr); + })); + + if (clusterType == ClusterType::METACLUSTER_DATA && !force) { + fmt::print("ERROR: cannot remove a data cluster directly. To remove a data cluster,\n" + "use the `remove' command on the management cluster. 
To force a data cluster\n" + "to forget its metacluster association without fully removing it, use FORCE.\n"); + } + + wait(MetaclusterAPI::removeCluster(db, clusterName, clusterType, force)); + + if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { + fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); + } else { + fmt::print("The cluster `{}' has removed its association with its metacluster.\n" + "The metacluster has not been modified.\n", + printable(clusterName).c_str()); + } return true; } @@ -166,17 +190,23 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto if (tokens.size() != 5) { fmt::print("Usage: metacluster restore connection_string=\n" "\n\n"); - fmt::print("Add a restored data cluster back to a metacluster.\n"); + + fmt::print("Add a restored data cluster back to a metacluster.\n\n"); + fmt::print("Use `restore_known_data_cluster' to add back a restored copy of a data cluster\n"); fmt::print("that the metacluster is already tracking. This mode should be used if only data\n"); fmt::print("clusters are being restored, and any discrepancies between the management and\n"); - fmt::print("data clusters will be resolved using the management cluster metadata.\n"); + fmt::print("data clusters will be resolved using the management cluster metadata.\n\n"); + fmt::print("Use `repopulate_from_data_cluster' to rebuild a lost management cluster from the\n"); fmt::print("data clusters in a metacluster. This mode should be used if the management\n"); fmt::print("cluster is being restored. If any data clusters are also being restored, the\n"); fmt::print("oldest data clusters should be added first before any non-recovered data\n"); - fmt::print("clusters. Any discrepancies arising between the data clusters will be resolved\n"); - fmt::print("using the data cluster that was added last."); + fmt::print("clusters. 
Any conflicts arising between the added data cluster and existing data\n"); + fmt::print("will cause the restore to fail. Before repopulating a metacluster from a data\n"); + fmt::print("cluster, that data cluster needs to be detached from its prior metacluster using\n"); + fmt::print("the `metacluster remove' command.\n"); + return false; } diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 7b25b14a91..620c94058a 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -604,7 +604,8 @@ void updateClusterMetadata(Transaction tr, if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { throw cluster_removed(); } else if (!isRestoring && previousMetadata.entry.clusterState == DataClusterState::RESTORING && - (!updatedEntry.present() || updatedEntry.get().clusterState != DataClusterState::READY)) { + (!updatedEntry.present() || (updatedEntry.get().clusterState != DataClusterState::READY && + updatedEntry.get().clusterState != DataClusterState::REMOVING))) { throw cluster_restoring(); } else if (isRestoring) { ASSERT(previousMetadata.entry.clusterState == DataClusterState::RESTORING || @@ -627,6 +628,7 @@ static Future registerInManagementCluster(Transaction tr, state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, clusterName)); if (dataClusterMetadata.present() && !dataClusterMetadata.get().matchesConfiguration(DataClusterMetadata(clusterEntry, connectionString))) { + TraceEvent("RegisterClusterAlreadyExists").detail("ClusterName", clusterName); throw cluster_already_exists(); } else if (!dataClusterMetadata.present()) { clusterEntry.allocated = ClusterUsage(); @@ -765,14 +767,16 @@ struct RemoveClusterImpl { MetaclusterOperationContext ctx; // Initialization parameters + Reference db; + ClusterType clusterType; bool forceRemove; // Parameters set in markClusterRemoving 
Optional lastTenantId; - RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove) - : ctx(managementDb, clusterName, { DataClusterState::REMOVING, DataClusterState::RESTORING }), - forceRemove(forceRemove) {} + RemoveClusterImpl(Reference db, ClusterName clusterName, ClusterType clusterType, bool forceRemove) + : ctx(db, clusterName, { DataClusterState::REMOVING, DataClusterState::RESTORING }), db(db), + clusterType(clusterType), forceRemove(forceRemove) {} // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { @@ -811,7 +815,8 @@ struct RemoveClusterImpl { } // Delete metacluster metadata from the data cluster - ACTOR static Future updateDataCluster(RemoveClusterImpl* self, Reference tr) { + ACTOR template + static Future updateDataCluster(RemoveClusterImpl* self, Reference tr) { // Delete metacluster related metadata MetaclusterMetadata::metaclusterRegistration().clear(tr); TenantMetadata::tenantTombstones().clear(tr); @@ -828,9 +833,7 @@ struct RemoveClusterImpl { } } - TraceEvent("ReconfiguredDataCluster") - .detail("Name", self->ctx.clusterName.get()) - .detail("Version", tr->getCommittedVersion()); + TraceEvent("ReconfiguredDataCluster").detail("Name", self->ctx.clusterName.get()); return Void(); } @@ -965,7 +968,51 @@ struct RemoveClusterImpl { return Void(); } + ACTOR static Future dataClusterForgetMetacluster(RemoveClusterImpl* self, + Reference tr) { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Optional metaclusterRegistrationEntry = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + if (!metaclusterRegistrationEntry.present()) { + return Void(); + } + + if (metaclusterRegistrationEntry.get().clusterType != ClusterType::METACLUSTER_DATA) { + TraceEvent(SevWarn, "CannotRemoveNonDataCluster") + .detail("ClusterName", self->ctx.clusterName.get()) + 
.detail("MetaclusterRegistration", + metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); + throw invalid_metacluster_operation(); + } + + if (metaclusterRegistrationEntry.get().name != self->ctx.clusterName.get()) { + TraceEvent(SevWarn, "CannotRemoveDataClusterWithNameMismatch") + .detail("ExpectedName", self->ctx.clusterName.get()) + .detail("MetaclusterRegistration", + metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); + throw invalid_metacluster_operation(); + } + + wait(updateDataCluster(self, tr)); + + return Void(); + } + ACTOR static Future run(RemoveClusterImpl* self) { + // On data clusters, we forget the metacluster information without updating the management cluster + if (self->clusterType == ClusterType::METACLUSTER_DATA) { + if (!self->forceRemove) { + throw invalid_metacluster_operation(); + } + + wait(runTransaction(self->db, [self = self](Reference tr) { + return dataClusterForgetMetacluster(self, tr); + })); + + return Void(); + } + state bool clusterIsPresent; try { wait(store(clusterIsPresent, @@ -1003,15 +1050,14 @@ struct RemoveClusterImpl { } } } - return Void(); } Future run() { return run(this); } }; ACTOR template -Future removeCluster(Reference db, ClusterName name, bool forceRemove) { - state RemoveClusterImpl impl(db, name, forceRemove); +Future removeCluster(Reference db, ClusterName name, ClusterType clusterType, bool forceRemove) { + state RemoveClusterImpl impl(db, name, clusterType, forceRemove); wait(impl.run()); return Void(); } @@ -1174,11 +1220,12 @@ struct RestoreClusterImpl { messages(messages) {} // If restoring a data cluster, verify that it has a matching registration entry - // If adding a data cluster to a restored management cluster, update the data cluster registration entry + // If adding a data cluster to a restored management cluster, create a new data cluster registration entry // with the new management cluster name/ID ACTOR static Future 
processMetaclusterRegistration(RestoreClusterImpl* self) { state Reference db = wait(openDatabase(self->connectionString)); state Reference tr = db->createTransaction(); + state UID dataClusterId = deterministicRandom()->randomUniqueID(); loop { try { @@ -1186,24 +1233,38 @@ struct RestoreClusterImpl { state Optional metaclusterRegistration = wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); - if (!metaclusterRegistration.present()) { - throw invalid_data_cluster(); - } - - if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { - if (self->applyManagementClusterUpdates) { + if (self->applyManagementClusterUpdates) { + if (!metaclusterRegistration.present() && self->applyManagementClusterUpdates) { + throw invalid_data_cluster(); + } else if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get()) || + metaclusterRegistration.get().name != self->clusterName) { + TraceEvent(SevWarn, "MetaclusterRestoreClusterMismatch") + .detail("ExistingRegistration", metaclusterRegistration.get()) + .detail("ManagementClusterRegistration", self->ctx.metaclusterRegistration.get()); throw cluster_already_exists(); - } else { - MetaclusterMetadata::metaclusterRegistration().set( - tr, - self->ctx.metaclusterRegistration.get().toDataClusterRegistration( - metaclusterRegistration.get().name, metaclusterRegistration.get().id)); - - wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); } + + self->dataClusterId = metaclusterRegistration.get().id; + } else { + MetaclusterRegistrationEntry dataClusterEntry = + self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, + dataClusterId); + self->dataClusterId = dataClusterEntry.id; + + if (metaclusterRegistration.present()) { + if (dataClusterEntry.matches(metaclusterRegistration.get())) { + break; + } + + TraceEvent(SevWarn, "MetaclusterRestoreClusterAlreadyRegistered") + .detail("ExistingRegistration", metaclusterRegistration.get()); + throw 
cluster_already_registered(); + } + + MetaclusterMetadata::metaclusterRegistration().set(tr, dataClusterEntry); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); } - self->dataClusterId = metaclusterRegistration.get().id; break; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index a5ec3852bc..9540af1646 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -214,8 +214,8 @@ struct MetaclusterManagementWorkload : TestWorkload { try { loop { // TODO: check force removal - Future removeFuture = - MetaclusterAPI::removeCluster(self->managementDb, clusterName, detachCluster); + Future removeFuture = MetaclusterAPI::removeCluster( + self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, detachCluster); try { Optional result = wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { @@ -956,8 +956,8 @@ struct MetaclusterManagementWorkload : TestWorkload { std::vector> removeClusterFutures; for (auto [clusterName, clusterMetadata] : dataClusters) { - removeClusterFutures.push_back( - MetaclusterAPI::removeCluster(self->managementDb, clusterName, !deleteTenants)); + removeClusterFutures.push_back(MetaclusterAPI::removeCluster( + self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, !deleteTenants)); } wait(waitForAll(removeClusterFutures)); diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index 685a63a4e7..24be294fa1 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -307,8 +307,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { // We don't expect the data cluster tenant, so delete it else { 
removeTrackedTenant(t.second.first); - deleteFutures.push_back(TenantAPI::deleteTenant( - dataDb.getReference(), t.first, t.second.first, ClusterType::METACLUSTER_DATA)); + deleteFutures.push_back(TenantAPI::deleteTenant(dataDb.getReference(), t.first, t.second.first)); } } @@ -379,8 +378,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { std::vector> groupDeletions; for (auto const& t : tenantsInGroup) { self->removeTrackedTenant(t); - groupDeletions.push_back( - TenantAPI::deleteTenantTransaction(tr, t, ClusterType::METACLUSTER_DATA)); + groupDeletions.push_back(TenantAPI::deleteTenantTransaction(tr, t)); } return waitForAll(groupDeletions); })); @@ -403,10 +401,16 @@ struct MetaclusterRestoreWorkload : TestWorkload { } ACTOR static Future restoreManagementCluster(MetaclusterRestoreWorkload* self) { + TraceEvent("MetaclusterRestoreWorkloadRestoringManagementCluster"); wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr))); state std::map::iterator clusterItr; for (clusterItr = self->dataDbs.begin(); clusterItr != self->dataDbs.end(); ++clusterItr) { - TraceEvent("MetaclusterRestoreWorkloadRecoverManagementCluster").detail("FromCluster", clusterItr->first); + TraceEvent("MetaclusterRestoreWorkloadProcessDataCluster").detail("FromCluster", clusterItr->first); + + wait(MetaclusterAPI::removeCluster( + clusterItr->second.db.getReference(), clusterItr->first, ClusterType::METACLUSTER_DATA, true)); + TraceEvent("MetaclusterRestoreWorkloadForgotMetacluster").detail("ClusterName", clusterItr->first); + state KeyBackedRangeResult> managementTenantList; state KeyBackedRangeResult> managementGroupList; @@ -467,7 +471,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { wait(getDataClusterTenants(clusterItr->second.db)); try { - TraceEvent("MetaclusterRestoreWorkloadRecoverManagementCluster") + TraceEvent("MetaclusterRestoreWorkloadRestoreManagementCluster") .detail("FromCluster", clusterItr->first) .detail("TenantCollisions", 
tenantCollisions.size()); @@ -487,6 +491,17 @@ struct MetaclusterRestoreWorkload : TestWorkload { if (!failedDueToCollision) { throw; } + + try { + wait(MetaclusterAPI::removeCluster( + self->managementDb, clusterItr->first, ClusterType::METACLUSTER_MANAGEMENT, true)); + TraceEvent("MetaclusterRestoreWorkloadRemoveFailedCluster") + .detail("ClusterName", clusterItr->first); + } catch (Error& e) { + if (e.code() != error_code_cluster_not_found) { + throw; + } + } } std::vector> dataTenantsAfterRestore = @@ -515,7 +530,8 @@ struct MetaclusterRestoreWorkload : TestWorkload { } } } - TraceEvent("MetaclusterRestoreWorkloadRecoveredManagementCluster").detail("FromCluster", clusterItr->first); + TraceEvent("MetaclusterRestoreWorkloadRestoredDataClusterToManagementCluster") + .detail("FromCluster", clusterItr->first); } TraceEvent("MetaclusterRestoreWorkloadRestoredManagementCluster"); From 4b13c9c21116ce9456b5f54a18275dca8aaf7ce1 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 10 Feb 2023 10:41:55 -0800 Subject: [PATCH 31/57] Make a few minor fixes, refactor some code for clarity, and improve throughput of repopulating a management cluster --- fdbcli/MetaclusterCommands.actor.cpp | 2 +- fdbclient/ClientKnobs.cpp | 4 +- fdbclient/include/fdbclient/ClientKnobs.h | 2 +- .../fdbclient/MetaclusterManagement.actor.h | 499 +++++++++++------- .../MetaclusterManagementWorkload.actor.cpp | 1 - .../MetaclusterRestoreWorkload.actor.cpp | 179 ++++--- 6 files changed, 401 insertions(+), 286 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 29bb251421..ac92ba99db 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -242,7 +242,7 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto if (!messages.empty()) { if (!success) { - fmt::print("\n"); + fmt::print(stderr, "\n"); } fmt::print(success ? 
stdout : stderr, "The restore reported the following messages:\n"); diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index c77559811b..c590c6093f 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -297,8 +297,8 @@ void ClientKnobs::initialize(Randomize randomize) { init( METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK, 5 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK = 1; init( METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY, 1.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY = deterministicRandom()->random01() * 60; init( METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT, 10.0 ); if ( randomize && BUGGIFY ) METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT = 1 + deterministicRandom()->random01() * 59; - init( METACLUSTER_RESTORE_MISSING_TENANTS_BATCH_SIZE, 1000 ); if ( randomize && BUGGIFY ) METACLUSTER_RESTORE_MISSING_TENANTS_BATCH_SIZE = 1 + deterministicRandom()->randomInt(0, 3); - init( TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); + init( METACLUSTER_RESTORE_BATCH_SIZE, 1000 ); if ( randomize && BUGGIFY ) METACLUSTER_RESTORE_BATCH_SIZE = 1 + deterministicRandom()->randomInt(0, 3); + init( TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( CLIENT_ENABLE_USING_CLUSTER_ID_KEY, false ); init( ENABLE_ENCRYPTION_CPU_TIME_LOGGING, false ); diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index 0dbe688193..5e30cb32c9 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -292,7 +292,7 @@ public: int METACLUSTER_ASSIGNMENT_CLUSTERS_TO_CHECK; double METACLUSTER_ASSIGNMENT_FIRST_CHOICE_DELAY; double METACLUSTER_ASSIGNMENT_AVAILABILITY_TIMEOUT; - int 
METACLUSTER_RESTORE_MISSING_TENANTS_BATCH_SIZE; + int METACLUSTER_RESTORE_BATCH_SIZE; int TENANT_ENTRY_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantEntryCache is refreshed bool CLIENT_ENABLE_USING_CLUSTER_ID_KEY; diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 620c94058a..813df9c68d 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -968,35 +968,45 @@ struct RemoveClusterImpl { return Void(); } - ACTOR static Future dataClusterForgetMetacluster(RemoveClusterImpl* self, - Reference tr) { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - state Optional metaclusterRegistrationEntry = - wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + // Remove the metacluster registration entry on a data cluster without modifying the management cluster. + // Useful when reconstructing a management cluster when the original is lost. 
+ ACTOR static Future dataClusterForgetMetacluster(RemoveClusterImpl* self) { + state Reference tr = self->db->createTransaction(); - if (!metaclusterRegistrationEntry.present()) { - return Void(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Optional metaclusterRegistrationEntry = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + + if (!metaclusterRegistrationEntry.present()) { + return Void(); + } + + if (metaclusterRegistrationEntry.get().clusterType != ClusterType::METACLUSTER_DATA) { + TraceEvent(SevWarn, "CannotRemoveNonDataCluster") + .detail("ClusterName", self->ctx.clusterName.get()) + .detail("MetaclusterRegistration", + metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); + throw invalid_metacluster_operation(); + } + + if (metaclusterRegistrationEntry.get().name != self->ctx.clusterName.get()) { + TraceEvent(SevWarn, "CannotRemoveDataClusterWithNameMismatch") + .detail("ExpectedName", self->ctx.clusterName.get()) + .detail("MetaclusterRegistration", + metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); + throw invalid_metacluster_operation(); + } + + wait(updateDataCluster(self, tr)); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } } - - if (metaclusterRegistrationEntry.get().clusterType != ClusterType::METACLUSTER_DATA) { - TraceEvent(SevWarn, "CannotRemoveNonDataCluster") - .detail("ClusterName", self->ctx.clusterName.get()) - .detail("MetaclusterRegistration", - metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); - throw invalid_metacluster_operation(); - } - - if (metaclusterRegistrationEntry.get().name != self->ctx.clusterName.get()) { - TraceEvent(SevWarn, "CannotRemoveDataClusterWithNameMismatch") - .detail("ExpectedName", self->ctx.clusterName.get()) - .detail("MetaclusterRegistration", - 
metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); - throw invalid_metacluster_operation(); - } - - wait(updateDataCluster(self, tr)); - - return Void(); } ACTOR static Future run(RemoveClusterImpl* self) { @@ -1006,10 +1016,7 @@ struct RemoveClusterImpl { throw invalid_metacluster_operation(); } - wait(runTransaction(self->db, [self = self](Reference tr) { - return dataClusterForgetMetacluster(self, tr); - })); - + wait(dataClusterForgetMetacluster(self)); return Void(); } @@ -1135,8 +1142,8 @@ void managementClusterAddTenantToGroup(Transaction tr, tr, Tuple::makeTuple(tenantEntry.tenantGroup.get(), tenantEntry.id)); } - if (!groupAlreadyExists) { - ASSERT(isRestoring || clusterMetadata->entry.hasCapacity()); + if (!groupAlreadyExists && !isRestoring) { + ASSERT(clusterMetadata->entry.hasCapacity()); DataClusterEntry updatedEntry = clusterMetadata->entry; ++updatedEntry.allocated.numTenantGroups; @@ -1220,12 +1227,33 @@ struct RestoreClusterImpl { messages(messages) {} // If restoring a data cluster, verify that it has a matching registration entry - // If adding a data cluster to a restored management cluster, create a new data cluster registration entry - // with the new management cluster name/ID - ACTOR static Future processMetaclusterRegistration(RestoreClusterImpl* self) { + ACTOR static Future loadDataClusterRegistration(RestoreClusterImpl* self) { + state Reference db = wait(openDatabase(self->connectionString)); + + Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(db)); + + if (!metaclusterRegistration.present()) { + throw invalid_data_cluster(); + } else if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get()) || + metaclusterRegistration.get().name != self->clusterName) { + TraceEvent(SevWarn, "MetaclusterRestoreClusterMismatch") + .detail("ExistingRegistration", metaclusterRegistration.get()) + .detail("ManagementClusterRegistration", 
self->ctx.metaclusterRegistration.get()); + throw cluster_already_exists(); + } + + self->dataClusterId = metaclusterRegistration.get().id; + self->ctx.dataClusterDb = db; + + return Void(); + } + + // If adding a data cluster to a restored management cluster, write a metacluster registration entry + // to attach it + ACTOR static Future writeDataClusterRegistration(RestoreClusterImpl* self) { state Reference db = wait(openDatabase(self->connectionString)); state Reference tr = db->createTransaction(); - state UID dataClusterId = deterministicRandom()->randomUniqueID(); loop { try { @@ -1233,76 +1261,48 @@ struct RestoreClusterImpl { state Optional metaclusterRegistration = wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); - if (self->applyManagementClusterUpdates) { - if (!metaclusterRegistration.present() && self->applyManagementClusterUpdates) { - throw invalid_data_cluster(); - } else if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get()) || - metaclusterRegistration.get().name != self->clusterName) { - TraceEvent(SevWarn, "MetaclusterRestoreClusterMismatch") - .detail("ExistingRegistration", metaclusterRegistration.get()) - .detail("ManagementClusterRegistration", self->ctx.metaclusterRegistration.get()); - throw cluster_already_exists(); + MetaclusterRegistrationEntry dataClusterEntry = + self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, + self->dataClusterId); + + if (metaclusterRegistration.present()) { + if (dataClusterEntry.matches(metaclusterRegistration.get())) { + break; } - self->dataClusterId = metaclusterRegistration.get().id; - } else { - MetaclusterRegistrationEntry dataClusterEntry = - self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, - dataClusterId); - self->dataClusterId = dataClusterEntry.id; - - if (metaclusterRegistration.present()) { - if (dataClusterEntry.matches(metaclusterRegistration.get())) { - break; - } - - 
TraceEvent(SevWarn, "MetaclusterRestoreClusterAlreadyRegistered") - .detail("ExistingRegistration", metaclusterRegistration.get()); - throw cluster_already_registered(); - } - - MetaclusterMetadata::metaclusterRegistration().set(tr, dataClusterEntry); - wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + TraceEvent(SevWarn, "MetaclusterRestoreClusterAlreadyRegistered") + .detail("ExistingRegistration", metaclusterRegistration.get()); + throw cluster_already_registered(); } + MetaclusterMetadata::metaclusterRegistration().set(tr, dataClusterEntry); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); break; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); } } - self->ctx.dataClusterDb = db; return Void(); } - ACTOR static Future markClusterRestoring(RestoreClusterImpl* self, Reference tr) { - // If we are attaching a new data cluster, then we need to register it - if (!self->applyManagementClusterUpdates) { - DataClusterEntry entry; - entry.id = self->dataClusterId; - entry.clusterState = DataClusterState::RESTORING; - wait(registerInManagementCluster(tr, self->clusterName, entry, self->connectionString)); - wait(self->ctx.setCluster(tr, self->clusterName)); - } else if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { - DataClusterEntry updatedEntry = self->ctx.dataClusterMetadata.get().entry; + void markClusterRestoring(Reference tr) { + if (ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING) { + DataClusterEntry updatedEntry = ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::RESTORING; - updateClusterMetadata(tr, - self->ctx.clusterName.get(), - self->ctx.dataClusterMetadata.get(), - self->connectionString, - updatedEntry); + updateClusterMetadata( + tr, ctx.clusterName.get(), ctx.dataClusterMetadata.get(), connectionString, updatedEntry); // Remove this cluster from the cluster capacity index, but leave its configured capacity intact in the 
// cluster entry. This allows us to retain the configured capacity while preventing the cluster from // being used to allocate new tenant groups. DataClusterEntry noCapacityEntry = updatedEntry; noCapacityEntry.capacity.numTenantGroups = 0; - updateClusterCapacityIndex(tr, self->ctx.clusterName.get(), updatedEntry, noCapacityEntry); + updateClusterCapacityIndex(tr, ctx.clusterName.get(), updatedEntry, noCapacityEntry); } - TraceEvent("MarkedDataClusterRestoring").detail("Name", self->ctx.clusterName.get()); - return Void(); + TraceEvent("MarkedDataClusterRestoring").detail("Name", clusterName); } void markClusterAsReady(Reference tr) { @@ -1389,67 +1389,6 @@ struct RestoreClusterImpl { return Void(); } - ACTOR static Future addTenantToManagementCluster(RestoreClusterImpl* self, - Reference tr, - TenantMapEntry tenantEntry) { - state Future> tenantGroupEntry = Optional(); - if (tenantEntry.tenantGroup.present()) { - tenantGroupEntry = - ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get(tr, tenantEntry.tenantGroup.get()); - } - - Optional existingEntry = wait(tryGetTenantTransaction(tr, tenantEntry.tenantName)); - if (existingEntry.present()) { - if (existingEntry.get().assignedCluster == self->ctx.clusterName) { - ASSERT(existingEntry.get().matchesConfiguration(tenantEntry)); - // This is a retry, so return success - return Void(); - } else { - self->messages.push_back(fmt::format("The tenant `{}' already exists on cluster `{}'", - printable(tenantEntry.tenantName), - printable(existingEntry.get().assignedCluster))); - throw tenant_already_exists(); - } - } - - int64_t lastTenantId = - wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.getD(tr, Snapshot::False, 0)); - ManagementClusterMetadata::tenantMetadata().lastTenantId.set(tr, std::max(lastTenantId, tenantEntry.id)); - - tenantEntry.tenantState = TenantState::READY; - tenantEntry.assignedCluster = self->ctx.clusterName; - ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, 
tenantEntry.id, tenantEntry); - ManagementClusterMetadata::tenantMetadata().tenantNameIndex.set(tr, tenantEntry.tenantName, tenantEntry.id); - ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); - - ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); - ManagementClusterMetadata::clusterTenantCount.atomicOp( - tr, tenantEntry.assignedCluster.get(), 1, MutationRef::AddValue); - - // Updated indexes to include the new tenant - ManagementClusterMetadata::clusterTenantIndex.insert( - tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantName, tenantEntry.id)); - - wait(success(tenantGroupEntry)); - - if (tenantGroupEntry.get().present() && tenantGroupEntry.get().get().assignedCluster != self->ctx.clusterName) { - self->messages.push_back( - fmt::format("The tenant `{}' is part of a tenant group `{}' that already exists on cluster `{}'", - printable(tenantEntry.tenantName), - printable(tenantEntry.tenantGroup.get()), - printable(tenantGroupEntry.get().get().assignedCluster))); - throw invalid_tenant_configuration(); - } - - managementClusterAddTenantToGroup(tr, - tenantEntry, - &self->ctx.dataClusterMetadata.get(), - GroupAlreadyExists(tenantGroupEntry.get().present()), - IsRestoring::True); - - return Void(); - } - ACTOR static Future renameTenant(RestoreClusterImpl* self, Reference tr, int64_t tenantId, @@ -1459,8 +1398,8 @@ struct RestoreClusterImpl { state Optional entry; state Optional newId; - wait(store(entry, TenantAPI::tryGetTenantTransaction(tr, tenantId))); - wait(store(newId, TenantMetadata::tenantNameIndex().get(tr, newTenantName))); + wait(store(entry, TenantAPI::tryGetTenantTransaction(tr, tenantId)) && + store(newId, TenantMetadata::tenantNameIndex().get(tr, newTenantName))); if (entry.present()) { if (entry.get().tenantName == oldTenantName && !newId.present()) { @@ -1529,33 +1468,11 @@ struct RestoreClusterImpl { state 
std::unordered_map::iterator managementEntry = self->mgmtClusterTenantMap.find(tenantEntry.id); - // Delete + // A data cluster tenant is not present on the management cluster if (managementEntry == self->mgmtClusterTenantMap.end()) { - if (self->applyManagementClusterUpdates) { - wait(self->ctx.runDataClusterTransaction([tenantEntry = tenantEntry](Reference tr) { - return TenantAPI::deleteTenantTransaction(tr, tenantEntry.id, ClusterType::METACLUSTER_DATA); - })); - } else { - wait(self->ctx.runManagementTransaction( - [self = self, tenantEntry = tenantEntry](Reference tr) { - return addTenantToManagementCluster(self, tr, tenantEntry); - })); - } - - return Optional>(); - } else if (!self->applyManagementClusterUpdates) { - // We have an ID match with an existing tenant. This is only allowed if we are retrying the restore, in - // which case we expect that the tenant will have the same name, ID, and assigned cluster. - if (managementEntry->second.tenantName != tenantEntry.tenantName || - managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get()) { - self->messages.push_back( - fmt::format("The tenant `{}' has the same ID {} as an existing tenant `{}' on cluster `{}'", - printable(tenantEntry.tenantName), - tenantEntry.id, - printable(managementEntry->second.tenantName), - printable(managementEntry->second.assignedCluster))); - throw tenant_already_exists(); - } + wait(self->ctx.runDataClusterTransaction([tenantEntry = tenantEntry](Reference tr) { + return TenantAPI::deleteTenantTransaction(tr, tenantEntry.id, ClusterType::METACLUSTER_DATA); + })); return Optional>(); } else { @@ -1630,6 +1547,153 @@ struct RestoreClusterImpl { return Void(); } + // Returns true if the group needs to be created + ACTOR static Future addTenantToManagementCluster(RestoreClusterImpl* self, + Reference tr, + TenantMapEntry tenantEntry) { + state Future> tenantGroupEntry = Optional(); + if (tenantEntry.tenantGroup.present()) { + tenantGroupEntry = + 
ManagementClusterMetadata::tenantMetadata().tenantGroupMap.get(tr, tenantEntry.tenantGroup.get()); + } + + Optional existingEntry = wait(tryGetTenantTransaction(tr, tenantEntry.tenantName)); + if (existingEntry.present()) { + if (existingEntry.get().assignedCluster == self->ctx.clusterName) { + ASSERT(existingEntry.get().matchesConfiguration(tenantEntry)); + // This is a retry, so return success + return false; + } else { + self->messages.push_back(fmt::format("The tenant `{}' already exists on cluster `{}'", + printable(tenantEntry.tenantName), + printable(existingEntry.get().assignedCluster))); + throw tenant_already_exists(); + } + } + + tenantEntry.tenantState = TenantState::READY; + tenantEntry.assignedCluster = self->ctx.clusterName; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantEntry.id, tenantEntry); + ManagementClusterMetadata::tenantMetadata().tenantNameIndex.set(tr, tenantEntry.tenantName, tenantEntry.id); + + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.assignedCluster.get(), 1, MutationRef::AddValue); + + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantName, tenantEntry.id)); + + wait(success(tenantGroupEntry)); + + if (tenantGroupEntry.get().present() && tenantGroupEntry.get().get().assignedCluster != self->ctx.clusterName) { + self->messages.push_back( + fmt::format("The tenant `{}' is part of a tenant group `{}' that already exists on cluster `{}'", + printable(tenantEntry.tenantName), + printable(tenantEntry.tenantGroup.get()), + printable(tenantGroupEntry.get().get().assignedCluster))); + throw invalid_tenant_configuration(); + } + + managementClusterAddTenantToGroup(tr, + tenantEntry, + &self->ctx.dataClusterMetadata.get(), + 
GroupAlreadyExists(tenantGroupEntry.get().present()), + IsRestoring::True); + + return !tenantGroupEntry.get().present(); + } + + ACTOR static Future addTenantBatchToManagementCluster(RestoreClusterImpl* self, + Reference tr, + std::vector tenants) { + state std::vector> futures; + state int64_t maxId = 0; + for (auto const& t : tenants) { + maxId = std::max(maxId, t.id); + futures.push_back(addTenantToManagementCluster(self, tr, t)); + } + + wait(waitForAll(futures)); + + std::set groupsCreated; + state int numGroupsCreated = 0; + for (int i = 0; i < tenants.size(); ++i) { + if (futures[i].get()) { + if (tenants[i].tenantGroup.present()) { + groupsCreated.insert(tenants[i].tenantGroup.get()); + } else { + ++numGroupsCreated; + } + } + } + + numGroupsCreated += groupsCreated.size(); + + if (numGroupsCreated > 0) { + state DataClusterMetadata clusterMetadata = wait(getClusterTransaction(tr, self->ctx.clusterName.get())); + + DataClusterEntry updatedEntry = clusterMetadata.entry; + updatedEntry.allocated.numTenantGroups += numGroupsCreated; + updateClusterMetadata(tr, + self->ctx.clusterName.get(), + clusterMetadata, + Optional(), + updatedEntry, + IsRestoring::True); + } + + int64_t lastTenantId = + wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.getD(tr, Snapshot::False, 0)); + + ManagementClusterMetadata::tenantMetadata().lastTenantId.set(tr, std::max(lastTenantId, maxId)); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); + + return Void(); + } + + ACTOR static Future addTenantsToManagementCluster(RestoreClusterImpl* self) { + state std::unordered_map::iterator itr; + state std::vector tenantBatch; + + for (itr = self->dataClusterTenantMap.begin(); itr != self->dataClusterTenantMap.end(); ++itr) { + state std::unordered_map::iterator managementEntry = + self->mgmtClusterTenantMap.find(itr->second.id); + if (managementEntry == self->mgmtClusterTenantMap.end()) { + 
tenantBatch.push_back(itr->second); + } else if (managementEntry->second.tenantName != itr->second.tenantName || + managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get()) { + ASSERT(managementEntry->second.matchesConfiguration(itr->second)); + self->messages.push_back( + fmt::format("The tenant `{}' has the same ID {} as an existing tenant `{}' on cluster `{}'", + printable(itr->second.tenantName), + itr->second.id, + printable(managementEntry->second.tenantName), + printable(managementEntry->second.assignedCluster))); + throw tenant_already_exists(); + } + + if (tenantBatch.size() == CLIENT_KNOBS->METACLUSTER_RESTORE_BATCH_SIZE) { + wait(runTransaction(self->ctx.managementDb, + [self = self, tenantBatch = tenantBatch](Reference tr) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return addTenantBatchToManagementCluster(self, tr, tenantBatch); + })); + tenantBatch.clear(); + } + } + + if (!tenantBatch.empty()) { + wait(runTransaction(self->ctx.managementDb, + [self = self, tenantBatch = tenantBatch](Reference tr) { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return addTenantBatchToManagementCluster(self, tr, tenantBatch); + })); + } + + return Void(); + } + ACTOR static Future processMissingTenants(RestoreClusterImpl* self) { state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); state std::vector missingTenants; @@ -1650,7 +1714,7 @@ struct RestoreClusterImpl { managementTenant.tenantState != TenantState::ERROR) { missingTenants.push_back(tenantId); ++missingTenantCount; - if (missingTenants.size() == CLIENT_KNOBS->METACLUSTER_RESTORE_MISSING_TENANTS_BATCH_SIZE) { + if (missingTenants.size() == CLIENT_KNOBS->METACLUSTER_RESTORE_BATCH_SIZE) { wait(self->ctx.runManagementTransaction( [self = self, missingTenants = missingTenants](Reference tr) { return markManagementTenantsAsError(self, tr, missingTenants); @@ -1678,25 +1742,21 @@ struct RestoreClusterImpl { return 
Void(); } - // This only supports the restore of an already registered data cluster, for now. - ACTOR static Future run(RestoreClusterImpl* self) { - // If we are applying management cluster updates, then we expect the data cluster to already be registered there - // Run a management transaction to load the data cluster metadata + ACTOR static Future runDataClusterRestore(RestoreClusterImpl* self) { + // Run a management transaction to populate the data cluster metadata wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - if (self->applyManagementClusterUpdates) { - return self->ctx.setCluster(tr, self->clusterName); - } else { - return Future(Void()); - } + return self->ctx.setCluster(tr, self->clusterName); })); // Make sure that the data cluster being restored has the appropriate metacluster registration entry and name - wait(processMetaclusterRegistration(self)); + wait(loadDataClusterRegistration(self)); // set state to restoring try { - wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return markClusterRestoring(self, tr); })); + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + self->markClusterRestoring(tr); + return Future(Void()); + })); } catch (Error& e) { // If the transaction retries after success or if we are trying a second time to restore the cluster, it // will throw an error indicating that the restore has already started @@ -1705,13 +1765,13 @@ struct RestoreClusterImpl { } } + // get all the tenants in the metacluster + wait(getAllTenantsFromManagementCluster(self)); + // get all the tenant information from the newly registered data cluster wait(self->ctx.runDataClusterTransaction( [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - // get all the tenant in the metacluster - wait(getAllTenantsFromManagementCluster(self)); - // Fix any differences between the data cluster and the management cluster wait(reconcileTenants(self)); @@ -1719,19 +1779,62 @@ 
struct RestoreClusterImpl { wait(processMissingTenants(self)); // set restored cluster to ready state - try { - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - self->markClusterAsReady(tr); - return Future(Void()); - })); - } catch (Error& e) { - throw; - } + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + self->markClusterAsReady(tr); + return Future(Void()); + })); return Void(); } - Future run() { return run(this); } + ACTOR static Future runManagementClusterRepopulate(RestoreClusterImpl* self) { + self->dataClusterId = deterministicRandom()->randomUniqueID(); + + // Record the data cluster in the management cluster + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + DataClusterEntry entry; + entry.id = self->dataClusterId; + entry.clusterState = DataClusterState::RESTORING; + return registerInManagementCluster(tr, self->clusterName, entry, self->connectionString); + })); + + // Write a metacluster registration entry in the data cluster + wait(writeDataClusterRegistration(self)); + + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return self->ctx.setCluster(tr, self->clusterName); + })); + + // get all the tenants in the metacluster + wait(getAllTenantsFromManagementCluster(self)); + + // get all the tenant information from the newly registered data cluster + try { + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); + } catch (Error& e) { + throw; + } + + // Fix any differences between the data cluster and the management cluster + wait(addTenantsToManagementCluster(self)); + + // set restored cluster to ready state + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + self->markClusterAsReady(tr); + return Future(Void()); + })); + + return Void(); + } + + Future run() { + if (applyManagementClusterUpdates) { + return runDataClusterRestore(this); + } else { + return 
runManagementClusterRepopulate(this); + } + } }; ACTOR template diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 9540af1646..72b7139233 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -213,7 +213,6 @@ struct MetaclusterManagementWorkload : TestWorkload { try { loop { - // TODO: check force removal Future removeFuture = MetaclusterAPI::removeCluster( self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, detachCluster); try { diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index 24be294fa1..2c98b3e7de 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -105,10 +105,6 @@ struct MetaclusterRestoreWorkload : TestWorkload { recoverDataClusters = (mode != 1); } - void disableFailureInjectionWorkloads(std::set& out) const override { - out.insert("MachineAttritionWorkload"); - } - ClusterName chooseClusterName() { return dataDbIndex[deterministicRandom()->randomInt(0, dataDbIndex.size())]; } TenantName chooseTenantName() { @@ -288,11 +284,16 @@ struct MetaclusterRestoreWorkload : TestWorkload { } } - Future resolveTenantCollisions( - MetaclusterRestoreWorkload* self, - ClusterName clusterName, - Database dataDb, - std::unordered_map> const& tenantCollisions) { + // A map from tenant name to a pair of IDs. The first ID is from the data cluster, and the second is from the + // management cluster. 
+ using TenantCollisions = std::unordered_map>; + + using GroupCollisions = std::unordered_set; + + Future resolveTenantCollisions(MetaclusterRestoreWorkload* self, + ClusterName clusterName, + Database dataDb, + TenantCollisions const& tenantCollisions) { TraceEvent("MetaclusterRestoreWorkloadDeleteTenantCollisions") .detail("FromCluster", clusterName) .detail("TenantCollisions", tenantCollisions.size()); @@ -334,25 +335,26 @@ struct MetaclusterRestoreWorkload : TestWorkload { ACTOR Future resolveGroupCollisions(MetaclusterRestoreWorkload* self, ClusterName clusterName, Database dataDb, - std::unordered_map groupCollisions) { + GroupCollisions groupCollisions) { TraceEvent("MetaclusterRestoreWorkloadDeleteTenantGroupCollisions") .detail("FromCluster", clusterName) .detail("GroupCollisions", groupCollisions.size()); state std::vector> deleteFutures; - state std::unordered_map::const_iterator collisionItr; + state GroupCollisions::const_iterator collisionItr; for (collisionItr = groupCollisions.begin(); collisionItr != groupCollisions.end(); ++collisionItr) { - // The tenant group from the data cluster is what we expect - auto itr = self->tenantGroups.find(collisionItr->first); + // If the data cluster tenant group is expected, then remove the management tenant group + // Note that the management tenant group may also have been expected + auto itr = self->tenantGroups.find(*collisionItr); if (itr->second.cluster == clusterName) { TraceEvent(SevDebug, "MetaclusterRestoreWorkloadDeleteTenantGroupCollision") .detail("From", "ManagementCluster") - .detail("TenantGroup", collisionItr->first); + .detail("TenantGroup", *collisionItr); std::unordered_set tenantsInGroup = wait(runTransaction(self->managementDb, [collisionItr = collisionItr](Reference tr) { return getTenantsInGroup( - tr, MetaclusterAPI::ManagementClusterMetadata::tenantMetadata(), collisionItr->first); + tr, MetaclusterAPI::ManagementClusterMetadata::tenantMetadata(), *collisionItr); })); for (auto const& t 
: tenantsInGroup) { @@ -365,11 +367,11 @@ struct MetaclusterRestoreWorkload : TestWorkload { else { TraceEvent(SevDebug, "MetaclusterRestoreWorkloadDeleteTenantGroupCollision") .detail("From", "DataCluster") - .detail("TenantGroup", collisionItr->first); + .detail("TenantGroup", *collisionItr); std::unordered_set tenantsInGroup = wait(runTransaction( dataDb.getReference(), [collisionItr = collisionItr](Reference tr) { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - return getTenantsInGroup(tr, TenantMetadata::instance(), collisionItr->first); + return getTenantsInGroup(tr, TenantMetadata::instance(), *collisionItr); })); deleteFutures.push_back(runTransactionVoid( @@ -400,6 +402,67 @@ struct MetaclusterRestoreWorkload : TestWorkload { return tenants.results; } + ACTOR static Future> getCollisions(MetaclusterRestoreWorkload* self, + Database db) { + state KeyBackedRangeResult> managementTenantList; + state KeyBackedRangeResult> managementGroupList; + state KeyBackedRangeResult> dataClusterTenants; + state KeyBackedRangeResult> dataClusterGroups; + + state TenantCollisions tenantCollisions; + state GroupCollisions groupCollisions; + + // Read the management cluster tenant map and tenant group map + wait(runTransactionVoid( + self->managementDb, + [managementTenantList = &managementTenantList, + managementGroupList = &managementGroupList](Reference tr) { + return store(*managementTenantList, + MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantNameIndex.getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)) && + store(*managementGroupList, + MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantGroupMap.getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); + })); + + // Read the data cluster tenant map and tenant group map + wait(runTransaction(db.getReference(), + [dataClusterTenants = &dataClusterTenants, + dataClusterGroups = &dataClusterGroups](Reference tr) { + 
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return store(*dataClusterTenants, + TenantMetadata::tenantNameIndex().getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)) && + store(*dataClusterGroups, + TenantMetadata::tenantGroupMap().getRange( + tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); + })); + + std::unordered_map managementTenants(managementTenantList.results.begin(), + managementTenantList.results.end()); + std::unordered_map managementGroups(managementGroupList.results.begin(), + managementGroupList.results.end()); + + ASSERT(managementTenants.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + ASSERT(managementGroups.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + ASSERT(dataClusterTenants.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + ASSERT(dataClusterGroups.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); + + for (auto const& t : dataClusterTenants.results) { + auto itr = managementTenants.find(t.first); + if (itr != managementTenants.end()) { + tenantCollisions[t.first] = std::make_pair(t.second, itr->second); + } + } + for (auto const& g : dataClusterGroups.results) { + if (managementGroups.count(g.first)) { + groupCollisions.insert(g.first); + } + } + + return std::make_pair(tenantCollisions, groupCollisions); + } + ACTOR static Future restoreManagementCluster(MetaclusterRestoreWorkload* self) { TraceEvent("MetaclusterRestoreWorkloadRestoringManagementCluster"); wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr))); @@ -407,62 +470,13 @@ struct MetaclusterRestoreWorkload : TestWorkload { for (clusterItr = self->dataDbs.begin(); clusterItr != self->dataDbs.end(); ++clusterItr) { TraceEvent("MetaclusterRestoreWorkloadProcessDataCluster").detail("FromCluster", clusterItr->first); + // Remove the data cluster from its old metacluster wait(MetaclusterAPI::removeCluster( clusterItr->second.db.getReference(), clusterItr->first, 
ClusterType::METACLUSTER_DATA, true)); TraceEvent("MetaclusterRestoreWorkloadForgotMetacluster").detail("ClusterName", clusterItr->first); - state KeyBackedRangeResult> managementTenantList; - state KeyBackedRangeResult> managementGroupList; - - wait(runTransactionVoid( - self->managementDb, - [managementTenantList = &managementTenantList, - managementGroupList = &managementGroupList](Reference tr) { - return store(*managementTenantList, - MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantNameIndex.getRange( - tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)) && - store(*managementGroupList, - MetaclusterAPI::ManagementClusterMetadata::tenantMetadata().tenantGroupMap.getRange( - tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); - })); - ASSERT(managementTenantList.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - ASSERT(managementGroupList.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - - state std::unordered_map managementTenants(managementTenantList.results.begin(), - managementTenantList.results.end()); - state std::unordered_map managementGroups( - managementGroupList.results.begin(), managementGroupList.results.end()); - - state KeyBackedRangeResult> dataClusterTenants; - state KeyBackedRangeResult> dataClusterGroups; - - wait(runTransaction(clusterItr->second.db.getReference(), - [dataClusterTenants = &dataClusterTenants, - dataClusterGroups = &dataClusterGroups](Reference tr) { - tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - return store(*dataClusterTenants, - TenantMetadata::tenantNameIndex().getRange( - tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)) && - store(*dataClusterGroups, - TenantMetadata::tenantGroupMap().getRange( - tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER + 1)); - })); - ASSERT(dataClusterTenants.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - ASSERT(dataClusterGroups.results.size() <= CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER); - - state std::unordered_map> 
tenantCollisions; - state std::unordered_map groupCollisions; - for (auto const& t : dataClusterTenants.results) { - auto itr = managementTenants.find(t.first); - if (itr != managementTenants.end()) { - tenantCollisions[t.first] = std::make_pair(t.second, itr->second); - } - } - for (auto const& g : dataClusterGroups.results) { - if (managementGroups.count(g.first)) { - groupCollisions[g.first] = g.second; - } - } + state std::pair collisions = + wait(getCollisions(self, clusterItr->second.db)); state std::vector messages; state bool completed = false; @@ -473,7 +487,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { try { TraceEvent("MetaclusterRestoreWorkloadRestoreManagementCluster") .detail("FromCluster", clusterItr->first) - .detail("TenantCollisions", tenantCollisions.size()); + .detail("TenantCollisions", collisions.first.size()); wait(MetaclusterAPI::restoreCluster( self->managementDb, @@ -482,16 +496,17 @@ struct MetaclusterRestoreWorkload : TestWorkload { ApplyManagementClusterUpdates::False, &messages)); - ASSERT(tenantCollisions.empty() && groupCollisions.empty()); + ASSERT(collisions.first.empty() && collisions.second.empty()); completed = true; } catch (Error& e) { bool failedDueToCollision = - (e.code() == error_code_tenant_already_exists && !tenantCollisions.empty()) || - (e.code() == error_code_invalid_tenant_configuration && !groupCollisions.empty()); + (e.code() == error_code_tenant_already_exists && !collisions.first.empty()) || + (e.code() == error_code_invalid_tenant_configuration && !collisions.second.empty()); if (!failedDueToCollision) { throw; } + // If the restore did not succeed, remove the partially restored cluster try { wait(MetaclusterAPI::removeCluster( self->managementDb, clusterItr->first, ClusterType::METACLUSTER_MANAGEMENT, true)); @@ -518,16 +533,14 @@ struct MetaclusterRestoreWorkload : TestWorkload { // If we didn't succeed, resolve tenant and group collisions and try again if (!completed) { ASSERT(messages.size() > 0); 
- if (!tenantCollisions.empty()) { - wait(self->resolveTenantCollisions( - self, clusterItr->first, clusterItr->second.db, tenantCollisions)); - tenantCollisions.clear(); - } - if (!groupCollisions.empty()) { - wait(self->resolveGroupCollisions( - self, clusterItr->first, clusterItr->second.db, groupCollisions)); - groupCollisions.clear(); - } + + wait(self->resolveTenantCollisions( + self, clusterItr->first, clusterItr->second.db, collisions.first)); + wait(self->resolveGroupCollisions( + self, clusterItr->first, clusterItr->second.db, collisions.second)); + + collisions.first.clear(); + collisions.second.clear(); } } TraceEvent("MetaclusterRestoreWorkloadRestoredDataClusterToManagementCluster") From 0e078435ab9244c5c8dbc90fdd866c3f35324ec3 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 10 Feb 2023 10:57:37 -0800 Subject: [PATCH 32/57] Remove unnecessary try/catch --- fdbclient/include/fdbclient/MetaclusterManagement.actor.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 813df9c68d..ee899bf276 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1809,12 +1809,8 @@ struct RestoreClusterImpl { wait(getAllTenantsFromManagementCluster(self)); // get all the tenant information from the newly registered data cluster - try { - wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - } catch (Error& e) { - throw; - } + wait(self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); // Fix any differences between the data cluster and the management cluster wait(addTenantsToManagementCluster(self)); From a6b47c1da4990456040bf5560f88790fabb38b85 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Fri, 10 Feb 2023 11:12:36 -0800 Subject: [PATCH 33/57] Fix merge issue --- .../MetaclusterManagementWorkload.actor.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 031fd6faaa..83b16cdab3 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -908,16 +908,16 @@ struct MetaclusterManagementWorkload : TestWorkload { ClusterName clusterName, DataClusterData clusterData) { state Optional metaclusterRegistration; - state std::vector> tenants; + state std::vector> tenants; state Reference tr = clusterData.db->createTransaction(); loop { try { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - wait( - store(metaclusterRegistration, MetaclusterMetadata::metaclusterRegistration().get(tr)) && - store(tenants, - TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, clusterData.tenants.size() + 1))); + wait(store(metaclusterRegistration, MetaclusterMetadata::metaclusterRegistration().get(tr)) && + store(tenants, + TenantAPI::listTenantMetadataTransaction( + tr, ""_sr, "\xff\xff"_sr, clusterData.tenants.size() + 1))); break; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); @@ -932,7 +932,7 @@ struct MetaclusterManagementWorkload : TestWorkload { } ASSERT(tenants.size() == clusterData.tenants.size()); - for (auto [tenantName, tid] : tenants) { + for (auto [tenantName, tenantEntry] : tenants) { ASSERT(clusterData.tenants.count(tenantName)); auto tenantData = self->createdTenants[tenantName]; ASSERT(tenantData.cluster == clusterName); From a37d8f757c89b6e143b5a103eb7068d11d1186b8 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Fri, 10 Feb 2023 21:01:52 -0800 Subject: [PATCH 34/57] Redwood: fix restart test failure with xor encoding --- fdbserver/VersionedBTree.actor.cpp | 17 ++++++++++------- 1 
file changed, 10 insertions(+), 7 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index eeef9f1b65..b5ca33af85 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4970,13 +4970,14 @@ public: // VersionedBTree takes ownership of pager VersionedBTree(IPager2* pager, std::string name, + UID logID, Reference const> db, Optional expectedEncryptionMode, EncodingType encodingType = EncodingType::MAX_ENCODING_TYPE, Reference keyProvider = {}) : m_pager(pager), m_db(db), m_expectedEncryptionMode(expectedEncryptionMode), m_encodingType(encodingType), m_enforceEncodingType(false), m_keyProvider(keyProvider), m_pBuffer(nullptr), m_mutationCount(0), m_name(name), - m_pBoundaryVerifier(DecodeBoundaryVerifier::getVerifier(name)) { + m_logID(logID), m_pBoundaryVerifier(DecodeBoundaryVerifier::getVerifier(name)) { m_pDecodeCacheMemory = m_pager->getPageCachePenaltySource(); m_lazyClearActor = 0; m_init = init_impl(this); @@ -5121,7 +5122,7 @@ public: // default encoding is expected. 
if (encodingType == EncodingType::MAX_ENCODING_TYPE) { encodingType = expectedEncodingType; - if (encodingType == EncodingType::XXHash64 && g_network->isSimulated() && BUGGIFY) { + if (encodingType == EncodingType::XXHash64 && g_network->isSimulated() && m_logID.hash() % 2 == 0) { encodingType = EncodingType::XOREncryption_TestOnly; } } else if (encodingType != expectedEncodingType) { @@ -5592,6 +5593,7 @@ private: Future m_latestCommit; Future m_init; std::string m_name; + UID m_logID; int m_blockSize; ParentInfoMapT childUpdateTracker; @@ -7965,7 +7967,7 @@ public: SERVER_KNOBS->REDWOOD_EXTENT_CONCURRENT_READS, false, m_error); - m_tree = new VersionedBTree(pager, filename, db, encryptionMode, encodingType, keyProvider); + m_tree = new VersionedBTree(pager, filename, logID, db, encryptionMode, encodingType, keyProvider); m_init = catchError(init_impl(this)); } @@ -10126,7 +10128,7 @@ TEST_CASE("Lredwood/correctness/btree") { printf("Initializing...\n"); pager = new DWALPager( pageSize, extentSize, file, pageCacheBytes, remapCleanupWindowBytes, concurrentExtentReads, pagerMemoryOnly); - state VersionedBTree* btree = new VersionedBTree(pager, file, {}, encryptionMode, encodingType, keyProvider); + state VersionedBTree* btree = new VersionedBTree(pager, file, UID(), {}, encryptionMode, encodingType, keyProvider); wait(btree->init()); state DecodeBoundaryVerifier* pBoundaries = DecodeBoundaryVerifier::getVerifier(file); @@ -10365,7 +10367,7 @@ TEST_CASE("Lredwood/correctness/btree") { printf("Reopening btree from disk.\n"); IPager2* pager = new DWALPager( pageSize, extentSize, file, pageCacheBytes, remapCleanupWindowBytes, concurrentExtentReads, false); - btree = new VersionedBTree(pager, file, {}, encryptionMode, encodingType, keyProvider); + btree = new VersionedBTree(pager, file, UID(), {}, encryptionMode, encodingType, keyProvider); wait(btree->init()); @@ -10414,6 +10416,7 @@ TEST_CASE("Lredwood/correctness/btree") { concurrentExtentReads, pagerMemoryOnly), 
file, + UID(), {}, {}, encodingType, @@ -10750,8 +10753,8 @@ TEST_CASE(":/redwood/performance/set") { DWALPager* pager = new DWALPager( pageSize, extentSize, file, pageCacheBytes, remapCleanupWindowBytes, concurrentExtentReads, pagerMemoryOnly); - state VersionedBTree* btree = - new VersionedBTree(pager, file, {}, {}, EncodingType::XXHash64, makeReference()); + state VersionedBTree* btree = new VersionedBTree( + pager, file, UID(), {}, {}, EncodingType::XXHash64, makeReference()); wait(btree->init()); printf("Initialized. StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); From b4f45a0a8781d102d54511e67d2fcfd5e439fd1a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Sat, 11 Feb 2023 12:09:17 -0800 Subject: [PATCH 35/57] Fix logic in MetaclusterManagementWorkload when retrying a tenant creation with an invalid assigned cluster --- .../MetaclusterManagementWorkload.actor.cpp | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 35d7e02f69..6cb6ea29a6 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -444,11 +444,10 @@ struct MetaclusterManagementWorkload : TestWorkload { // Choose between two preferred clusters because if we get a partial completion and // retry, we want the operation to eventually succeed instead of having a chance of // never re-visiting the original preferred cluster. 
- state std::pair preferredClusters; - state Optional originalPreferredCluster; + state std::vector preferredClusters; if (!assignClusterAutomatically) { - preferredClusters.first = self->chooseClusterName(); - preferredClusters.second = self->chooseClusterName(); + preferredClusters.push_back(self->chooseClusterName()); + preferredClusters.push_back(self->chooseClusterName()); } state TenantMapEntry tenantMapEntry; @@ -459,11 +458,7 @@ struct MetaclusterManagementWorkload : TestWorkload { loop { try { if (!assignClusterAutomatically && (!retried || deterministicRandom()->coinflip())) { - tenantMapEntry.assignedCluster = - deterministicRandom()->coinflip() ? preferredClusters.first : preferredClusters.second; - if (!originalPreferredCluster.present()) { - originalPreferredCluster = tenantMapEntry.assignedCluster.get(); - } + tenantMapEntry.assignedCluster = deterministicRandom()->randomChoice(preferredClusters); } Future createFuture = MetaclusterAPI::createTenant(self->managementDb, tenantMapEntry, assignClusterAutomatically); @@ -480,16 +475,23 @@ struct MetaclusterManagementWorkload : TestWorkload { ASSERT(entry.present()); tenantMapEntry = entry.get(); break; - } else if (!assignClusterAutomatically && retried && - originalPreferredCluster.get() != tenantMapEntry.assignedCluster.get() && - (e.code() == error_code_cluster_no_capacity || - e.code() == error_code_cluster_not_found || - e.code() == error_code_invalid_tenant_configuration)) { - // When picking a different assigned cluster, it is possible to leave the - // tenant creation in a partially completed state, which we want to avoid. - // Continue retrying if the new preferred cluster throws errors rather than - // exiting immediately so we can allow the operation to finish. 
- continue; + } else if (!assignClusterAutomatically && (e.code() == error_code_cluster_no_capacity || + e.code() == error_code_cluster_not_found || + e.code() == error_code_invalid_tenant_configuration)) { + state Error error = e; + Optional entry = wait(MetaclusterAPI::tryGetTenant(self->managementDb, tenant)); + if (entry.present() && entry.get().assignedCluster != tenantMapEntry.assignedCluster) { + // When picking a different assigned cluster, it is possible to leave the + // tenant creation in a partially completed state, which we want to avoid. + // Continue retrying if the new preferred cluster throws errors rather than + // exiting immediately so we can allow the operation to finish. + preferredClusters.clear(); + preferredClusters.push_back(entry.get().assignedCluster.get()); + tenantMapEntry.assignedCluster = entry.get().assignedCluster; + continue; + } + + throw error; } else { throw; } @@ -545,6 +547,7 @@ struct MetaclusterManagementWorkload : TestWorkload { ASSERT(tenantGroup.present()); ASSERT(tenantMapEntry.assignedCluster.present()); auto itr = self->tenantGroups.find(tenantGroup.get()); + ASSERT(itr != self->tenantGroups.end()); ASSERT(itr->second.cluster != tenantMapEntry.assignedCluster.get()); return Void(); } From e6021f83266ea85e50dfb7307befb65c084e5a8b Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Sat, 11 Feb 2023 15:15:32 -0800 Subject: [PATCH 36/57] Add Jon's metacluster concurrency test and fix various bugs that it found --- fdbcli/MetaclusterCommands.actor.cpp | 7 +- fdbclient/Metacluster.cpp | 12 +- fdbclient/include/fdbclient/Metacluster.h | 8 +- .../fdbclient/MetaclusterManagement.actor.h | 251 ++++++++----- .../workloads/MetaclusterConsistency.actor.h | 1 + ...terManagementConcurrencyWorkload.actor.cpp | 352 ++++++++++++++++++ .../MetaclusterManagementWorkload.actor.cpp | 7 +- tests/CMakeLists.txt | 1 + .../MetaclusterManagementConcurrency.toml | 16 + 9 files changed, 562 insertions(+), 93 deletions(-) create mode 100644 fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp create mode 100644 tests/slow/MetaclusterManagementConcurrency.toml diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 5aad98082e..3276c73ae5 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -163,9 +163,14 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector } state ClusterNameRef clusterName = tokens[tokens.size() - 1]; - wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4)); + bool updatedDataCluster = wait(MetaclusterAPI::removeCluster(db, clusterName, tokens.size() == 4, 15.0)); fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); + if (!updatedDataCluster) { + fmt::print("WARNING: the data cluster could not be updated and still contains its\n" + "metacluster registration info. 
To finish removing it, FORCE remove the\n" + "data cluster directly."); + } return true; } diff --git a/fdbclient/Metacluster.cpp b/fdbclient/Metacluster.cpp index b298762f11..3b90c903d9 100644 --- a/fdbclient/Metacluster.cpp +++ b/fdbclient/Metacluster.cpp @@ -24,6 +24,7 @@ FDB_DEFINE_BOOLEAN_PARAM(AddNewTenants); FDB_DEFINE_BOOLEAN_PARAM(RemoveMissingTenants); FDB_DEFINE_BOOLEAN_PARAM(AssignClusterAutomatically); +FDB_DEFINE_BOOLEAN_PARAM(RunOnDisconnectedCluster); std::string clusterTypeToString(const ClusterType& clusterType) { switch (clusterType) { @@ -40,6 +41,8 @@ std::string clusterTypeToString(const ClusterType& clusterType) { std::string DataClusterEntry::clusterStateToString(DataClusterState clusterState) { switch (clusterState) { + case DataClusterState::REGISTERING: + return "registering"; case DataClusterState::READY: return "ready"; case DataClusterState::REMOVING: @@ -52,7 +55,9 @@ std::string DataClusterEntry::clusterStateToString(DataClusterState clusterState } DataClusterState DataClusterEntry::stringToClusterState(std::string stateStr) { - if (stateStr == "ready") { + if (stateStr == "registering") { + return DataClusterState::REGISTERING; + } else if (stateStr == "ready") { return DataClusterState::READY; } else if (stateStr == "removing") { return DataClusterState::REMOVING; @@ -82,4 +87,9 @@ MetaclusterMetadata::metaclusterRegistration() { static KeyBackedObjectProperty instance( "\xff/metacluster/clusterRegistration"_sr, IncludeVersion()); return instance; +} + +KeyBackedSet& MetaclusterMetadata::registrationTombstones() { + static KeyBackedSet instance("\xff/metacluster/registrationTombstones"_sr); + return instance; } \ No newline at end of file diff --git a/fdbclient/include/fdbclient/Metacluster.h b/fdbclient/include/fdbclient/Metacluster.h index c08ae0e667..1a8b5854ff 100644 --- a/fdbclient/include/fdbclient/Metacluster.h +++ b/fdbclient/include/fdbclient/Metacluster.h @@ -57,11 +57,12 @@ std::string clusterTypeToString(const 
ClusterType& clusterType); // Represents the various states that a data cluster could be in. // +// REGISTERING - the data cluster is being registered with the metacluster // READY - the data cluster is active // REMOVING - the data cluster is being removed and cannot have its configuration changed or any tenants created // RESTORING - the data cluster is being restored and cannot have its configuration changed or any tenants // created/updated/deleted. -enum class DataClusterState { READY, REMOVING, RESTORING }; +enum class DataClusterState { REGISTERING, READY, REMOVING, RESTORING }; struct DataClusterEntry { constexpr static FileIdentifier file_identifier = 929511; @@ -81,9 +82,7 @@ struct DataClusterEntry { : id(id), capacity(capacity), allocated(allocated) {} // Returns true if all configurable properties match - bool matchesConfiguration(DataClusterEntry const& other) const { - return id == other.id && capacity == other.capacity; - } + bool matchesConfiguration(DataClusterEntry const& other) const { return capacity == other.capacity; } bool hasCapacity() const { return allocated < capacity; } @@ -188,6 +187,7 @@ struct Traceable : std::true_type { struct MetaclusterMetadata { // Registration information for a metacluster, stored on both management and data clusters static KeyBackedObjectProperty& metaclusterRegistration(); + static KeyBackedSet& registrationTombstones(); }; #endif diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 52cb5d6e55..6a8a4d7ebb 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -19,17 +19,13 @@ */ #pragma once -#include "fdbclient/FDBOptions.g.h" -#include "fdbclient/Tenant.h" -#include "flow/IRandom.h" -#include "flow/Platform.h" -#include "flow/ThreadHelper.actor.h" #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H) #define 
FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_G_H #include "fdbclient/MetaclusterManagement.actor.g.h" #elif !defined(FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H) #define FDBCLIENT_METACLUSTER_MANAGEMENT_ACTOR_H +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/GenericTransactionHelper.h" #include "fdbclient/GenericManagementAPI.actor.h" @@ -37,9 +33,13 @@ #include "fdbclient/Metacluster.h" #include "fdbclient/MultiVersionTransaction.h" #include "fdbclient/SystemData.h" +#include "fdbclient/Tenant.h" #include "fdbclient/TenantManagement.actor.h" #include "fdbclient/VersionedMap.h" #include "flow/flat_buffers.h" +#include "flow/IRandom.h" +#include "flow/Platform.h" +#include "flow/ThreadHelper.actor.h" #include "flow/actorcompiler.h" // has to be last include // This file provides the interfaces to manage metacluster metadata. @@ -83,6 +83,7 @@ struct DataClusterMetadata { FDB_DECLARE_BOOLEAN_PARAM(AddNewTenants); FDB_DECLARE_BOOLEAN_PARAM(RemoveMissingTenants); FDB_DECLARE_BOOLEAN_PARAM(AssignClusterAutomatically); +FDB_DECLARE_BOOLEAN_PARAM(RunOnDisconnectedCluster); namespace MetaclusterAPI { @@ -199,9 +200,29 @@ struct MetaclusterOperationContext { Optional metaclusterRegistration; Optional dataClusterMetadata; + bool dataClusterIsRegistered = true; - MetaclusterOperationContext(Reference managementDb, Optional clusterName = {}) - : managementDb(managementDb), clusterName(clusterName) {} + std::set extraSupportedDataClusterStates; + + MetaclusterOperationContext(Reference managementDb, + Optional clusterName = {}, + std::set extraSupportedDataClusterStates = {}) + : managementDb(managementDb), clusterName(clusterName), + extraSupportedDataClusterStates(extraSupportedDataClusterStates) {} + + void checkClusterState() { + DataClusterState clusterState = + dataClusterMetadata.present() ? 
dataClusterMetadata.get().entry.clusterState : DataClusterState::READY; + if (clusterState != DataClusterState::READY && extraSupportedDataClusterStates.count(clusterState) == 0) { + if (clusterState == DataClusterState::REGISTERING) { + throw cluster_not_found(); + } else if (clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } + + ASSERT(false); + } + } // Run a transaction on the management cluster. This verifies that the cluster is a management cluster and matches // the same metacluster that we've run any previous transactions on. If a clusterName is set, it also verifies that @@ -276,6 +297,8 @@ struct MetaclusterOperationContext { } } + self->checkClusterState(); + state decltype(std::declval()(Reference()).getValue()) result = wait(func(tr)); @@ -298,12 +321,16 @@ struct MetaclusterOperationContext { // has the expected ID and is part of the metacluster that previous transactions have run on. ACTOR template static Future()(Reference()).getValue())> - runDataClusterTransaction(MetaclusterOperationContext* self, Function func) { + runDataClusterTransaction(MetaclusterOperationContext* self, + Function func, + RunOnDisconnectedCluster runOnDisconnectedCluster) { ASSERT(self->dataClusterDb); ASSERT(self->dataClusterMetadata.present()); ASSERT(self->metaclusterRegistration.present() && self->metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA); + self->checkClusterState(); + state Reference tr = self->dataClusterDb->createTransaction(); loop { try { @@ -313,13 +340,17 @@ struct MetaclusterOperationContext { wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); // Check that this is the expected data cluster and is part of the right metacluster - if (!currentMetaclusterRegistration.present() || - currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_DATA) { + if (!currentMetaclusterRegistration.present()) { + if (!runOnDisconnectedCluster) { + throw invalid_metacluster_operation(); + } 
+ } else if (currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_DATA) { throw invalid_metacluster_operation(); } else if (!self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { throw invalid_metacluster_operation(); } + self->dataClusterIsRegistered = currentMetaclusterRegistration.present(); state decltype(std::declval()(Reference()).getValue()) result = wait(func(tr)); @@ -333,8 +364,9 @@ struct MetaclusterOperationContext { template Future()(Reference()).getValue())> - runDataClusterTransaction(Function func) { - return runDataClusterTransaction(this, func); + runDataClusterTransaction(Function func, + RunOnDisconnectedCluster runOnDisconnectedCluster = RunOnDisconnectedCluster::False) { + return runDataClusterTransaction(this, func, runOnDisconnectedCluster); } ACTOR static Future updateClusterName(MetaclusterOperationContext* self, @@ -349,6 +381,8 @@ struct MetaclusterOperationContext { wait(store(self->dataClusterDb, openDatabase(self->dataClusterMetadata.get().connectionString))); } + self->checkClusterState(); + return Void(); } @@ -569,7 +603,11 @@ void updateClusterMetadata(Transaction tr, Optional const& updatedEntry) { if (updatedEntry.present()) { - if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { + if (previousMetadata.entry.clusterState == DataClusterState::REGISTERING && + updatedEntry.get().clusterState != DataClusterState::READY && + updatedEntry.get().clusterState != DataClusterState::REMOVING) { + throw cluster_not_found(); + } else if (previousMetadata.entry.clusterState == DataClusterState::REMOVING) { throw cluster_removed(); } ManagementClusterMetadata::dataClusters().set(tr, name, updatedEntry.get()); @@ -595,13 +633,33 @@ struct RegisterClusterImpl { DataClusterEntry clusterEntry) : ctx(managementDb), clusterName(clusterName), connectionString(connectionString), clusterEntry(clusterEntry) {} - // Check that cluster name is available - ACTOR static Future 
registrationPrecheck(RegisterClusterImpl* self, Reference tr) { + // Store the cluster entry for the new cluster in a registering state + ACTOR static Future registerInManagementCluster(RegisterClusterImpl* self, + Reference tr) { state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); - if (dataClusterMetadata.present()) { + if (!dataClusterMetadata.present()) { + self->clusterEntry.clusterState = DataClusterState::REGISTERING; + self->clusterEntry.allocated = ClusterUsage(); + self->clusterEntry.id = deterministicRandom()->randomUniqueID(); + + ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, self->clusterEntry); + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); + } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } else if (!dataClusterMetadata.get().matchesConfiguration( + DataClusterMetadata(self->clusterEntry, self->connectionString)) || + dataClusterMetadata.get().entry.clusterState != DataClusterState::REGISTERING) { throw cluster_already_exists(); + } else { + self->clusterEntry = dataClusterMetadata.get().entry; } + TraceEvent("RegisteringDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", self->clusterEntry.id) + .detail("Capacity", self->clusterEntry.capacity) + .detail("ConnectionString", self->connectionString.toString()); + return Void(); } @@ -615,6 +673,8 @@ struct RegisterClusterImpl { state Future>> existingTenantsFuture = TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, 1); state ThreadFuture existingDataFuture = tr->getRange(normalKeys, 1); + state Future tombstoneFuture = + MetaclusterMetadata::registrationTombstones().exists(tr, self->clusterEntry.id); // Check whether this cluster has already been registered state Optional existingRegistration = @@ -622,15 +682,21 @@ struct RegisterClusterImpl { if 
(existingRegistration.present()) { if (existingRegistration.get().clusterType != ClusterType::METACLUSTER_DATA || existingRegistration.get().name != self->clusterName || - !existingRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { + !existingRegistration.get().matches(self->ctx.metaclusterRegistration.get()) || + existingRegistration.get().id != self->clusterEntry.id) { throw cluster_already_registered(); } else { // We already successfully registered the cluster with these details, so there's nothing to do - self->clusterEntry.id = existingRegistration.get().id; return Void(); } } + // Check if the cluster was removed concurrently + bool tombstone = wait(tombstoneFuture); + if (tombstone) { + throw cluster_removed(); + } + // Check for any existing data std::vector> existingTenants = wait(safeThreadFutureToFuture(existingTenantsFuture)); @@ -645,7 +711,6 @@ struct RegisterClusterImpl { throw cluster_not_empty(); } - self->clusterEntry.id = deterministicRandom()->randomUniqueID(); MetaclusterMetadata::metaclusterRegistration().set( tr, self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, @@ -668,28 +733,31 @@ struct RegisterClusterImpl { } // Store the cluster entry for the new cluster - ACTOR static Future registerInManagementCluster(RegisterClusterImpl* self, - Reference tr) { + ACTOR static Future markClusterReady(RegisterClusterImpl* self, Reference tr) { state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); - if (dataClusterMetadata.present() && !dataClusterMetadata.get().matchesConfiguration( - DataClusterMetadata(self->clusterEntry, self->connectionString))) { + if (!dataClusterMetadata.present() || + dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } else if (dataClusterMetadata.get().entry.id != self->clusterEntry.id) { throw cluster_already_exists(); - } else if (!dataClusterMetadata.present()) { - 
self->clusterEntry.allocated = ClusterUsage(); + } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::READY) { + return Void(); + } else { + ASSERT(dataClusterMetadata.get().entry.clusterState == DataClusterState::REGISTERING); + dataClusterMetadata.get().entry.clusterState = DataClusterState::READY; - if (self->clusterEntry.hasCapacity()) { + if (dataClusterMetadata.get().entry.hasCapacity()) { ManagementClusterMetadata::clusterCapacityIndex.insert( - tr, Tuple::makeTuple(self->clusterEntry.allocated.numTenantGroups, self->clusterName)); + tr, Tuple::makeTuple(dataClusterMetadata.get().entry.allocated.numTenantGroups, self->clusterName)); } - ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, self->clusterEntry); + ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, dataClusterMetadata.get().entry); ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); } TraceEvent("RegisteredDataCluster") .detail("ClusterName", self->clusterName) .detail("ClusterID", self->clusterEntry.id) - .detail("Capacity", self->clusterEntry.capacity) - .detail("Version", tr->getCommittedVersion()) + .detail("Capacity", dataClusterMetadata.get().entry.capacity) .detail("ConnectionString", self->connectionString.toString()); return Void(); @@ -697,12 +765,14 @@ struct RegisterClusterImpl { ACTOR static Future run(RegisterClusterImpl* self) { wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return registrationPrecheck(self, tr); })); - // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we don't - // have a metacluster registration on the data cluster + [self = self](Reference tr) { return registerInManagementCluster(self, tr); })); + + // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we + // don't have a metacluster registration on the data cluster 
wait(configureDataCluster(self)); wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return registerInManagementCluster(self, tr); })); + [self = self](Reference tr) { return markClusterReady(self, tr); })); + return Void(); } Future run() { return run(this); } @@ -736,15 +806,20 @@ struct RemoveClusterImpl { // Initialization parameters bool forceRemove; + double dataClusterTimeout; // Parameters set in markClusterRemoving Optional lastTenantId; - RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove) - : ctx(managementDb, clusterName), forceRemove(forceRemove) {} + // An output parameter that signals whether we were able to remove the data cluster + bool dataClusterUpdated = false; + + RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove, double dataClusterTimeout) + : ctx(managementDb, clusterName, { DataClusterState::REGISTERING, DataClusterState::REMOVING }), + forceRemove(forceRemove) {} // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. 
- ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { + ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { if (!self->forceRemove && self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups > 0) { throw cluster_not_empty(); } else if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::REMOVING) { @@ -772,34 +847,37 @@ struct RemoveClusterImpl { self->lastTenantId = lastId; } - TraceEvent("MarkedDataClusterRemoving") - .detail("Name", self->ctx.clusterName.get()) - .detail("Version", tr->getCommittedVersion()); - - return true; + TraceEvent("MarkedDataClusterRemoving").detail("Name", self->ctx.clusterName.get()); + return Void(); } // Delete metacluster metadata from the data cluster ACTOR static Future updateDataCluster(RemoveClusterImpl* self, Reference tr) { - // Delete metacluster related metadata - MetaclusterMetadata::metaclusterRegistration().clear(tr); - TenantMetadata::tenantTombstones().clear(tr); - TenantMetadata::tombstoneCleanupData().clear(tr); - // If we are force removing a cluster, then it will potentially contain tenants that have IDs - // larger than the next tenant ID to be allocated on the cluster. To avoid collisions, we advance - // the ID so that it will be the larger of the current one on the data cluster and the management - // cluster. 
- if (self->lastTenantId.present()) { - Optional lastId = wait(TenantMetadata::lastTenantId().get(tr)); - if (!lastId.present() || lastId.get() < self->lastTenantId.get()) { - TenantMetadata::lastTenantId().set(tr, self->lastTenantId.get()); + if (self->ctx.dataClusterIsRegistered) { + // Delete metacluster related metadata + MetaclusterMetadata::metaclusterRegistration().clear(tr); + TenantMetadata::tenantTombstones().clear(tr); + TenantMetadata::tombstoneCleanupData().clear(tr); + + // If we are force removing a cluster, then it will potentially contain tenants that have IDs + // larger than the next tenant ID to be allocated on the cluster. To avoid collisions, we advance + // the ID so that it will be the larger of the current one on the data cluster and the management + // cluster. + if (self->lastTenantId.present()) { + Optional lastId = wait(TenantMetadata::lastTenantId().get(tr)); + if (!lastId.present() || lastId.get() < self->lastTenantId.get()) { + TenantMetadata::lastTenantId().set(tr, self->lastTenantId.get()); + } } } - TraceEvent("ReconfiguredDataCluster") + // Insert a tombstone marking this cluster removed even if we aren't registered + MetaclusterMetadata::registrationTombstones().insert(tr, self->ctx.metaclusterRegistration.get().id); + + TraceEvent("RemovedMetaclusterRegistrationOnDataCluster") .detail("Name", self->ctx.clusterName.get()) - .detail("Version", tr->getCommittedVersion()); + .detail("WasRegistered", self->ctx.dataClusterIsRegistered); return Void(); } @@ -914,41 +992,44 @@ struct RemoveClusterImpl { } ACTOR static Future run(RemoveClusterImpl* self) { - state bool clusterIsPresent; try { - wait(store(clusterIsPresent, - self->ctx.runManagementTransaction([self = self](Reference tr) { - return markClusterRemoving(self, tr); - }))); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markClusterRemoving(self, tr); })); } catch (Error& e) { // If the transaction retries after success or if we are trying a 
second time to remove the cluster, it will // throw an error indicating that the removal has already started - if (e.code() == error_code_cluster_removed) { - clusterIsPresent = true; - } else { + if (e.code() != error_code_cluster_removed) { throw; } } - if (clusterIsPresent) { - try { - wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return updateDataCluster(self, tr); })); - } catch (Error& e) { - // If this transaction gets retried, the metacluster information may have already been erased. - if (e.code() != error_code_invalid_metacluster_operation) { - throw; - } + try { + Future f = self->ctx.runDataClusterTransaction( + [self = self](Reference tr) { return updateDataCluster(self, tr); }, + RunOnDisconnectedCluster::True); + + if (self->forceRemove && self->dataClusterTimeout > 0) { + f = timeoutError(f, self->dataClusterTimeout); } - // This runs multiple transactions, so the run transaction calls are inside the function - try { - wait(managementClusterPurgeDataCluster(self)); - } catch (Error& e) { - // If this transaction gets retried, the cluster may have already been deleted. - if (e.code() != error_code_cluster_not_found) { - throw; - } + wait(f); + self->dataClusterUpdated = true; + } catch (Error& e) { + // If this transaction gets retried, the metacluster information may have already been erased. + if (e.code() == error_code_invalid_metacluster_operation) { + self->dataClusterUpdated = true; + } else if (e.code() != error_code_timed_out) { + throw; + } + } + + // This runs multiple transactions, so the run transaction calls are inside the function + try { + wait(managementClusterPurgeDataCluster(self)); + } catch (Error& e) { + // If this transaction gets retried, the cluster may have already been deleted. 
+ if (e.code() != error_code_cluster_not_found) { + throw; } } @@ -958,10 +1039,10 @@ struct RemoveClusterImpl { }; ACTOR template -Future removeCluster(Reference db, ClusterName name, bool forceRemove) { - state RemoveClusterImpl impl(db, name, forceRemove); +Future removeCluster(Reference db, ClusterName name, bool forceRemove, double dataClusterTimeout = 0) { + state RemoveClusterImpl impl(db, name, forceRemove, dataClusterTimeout); wait(impl.run()); - return Void(); + return impl.dataClusterUpdated; } ACTOR template @@ -1319,6 +1400,8 @@ struct CreateTenantImpl { throw cluster_removed(); } + ASSERT(self->ctx.dataClusterMetadata.get().entry.clusterState == DataClusterState::READY); + managementClusterAddTenantToGroup( tr, self->tenantEntry, &self->ctx.dataClusterMetadata.get(), assignment.second); diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h index 770b17f051..1b6e4aea99 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h @@ -189,6 +189,7 @@ private: if (!clusterMetadata.entry.hasCapacity()) { ASSERT(allocatedItr == clusterAllocatedMap.end()); } else { + ASSERT(allocatedItr != clusterAllocatedMap.end()); ASSERT_EQ(allocatedItr->second, clusterMetadata.entry.allocated.numTenantGroups); ++numFoundInAllocatedMap; } diff --git a/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp new file mode 100644 index 0000000000..cf6d0ff732 --- /dev/null +++ b/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp @@ -0,0 +1,352 @@ +/* + * MetaclusterManagementConcurrencyWorkload.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/FDBTypes.h" +#include "fdbclient/GenericManagementAPI.actor.h" +#include "fdbclient/MetaclusterManagement.actor.h" +#include "fdbclient/ReadYourWrites.h" +#include "fdbclient/RunTransaction.actor.h" +#include "fdbclient/TenantManagement.actor.h" +#include "fdbclient/ThreadSafeTransaction.h" +#include "fdbrpc/simulator.h" +#include "fdbserver/workloads/MetaclusterConsistency.actor.h" +#include "fdbserver/workloads/TenantConsistency.actor.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/Knobs.h" +#include "flow/Error.h" +#include "flow/IRandom.h" +#include "flow/ProtocolVersion.h" +#include "flow/flow.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +struct MetaclusterManagementConcurrencyWorkload : TestWorkload { + static constexpr auto NAME = "MetaclusterManagementConcurrency"; + + Reference managementDb; + std::map dataDbs; + std::vector dataDbIndex; + + double testDuration; + + MetaclusterManagementConcurrencyWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + testDuration = getOption(options, "testDuration"_sr, 120.0); + } + + Future setup(Database const& cx) override { return _setup(cx, this); } + + ACTOR static Future _setup(Database cx, MetaclusterManagementConcurrencyWorkload* self) { + Reference threadSafeHandle = + wait(unsafeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(cx))); + + MultiVersionApi::api->selectApiVersion(cx->apiVersion.version()); + self->managementDb = MultiVersionDatabase::debugCreateFromExistingDatabase(threadSafeHandle); + + ASSERT(g_simulator->extraDatabases.size() > 0); + for (auto connectionString : g_simulator->extraDatabases) { + ClusterConnectionString ccs(connectionString); + self->dataDbIndex.push_back(ClusterName(format("cluster_%08d", self->dataDbs.size()))); + self->dataDbs[self->dataDbIndex.back()] = + Database::createSimulatedExtraDatabase(connectionString, cx->defaultTenant); + } + + if (self->clientId == 0) { + wait(success(MetaclusterAPI::createMetacluster( + cx.getReference(), + "management_cluster"_sr, + deterministicRandom()->randomInt(TenantAPI::TENANT_ID_PREFIX_MIN_VALUE, + TenantAPI::TENANT_ID_PREFIX_MAX_VALUE + 1)))); + } + return Void(); + } + + ClusterName chooseClusterName() { return dataDbIndex[deterministicRandom()->randomInt(0, dataDbIndex.size())]; } + + static Future verifyClusterRecovered(Database db) { + return success(runTransaction(db.getReference(), + [](Reference tr) { return tr->getReadVersion(); })); + } + + ACTOR static Future registerCluster(MetaclusterManagementConcurrencyWorkload* self) { + state ClusterName clusterName = self->chooseClusterName(); + state Database dataDb = self->dataDbs[clusterName]; + + 
state UID debugId = deterministicRandom()->randomUniqueID(); + + try { + state DataClusterEntry entry; + entry.capacity.numTenantGroups = deterministicRandom()->randomInt(0, 4); + loop { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyRegisteringCluster", debugId) + .detail("ClusterName", clusterName) + .detail("NumTenantGroups", entry.capacity.numTenantGroups); + Future registerFuture = + MetaclusterAPI::registerCluster(self->managementDb, + clusterName, + dataDb.getReference()->getConnectionRecord()->getConnectionString(), + entry); + + Optional result = wait(timeout(registerFuture, deterministicRandom()->randomInt(1, 30))); + if (result.present()) { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyRegisteredCluster", debugId) + .detail("ClusterName", clusterName) + .detail("NumTenantGroups", entry.capacity.numTenantGroups); + break; + } + } + } catch (Error& e) { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyRegisterClusterError", debugId) + .error(e) + .detail("ClusterName", clusterName); + if (e.code() != error_code_cluster_already_exists && e.code() != error_code_cluster_not_empty && + e.code() != error_code_cluster_already_registered && e.code() != error_code_cluster_removed) { + TraceEvent(SevError, "MetaclusterManagementConcurrencyRegisterClusterFailure", debugId) + .error(e) + .detail("ClusterName", clusterName); + } + return Void(); + } + + wait(verifyClusterRecovered(dataDb)); + return Void(); + } + + ACTOR static Future removeCluster(MetaclusterManagementConcurrencyWorkload* self) { + state ClusterName clusterName = self->chooseClusterName(); + state Database dataDb = self->dataDbs[clusterName]; + + state UID debugId = deterministicRandom()->randomUniqueID(); + + try { + loop { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyRemovingCluster", debugId) + .detail("ClusterName", clusterName); + Future removeFuture = MetaclusterAPI::removeCluster(self->managementDb, clusterName, false); + Optional result = 
wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); + if (result.present()) { + ASSERT(result.get()); + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyRemovedCluster", debugId) + .detail("ClusterName", clusterName); + break; + } + } + } catch (Error& e) { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyRemoveClusterError", debugId) + .error(e) + .detail("ClusterName", clusterName); + if (e.code() != error_code_cluster_not_found && e.code() != error_code_cluster_not_empty) { + TraceEvent(SevError, "MetaclusterManagementConcurrencyRemoveClusterFailure", debugId) + .error(e) + .detail("ClusterName", clusterName); + } + return Void(); + } + + wait(verifyClusterRecovered(dataDb)); + return Void(); + } + + ACTOR static Future listClusters(MetaclusterManagementConcurrencyWorkload* self) { + state ClusterName clusterName1 = self->chooseClusterName(); + state ClusterName clusterName2 = self->chooseClusterName(); + state int limit = deterministicRandom()->randomInt(1, self->dataDbs.size() + 1); + try { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyListClusters") + .detail("StartClusterName", clusterName1) + .detail("EndClusterName", clusterName2) + .detail("Limit", limit); + + std::map clusterList = + wait(MetaclusterAPI::listClusters(self->managementDb, clusterName1, clusterName2, limit)); + + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyListedClusters") + .detail("StartClusterName", clusterName1) + .detail("EndClusterName", clusterName2) + .detail("Limit", limit); + + ASSERT(clusterName1 <= clusterName2); + ASSERT(clusterList.size() <= limit); + } catch (Error& e) { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyListClustersError") + .error(e) + .detail("StartClusterName", clusterName1) + .detail("EndClusterName", clusterName2) + .detail("Limit", limit); + + if (e.code() != error_code_inverted_range) { + TraceEvent(SevError, "ListClusterFailure") + .error(e) + .detail("ClusterName1", clusterName1) + 
.detail("ClusterName2", clusterName2); + } + return Void(); + } + return Void(); + } + + ACTOR static Future getCluster(MetaclusterManagementConcurrencyWorkload* self) { + state ClusterName clusterName = self->chooseClusterName(); + state Database dataDb = self->dataDbs[clusterName]; + + try { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyGetCluster").detail("ClusterName", clusterName); + DataClusterMetadata clusterMetadata = wait(MetaclusterAPI::getCluster(self->managementDb, clusterName)); + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyGotCluster").detail("ClusterName", clusterName); + + ASSERT(dataDb.getReference()->getConnectionRecord()->getConnectionString() == + clusterMetadata.connectionString); + } catch (Error& e) { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyGetClusterError") + .error(e) + .detail("ClusterName", clusterName); + if (e.code() != error_code_cluster_not_found) { + TraceEvent(SevError, "GetClusterFailure").error(e).detail("ClusterName", clusterName); + } + return Void(); + } + + return Void(); + } + + ACTOR static Future> configureImpl(MetaclusterManagementConcurrencyWorkload* self, + ClusterName clusterName, + Optional numTenantGroups, + Optional connectionString) { + state Reference tr = self->managementDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + Optional clusterMetadata = + wait(MetaclusterAPI::tryGetClusterTransaction(tr, clusterName)); + state Optional entry; + + if (clusterMetadata.present()) { + if (numTenantGroups.present()) { + entry = clusterMetadata.get().entry; + entry.get().capacity.numTenantGroups = numTenantGroups.get(); + } + MetaclusterAPI::updateClusterMetadata( + tr, clusterName, clusterMetadata.get(), connectionString, entry); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + } + + return entry; + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + ACTOR static Future 
configureCluster(MetaclusterManagementConcurrencyWorkload* self) { + state ClusterName clusterName = self->chooseClusterName(); + state Database dataDb = self->dataDbs[clusterName]; + + state UID debugId = deterministicRandom()->randomUniqueID(); + + state Optional newNumTenantGroups; + state Optional connectionString; + if (deterministicRandom()->coinflip()) { + newNumTenantGroups = deterministicRandom()->randomInt(0, 4); + } + if (deterministicRandom()->coinflip()) { + connectionString = dataDb.getReference()->getConnectionRecord()->getConnectionString(); + } + + try { + loop { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyConfigureCluster", debugId) + .detail("ClusterName", clusterName) + .detail("NewNumTenantGroups", newNumTenantGroups.orDefault(-1)) + .detail("NewConnectionString", + connectionString.map(&ClusterConnectionString::toString).orDefault("")); + Optional> result = + wait(timeout(configureImpl(self, clusterName, newNumTenantGroups, connectionString), + deterministicRandom()->randomInt(1, 30))); + if (result.present()) { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyConfiguredCluster", debugId) + .detail("ClusterName", clusterName) + .detail("NewNumTenantGroups", newNumTenantGroups.orDefault(-1)) + .detail("NewConnectionString", + connectionString.map(&ClusterConnectionString::toString).orDefault("")); + break; + } + } + } catch (Error& e) { + TraceEvent(SevDebug, "MetaclusterManagementConcurrencyConfigureClusterError", debugId) + .error(e) + .detail("ClusterName", clusterName) + .detail("NewNumTenantGroups", newNumTenantGroups.orDefault(-1)) + .detail("NewConnectionString", connectionString.map(&ClusterConnectionString::toString).orDefault("")); + if (e.code() != error_code_cluster_not_found && e.code() != error_code_cluster_removed && + e.code() != error_code_invalid_metacluster_operation) { + TraceEvent(SevError, "ConfigureClusterFailure").error(e).detail("ClusterName", clusterName); + } + } + + return Void(); + } + + Future 
start(Database const& cx) override { return _start(cx, this); } + ACTOR static Future _start(Database cx, MetaclusterManagementConcurrencyWorkload* self) { + state double start = now(); + + // Run a random sequence of metacluster management operations for the duration of the test + while (now() < start + self->testDuration) { + state int operation = deterministicRandom()->randomInt(0, 5); + if (operation == 0) { + wait(registerCluster(self)); + } else if (operation == 1) { + wait(removeCluster(self)); + } else if (operation == 2) { + wait(listClusters(self)); + } else if (operation == 3) { + wait(getCluster(self)); + } else if (operation == 4) { + wait(configureCluster(self)); + } + } + + return Void(); + } + + Future check(Database const& cx) override { + if (clientId == 0) { + return _check(cx, this); + } else { + return true; + } + } + ACTOR static Future _check(Database cx, MetaclusterManagementConcurrencyWorkload* self) { + // The metacluster consistency check runs the tenant consistency check for each cluster + state MetaclusterConsistencyCheck metaclusterConsistencyCheck( + self->managementDb, AllowPartialMetaclusterOperations::True); + wait(metaclusterConsistencyCheck.run()); + + return true; + } + + void getMetrics(std::vector& m) override {} +}; + +WorkloadFactory MetaclusterManagementConcurrencyWorkloadFactory; diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index feffb7903d..d79a235767 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -213,10 +213,11 @@ struct MetaclusterManagementWorkload : TestWorkload { try { loop { // TODO: check force removal - Future removeFuture = MetaclusterAPI::removeCluster(self->managementDb, clusterName, false); + Future removeFuture = MetaclusterAPI::removeCluster(self->managementDb, clusterName, false); try { - Optional result = 
wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); + Optional result = wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { + ASSERT(result.get()); break; } else { retried = true; @@ -924,7 +925,7 @@ struct MetaclusterManagementWorkload : TestWorkload { std::vector> removeClusterFutures; for (auto [clusterName, clusterMetadata] : dataClusters) { removeClusterFutures.push_back( - MetaclusterAPI::removeCluster(self->managementDb, clusterName, !deleteTenants)); + success(MetaclusterAPI::removeCluster(self->managementDb, clusterName, !deleteTenants))); } wait(waitForAll(removeClusterFutures)); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 31704fcfbb..68cb382329 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -402,6 +402,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES slow/LongRunning.toml LONG_RUNNING) add_fdb_test(TEST_FILES slow/LowLatencyWithFailures.toml) add_fdb_test(TEST_FILES slow/MetaclusterManagement.toml) + add_fdb_test(TEST_FILES slow/MetaclusterManagementConcurrency.toml) add_fdb_test(TEST_FILES slow/MoveKeysClean.toml) add_fdb_test(TEST_FILES slow/MoveKeysSideband.toml) add_fdb_test(TEST_FILES slow/RyowCorrectness.toml) diff --git a/tests/slow/MetaclusterManagementConcurrency.toml b/tests/slow/MetaclusterManagementConcurrency.toml new file mode 100644 index 0000000000..bbeaf4aae7 --- /dev/null +++ b/tests/slow/MetaclusterManagementConcurrency.toml @@ -0,0 +1,16 @@ +[configuration] +allowDefaultTenant = false +tenantModes = ['optional', 'required'] +allowCreatingTenants = false +extraDatabaseMode = 'Multiple' +extraDatabaseCount = 5 + +[[test]] +testTitle = 'MetaclusterManagementConcurrencyTest' +clearAfterTest = true +timeout = 2100 +runSetup = true + + [[test.workload]] + testName = 'MetaclusterManagementConcurrency' + testDuration = 30 From a261c1d94c97e1d51ecdf9191e68fb6b93fe30a1 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Sat, 11 Feb 2023 19:46:47 -0800 Subject: [PATCH 37/57] Run tenant management concurrency alongside metacluster management concurrency. Fix a few issues where performing tenant operations returned undesirable errors when the associated cluster was removed. --- .../fdbclient/MetaclusterManagement.actor.h | 22 ++-- ...terManagementConcurrencyWorkload.actor.cpp | 15 ++- ...antManagementConcurrencyWorkload.actor.cpp | 116 ++++++++++++++---- .../MetaclusterManagementConcurrency.toml | 6 + 4 files changed, 126 insertions(+), 33 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 6a8a4d7ebb..7095121af2 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -272,7 +272,7 @@ struct MetaclusterOperationContext { // entry. if (self->clusterName.present()) { if (!currentDataClusterMetadata.present()) { - throw cluster_not_found(); + throw cluster_removed(); } else { currentMetaclusterRegistration = currentMetaclusterRegistration.get().toDataClusterRegistration( self->clusterName.get(), currentDataClusterMetadata.get().entry.id); @@ -288,7 +288,7 @@ struct MetaclusterOperationContext { // cluster metadata in the context and open a connection to the data DB. 
if (self->dataClusterMetadata.present() && self->dataClusterMetadata.get().entry.id != currentDataClusterMetadata.get().entry.id) { - throw cluster_not_found(); + throw cluster_removed(); } else if (self->clusterName.present()) { self->dataClusterMetadata = currentDataClusterMetadata; if (!self->dataClusterDb) { @@ -342,12 +342,12 @@ struct MetaclusterOperationContext { // Check that this is the expected data cluster and is part of the right metacluster if (!currentMetaclusterRegistration.present()) { if (!runOnDisconnectedCluster) { - throw invalid_metacluster_operation(); + throw cluster_removed(); } } else if (currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_DATA) { - throw invalid_metacluster_operation(); + throw cluster_removed(); } else if (!self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { - throw invalid_metacluster_operation(); + throw cluster_removed(); } self->dataClusterIsRegistered = currentMetaclusterRegistration.present(); @@ -805,6 +805,7 @@ struct RemoveClusterImpl { MetaclusterOperationContext ctx; // Initialization parameters + ClusterName clusterName; bool forceRemove; double dataClusterTimeout; @@ -815,11 +816,14 @@ struct RemoveClusterImpl { bool dataClusterUpdated = false; RemoveClusterImpl(Reference managementDb, ClusterName clusterName, bool forceRemove, double dataClusterTimeout) - : ctx(managementDb, clusterName, { DataClusterState::REGISTERING, DataClusterState::REMOVING }), - forceRemove(forceRemove) {} + : ctx(managementDb, Optional(), { DataClusterState::REGISTERING, DataClusterState::REMOVING }), + clusterName(clusterName), forceRemove(forceRemove), dataClusterTimeout(dataClusterTimeout) {} // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. 
ACTOR static Future markClusterRemoving(RemoveClusterImpl* self, Reference tr) { + state DataClusterMetadata clusterMetadata = wait(getClusterTransaction(tr, self->clusterName)); + wait(self->ctx.setCluster(tr, self->clusterName)); + if (!self->forceRemove && self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups > 0) { throw cluster_not_empty(); } else if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::REMOVING) { @@ -1016,7 +1020,7 @@ struct RemoveClusterImpl { self->dataClusterUpdated = true; } catch (Error& e) { // If this transaction gets retried, the metacluster information may have already been erased. - if (e.code() == error_code_invalid_metacluster_operation) { + if (e.code() == error_code_cluster_removed) { self->dataClusterUpdated = true; } else if (e.code() != error_code_timed_out) { throw; @@ -1028,7 +1032,7 @@ struct RemoveClusterImpl { wait(managementClusterPurgeDataCluster(self)); } catch (Error& e) { // If this transaction gets retried, the cluster may have already been deleted. 
- if (e.code() != error_code_cluster_not_found) { + if (e.code() != error_code_cluster_removed) { throw; } } diff --git a/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp index cf6d0ff732..33f4efd5ea 100644 --- a/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp @@ -123,6 +123,7 @@ struct MetaclusterManagementConcurrencyWorkload : TestWorkload { TraceEvent(SevError, "MetaclusterManagementConcurrencyRegisterClusterFailure", debugId) .error(e) .detail("ClusterName", clusterName); + ASSERT(false); } return Void(); } @@ -158,6 +159,7 @@ struct MetaclusterManagementConcurrencyWorkload : TestWorkload { TraceEvent(SevError, "MetaclusterManagementConcurrencyRemoveClusterFailure", debugId) .error(e) .detail("ClusterName", clusterName); + ASSERT(false); } return Void(); } @@ -194,10 +196,11 @@ struct MetaclusterManagementConcurrencyWorkload : TestWorkload { .detail("Limit", limit); if (e.code() != error_code_inverted_range) { - TraceEvent(SevError, "ListClusterFailure") + TraceEvent(SevError, "MetaclusterManagementConcurrencyListClusterFailure") .error(e) .detail("ClusterName1", clusterName1) .detail("ClusterName2", clusterName2); + ASSERT(false); } return Void(); } @@ -220,7 +223,10 @@ struct MetaclusterManagementConcurrencyWorkload : TestWorkload { .error(e) .detail("ClusterName", clusterName); if (e.code() != error_code_cluster_not_found) { - TraceEvent(SevError, "GetClusterFailure").error(e).detail("ClusterName", clusterName); + TraceEvent(SevError, "MetaclusterManagementConcurrencyGetClusterFailure") + .error(e) + .detail("ClusterName", clusterName); + ASSERT(false); } return Void(); } @@ -300,7 +306,10 @@ struct MetaclusterManagementConcurrencyWorkload : TestWorkload { .detail("NewConnectionString", connectionString.map(&ClusterConnectionString::toString).orDefault("")); if 
(e.code() != error_code_cluster_not_found && e.code() != error_code_cluster_removed && e.code() != error_code_invalid_metacluster_operation) { - TraceEvent(SevError, "ConfigureClusterFailure").error(e).detail("ClusterName", clusterName); + TraceEvent(SevError, "MetaclusterManagementConcurrencyConfigureClusterFailure") + .error(e) + .detail("ClusterName", clusterName); + ASSERT(false); } } diff --git a/fdbserver/workloads/TenantManagementConcurrencyWorkload.actor.cpp b/fdbserver/workloads/TenantManagementConcurrencyWorkload.actor.cpp index 339131b486..0b956c92ff 100644 --- a/fdbserver/workloads/TenantManagementConcurrencyWorkload.actor.cpp +++ b/fdbserver/workloads/TenantManagementConcurrencyWorkload.actor.cpp @@ -47,6 +47,7 @@ struct TenantManagementConcurrencyWorkload : TestWorkload { int maxTenantGroups; double testDuration; bool useMetacluster; + bool createMetacluster; Reference mvDb; Database dataDb; @@ -55,8 +56,11 @@ struct TenantManagementConcurrencyWorkload : TestWorkload { maxTenants = std::min(1e8 - 1, getOption(options, "maxTenants"_sr, 100)); maxTenantGroups = std::min(2 * maxTenants, getOption(options, "maxTenantGroups"_sr, 20)); testDuration = getOption(options, "testDuration"_sr, 120.0); + createMetacluster = getOption(options, "createMetacluster"_sr, true); - if (clientId == 0) { + if (hasOption(options, "useMetacluster"_sr)) { + useMetacluster = getOption(options, "useMetacluster"_sr, false); + } else if (clientId == 0) { useMetacluster = deterministicRandom()->coinflip(); } else { // Other clients read the metacluster state from the database @@ -100,16 +104,22 @@ struct TenantManagementConcurrencyWorkload : TestWorkload { MultiVersionApi::api->selectApiVersion(cx->apiVersion.version()); self->mvDb = MultiVersionDatabase::debugCreateFromExistingDatabase(threadSafeHandle); - if (self->useMetacluster && self->clientId == 0) { + if (self->useMetacluster && self->createMetacluster && self->clientId == 0) { 
wait(success(MetaclusterAPI::createMetacluster( cx.getReference(), "management_cluster"_sr, deterministicRandom()->randomInt(TenantAPI::TENANT_ID_PREFIX_MIN_VALUE, TenantAPI::TENANT_ID_PREFIX_MAX_VALUE + 1)))); - DataClusterEntry entry; - entry.capacity.numTenantGroups = 1e9; - wait(MetaclusterAPI::registerCluster(self->mvDb, "cluster1"_sr, g_simulator->extraDatabases[0], entry)); + state int extraDatabaseIdx; + for (extraDatabaseIdx = 0; extraDatabaseIdx < g_simulator->extraDatabases.size(); ++extraDatabaseIdx) { + DataClusterEntry entry; + entry.capacity.numTenantGroups = 1e9; + wait(MetaclusterAPI::registerCluster(self->mvDb, + ClusterName(fmt::format("cluster{}", extraDatabaseIdx)), + g_simulator->extraDatabases[extraDatabaseIdx], + entry)); + } } state Transaction tr(cx); @@ -145,10 +155,7 @@ struct TenantManagementConcurrencyWorkload : TestWorkload { } } - if (self->useMetacluster) { - ASSERT(g_simulator->extraDatabases.size() == 1); - self->dataDb = Database::createSimulatedExtraDatabase(g_simulator->extraDatabases[0], cx->defaultTenant); - } else { + if (!self->useMetacluster) { self->dataDb = cx; } @@ -175,27 +182,45 @@ struct TenantManagementConcurrencyWorkload : TestWorkload { ACTOR static Future createTenant(TenantManagementConcurrencyWorkload* self) { state TenantName tenant = self->chooseTenantName(); state TenantMapEntry entry; + + state UID debugId = deterministicRandom()->randomUniqueID(); + entry.tenantName = tenant; entry.tenantGroup = self->chooseTenantGroup(); try { loop { + TraceEvent(SevDebug, "TenantManagementConcurrencyCreatingTenant", debugId) + .detail("TenantName", entry.tenantName) + .detail("TenantGroup", entry.tenantGroup); Future createFuture = self->useMetacluster ? 
MetaclusterAPI::createTenant(self->mvDb, entry, AssignClusterAutomatically::True) : success(TenantAPI::createTenant(self->dataDb.getReference(), tenant, entry)); Optional result = wait(timeout(createFuture, 30)); if (result.present()) { + TraceEvent(SevDebug, "TenantManagementConcurrencyCreatedTenant", debugId) + .detail("TenantName", entry.tenantName) + .detail("TenantGroup", entry.tenantGroup); break; } } return Void(); } catch (Error& e) { - if (e.code() == error_code_tenant_removed) { + TraceEvent(SevDebug, "TenantManagementConcurrencyCreateTenantError", debugId) + .error(e) + .detail("TenantName", entry.tenantName) + .detail("TenantGroup", entry.tenantGroup); + if (e.code() == error_code_metacluster_no_capacity || e.code() == error_code_cluster_removed) { + ASSERT(self->useMetacluster && !self->createMetacluster); + } else if (e.code() == error_code_tenant_removed) { ASSERT(self->useMetacluster); } else if (e.code() != error_code_tenant_already_exists && e.code() != error_code_cluster_no_capacity) { - TraceEvent(SevError, "CreateTenantFailure").error(e).detail("TenantName", tenant); + TraceEvent(SevError, "TenantManagementConcurrencyCreateTenantFailure", debugId) + .error(e) + .detail("TenantName", entry.tenantName) + .detail("TenantGroup", entry.tenantGroup); ASSERT(false); } @@ -205,23 +230,36 @@ struct TenantManagementConcurrencyWorkload : TestWorkload { ACTOR static Future deleteTenant(TenantManagementConcurrencyWorkload* self) { state TenantName tenant = self->chooseTenantName(); + state UID debugId = deterministicRandom()->randomUniqueID(); try { loop { + TraceEvent(SevDebug, "TenantManagementConcurrencyDeletingTenant", debugId).detail("TenantName", tenant); Future deleteFuture = self->useMetacluster ? 
MetaclusterAPI::deleteTenant(self->mvDb, tenant) : TenantAPI::deleteTenant(self->dataDb.getReference(), tenant); Optional result = wait(timeout(deleteFuture, 30)); if (result.present()) { + TraceEvent(SevDebug, "TenantManagementConcurrencyDeletedTenant", debugId) + .detail("TenantName", tenant); break; } } return Void(); } catch (Error& e) { - if (e.code() != error_code_tenant_not_found) { - TraceEvent(SevError, "DeleteTenantFailure").error(e).detail("TenantName", tenant); + TraceEvent(SevDebug, "TenantManagementConcurrencyDeleteTenantError", debugId) + .error(e) + .detail("TenantName", tenant); + if (e.code() == error_code_cluster_removed) { + ASSERT(self->useMetacluster && !self->createMetacluster); + } else if (e.code() != error_code_tenant_not_found) { + TraceEvent(SevError, "TenantManagementConcurrencyDeleteTenantFailure", debugId) + .error(e) + .detail("TenantName", tenant); + + ASSERT(false); } return Void(); } @@ -257,21 +295,43 @@ struct TenantManagementConcurrencyWorkload : TestWorkload { ACTOR static Future configureTenant(TenantManagementConcurrencyWorkload* self) { state TenantName tenant = self->chooseTenantName(); state std::map, Optional> configParams; - configParams["tenant_group"_sr] = self->chooseTenantGroup(); + state Optional tenantGroup = self->chooseTenantGroup(); + state UID debugId = deterministicRandom()->randomUniqueID(); + + configParams["tenant_group"_sr] = tenantGroup; try { loop { + TraceEvent(SevDebug, "TenantManagementConcurrencyConfiguringTenant", debugId) + .detail("TenantName", tenant) + .detail("TenantGroup", tenantGroup); Optional result = wait(timeout(configureImpl(self, tenant, configParams), 30)); if (result.present()) { + TraceEvent(SevDebug, "TenantManagementConcurrencyConfiguredTenant", debugId) + .detail("TenantName", tenant) + .detail("TenantGroup", tenantGroup); break; } } return Void(); } catch (Error& e) { - if (e.code() != error_code_tenant_not_found && e.code() != error_code_invalid_tenant_state) { - 
TraceEvent(SevError, "ConfigureTenantFailure").error(e).detail("TenantName", tenant); + TraceEvent(SevDebug, "TenantManagementConcurrencyConfigureTenantError", debugId) + .error(e) + .detail("TenantName", tenant) + .detail("TenantGroup", tenantGroup); + if (e.code() == error_code_cluster_removed) { + ASSERT(self->useMetacluster && !self->createMetacluster); + } else if (e.code() == error_code_cluster_no_capacity || + e.code() == error_code_invalid_tenant_configuration) { + ASSERT(self->useMetacluster && !self->createMetacluster); + } else if (e.code() != error_code_tenant_not_found && e.code() != error_code_invalid_tenant_state) { + TraceEvent(SevError, "TenantManagementConcurrencyConfigureTenantFailure", debugId) + .error(e) + .detail("TenantName", tenant) + .detail("TenantGroup", tenantGroup); + ASSERT(false); } return Void(); } @@ -280,29 +340,43 @@ struct TenantManagementConcurrencyWorkload : TestWorkload { ACTOR static Future renameTenant(TenantManagementConcurrencyWorkload* self) { state TenantName oldTenant = self->chooseTenantName(); state TenantName newTenant = self->chooseTenantName(); + state UID debugId = deterministicRandom()->randomUniqueID(); try { loop { + TraceEvent(SevDebug, "TenantManagementConcurrencyRenamingTenant", debugId) + .detail("OldTenantName", oldTenant) + .detail("NewTenantName", newTenant); Future renameFuture = self->useMetacluster ? 
MetaclusterAPI::renameTenant(self->mvDb, oldTenant, newTenant) : TenantAPI::renameTenant(self->dataDb.getReference(), oldTenant, newTenant); Optional result = wait(timeout(renameFuture, 30)); if (result.present()) { + TraceEvent(SevDebug, "TenantManagementConcurrencyRenamedTenant", debugId) + .detail("OldTenantName", oldTenant) + .detail("NewTenantName", newTenant); break; } } return Void(); } catch (Error& e) { - if (e.code() == error_code_invalid_tenant_state || e.code() == error_code_tenant_removed || - e.code() == error_code_cluster_no_capacity) { + TraceEvent(SevDebug, "TenantManagementConcurrencyRenameTenantError", debugId) + .error(e) + .detail("OldTenantName", oldTenant) + .detail("NewTenantName", newTenant); + if (e.code() == error_code_cluster_removed) { + ASSERT(self->useMetacluster && !self->createMetacluster); + } else if (e.code() == error_code_invalid_tenant_state || e.code() == error_code_tenant_removed || + e.code() == error_code_cluster_no_capacity) { ASSERT(self->useMetacluster); } else if (e.code() != error_code_tenant_not_found && e.code() != error_code_tenant_already_exists) { - TraceEvent(SevError, "RenameTenantFailure") + TraceEvent(SevDebug, "TenantManagementConcurrencyRenameTenantFailure", debugId) .error(e) - .detail("OldTenant", oldTenant) - .detail("NewTenant", newTenant); + .detail("OldTenantName", oldTenant) + .detail("NewTenantName", newTenant); + ASSERT(false); } return Void(); } diff --git a/tests/slow/MetaclusterManagementConcurrency.toml b/tests/slow/MetaclusterManagementConcurrency.toml index bbeaf4aae7..2db298eb8b 100644 --- a/tests/slow/MetaclusterManagementConcurrency.toml +++ b/tests/slow/MetaclusterManagementConcurrency.toml @@ -14,3 +14,9 @@ runSetup = true [[test.workload]] testName = 'MetaclusterManagementConcurrency' testDuration = 30 + + [[test.workload]] + testName = 'TenantManagementConcurrency' + useMetacluster = true + createMetacluster = false + testDuration = 30 From 5ede2d439cec481dda877f020bd8f03cbc8aa0bd Mon 
Sep 17 00:00:00 2001 From: Dan Adkins Date: Mon, 13 Feb 2023 08:53:58 -0800 Subject: [PATCH 38/57] Disable machine attrition in DiskFailure workload. The machine attrition logic doesn't take into account the possibility that a disk corruption could cause an unrecoverable failure in the cluster. Before disabling attrition during the DiskFailure workload, the failure rate was >10/100,000 in the DiskFailureCycle test. Afterwards, there were no failures in 100,000 runs. --- fdbserver/workloads/DiskFailureInjection.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdbserver/workloads/DiskFailureInjection.actor.cpp b/fdbserver/workloads/DiskFailureInjection.actor.cpp index 0bb5953e2b..333fa01005 100644 --- a/fdbserver/workloads/DiskFailureInjection.actor.cpp +++ b/fdbserver/workloads/DiskFailureInjection.actor.cpp @@ -65,6 +65,9 @@ struct DiskFailureInjectionWorkload : FailureInjectionWorkload { periodicBroadcastInterval = getOption(options, "periodicBroadcastInterval"_sr, periodicBroadcastInterval); } + // TODO: Currently this workload doesn't play well with MachineAttrition. + void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("Attrition"); } + void initFailureInjectionMode(DeterministicRandom& random) override { enabled = clientId == 0; } Future setup(Database const& cx) override { return Void(); } From 473dd33a1f3538e7f24d8347cff3129008ddabc5 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Mon, 13 Feb 2023 11:53:47 -0800 Subject: [PATCH 39/57] Fix get mapped range test assertion to account for the possibility of a range terminating early when it reaches the end of a shard --- fdbserver/workloads/GetMappedRange.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/workloads/GetMappedRange.actor.cpp b/fdbserver/workloads/GetMappedRange.actor.cpp index faaaffbba8..03ae074a59 100644 --- a/fdbserver/workloads/GetMappedRange.actor.cpp +++ b/fdbserver/workloads/GetMappedRange.actor.cpp @@ -332,7 +332,7 @@ struct GetMappedRangeWorkload : ApiWorkload { } expectedCnt = std::min(expectedCnt, boundByRecord); std::cout << "boundByRecord: " << boundByRecord << std::endl; - ASSERT(result.size() == expectedCnt); + ASSERT_LE(result.size(), expectedCnt); beginSelector = KeySelector(firstGreaterThan(result.back().key)); } } else { From 958ff862e0eebba2ebb6159ac9caab9833c125e0 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 13 Feb 2023 12:59:48 -0800 Subject: [PATCH 40/57] Fix some merge issues --- fdbcli/MetaclusterCommands.actor.cpp | 4 ++-- .../fdbclient/MetaclusterManagement.actor.h | 5 ++++- ...lusterManagementConcurrencyWorkload.actor.cpp | 3 ++- .../MetaclusterRestoreWorkload.actor.cpp | 16 ++++++++++------ 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 00ef857194..941a94cdb4 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -184,8 +184,8 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector "to forget its metacluster association without fully removing it, use FORCE.\n"); } - wait(MetaclusterAPI::removeCluster(db, clusterName, clusterType, force)); - bool updatedDataCluster = wait(MetaclusterAPI::removeCluster(db, clusterName, clusterType, tokens.size() == 4, 15.0)); + bool updatedDataCluster = + wait(MetaclusterAPI::removeCluster(db, clusterName, 
clusterType, tokens.size() == 4, 15.0)); if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 47f0b85633..09e13b6829 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -861,6 +861,9 @@ struct RemoveClusterImpl { // Parameters set in markClusterRemoving Optional lastTenantId; + // Output parameter indicating whether the data cluster was updated during the removal + bool dataClusterUpdated = false; + RemoveClusterImpl(Reference db, ClusterName clusterName, ClusterType clusterType, @@ -1166,7 +1169,7 @@ struct RemoveClusterImpl { }; ACTOR template -Future removeCluster(Reference db, +Future removeCluster(Reference db, ClusterName name, ClusterType clusterType, bool forceRemove, diff --git a/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp index 33f4efd5ea..e3b923a012 100644 --- a/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp @@ -142,7 +142,8 @@ struct MetaclusterManagementConcurrencyWorkload : TestWorkload { loop { TraceEvent(SevDebug, "MetaclusterManagementConcurrencyRemovingCluster", debugId) .detail("ClusterName", clusterName); - Future removeFuture = MetaclusterAPI::removeCluster(self->managementDb, clusterName, false); + Future removeFuture = MetaclusterAPI::removeCluster( + self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, false); Optional result = wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { ASSERT(result.get()); diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp 
b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index 2c98b3e7de..a46de60514 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -174,7 +174,11 @@ struct MetaclusterRestoreWorkload : TestWorkload { MultiVersionApi::api->selectApiVersion(cx->apiVersion.version()); self->managementDb = MultiVersionDatabase::debugCreateFromExistingDatabase(threadSafeHandle); - wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr))); + wait(success(MetaclusterAPI::createMetacluster( + self->managementDb, + "management_cluster"_sr, + deterministicRandom()->randomInt(TenantAPI::TENANT_ID_PREFIX_MIN_VALUE, + TenantAPI::TENANT_ID_PREFIX_MAX_VALUE + 1)))); ASSERT(g_simulator->extraDatabases.size() > 0); state std::vector::iterator extraDatabasesItr; @@ -465,14 +469,14 @@ struct MetaclusterRestoreWorkload : TestWorkload { ACTOR static Future restoreManagementCluster(MetaclusterRestoreWorkload* self) { TraceEvent("MetaclusterRestoreWorkloadRestoringManagementCluster"); - wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr))); + wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr, 0))); state std::map::iterator clusterItr; for (clusterItr = self->dataDbs.begin(); clusterItr != self->dataDbs.end(); ++clusterItr) { TraceEvent("MetaclusterRestoreWorkloadProcessDataCluster").detail("FromCluster", clusterItr->first); // Remove the data cluster from its old metacluster - wait(MetaclusterAPI::removeCluster( - clusterItr->second.db.getReference(), clusterItr->first, ClusterType::METACLUSTER_DATA, true)); + wait(success(MetaclusterAPI::removeCluster( + clusterItr->second.db.getReference(), clusterItr->first, ClusterType::METACLUSTER_DATA, true))); TraceEvent("MetaclusterRestoreWorkloadForgotMetacluster").detail("ClusterName", clusterItr->first); state std::pair collisions = @@ -508,8 +512,8 @@ 
struct MetaclusterRestoreWorkload : TestWorkload { // If the restore did not succeed, remove the partially restored cluster try { - wait(MetaclusterAPI::removeCluster( - self->managementDb, clusterItr->first, ClusterType::METACLUSTER_MANAGEMENT, true)); + wait(success(MetaclusterAPI::removeCluster( + self->managementDb, clusterItr->first, ClusterType::METACLUSTER_MANAGEMENT, true))); TraceEvent("MetaclusterRestoreWorkloadRemoveFailedCluster") .detail("ClusterName", clusterItr->first); } catch (Error& e) { From f3b58a063fafd0fb8922e6f72a87fc3bd64d0342 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 13 Feb 2023 15:32:44 -0800 Subject: [PATCH 41/57] Fix some merge issues and review comments --- .../fdbclient/MetaclusterManagement.actor.h | 49 ++++++++++--------- .../workloads/MetaclusterConsistency.actor.h | 2 - .../workloads/TenantConsistency.actor.h | 4 +- .../MetaclusterRestoreWorkload.actor.cpp | 6 ++- flow/Error.cpp | 4 +- 5 files changed, 37 insertions(+), 28 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 09e13b6829..043fe6eb54 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -911,7 +911,7 @@ struct RemoveClusterImpl { // Delete metacluster metadata from the data cluster ACTOR template - static Future updateDataCluster(RemoveClusterImpl* self, Reference tr) { + static Future updateDataCluster(RemoveClusterImpl* self, Reference tr, UID clusterId) { if (self->ctx.dataClusterIsRegistered) { // Delete metacluster related metadata MetaclusterMetadata::metaclusterRegistration().clear(tr); @@ -931,10 +931,10 @@ struct RemoveClusterImpl { } // Insert a tombstone marking this tenant removed even if we aren't registered - MetaclusterMetadata::registrationTombstones().insert(tr, self->ctx.metaclusterRegistration.get().id); + 
MetaclusterMetadata::registrationTombstones().insert(tr, clusterId); TraceEvent("RemovedMetaclusterRegistrationOnDataCluster") - .detail("Name", self->ctx.clusterName.get()) + .detail("Name", self->clusterName) .detail("WasRegistered", self->ctx.dataClusterIsRegistered); return Void(); @@ -1087,21 +1087,21 @@ struct RemoveClusterImpl { if (metaclusterRegistrationEntry.get().clusterType != ClusterType::METACLUSTER_DATA) { TraceEvent(SevWarn, "CannotRemoveNonDataCluster") - .detail("ClusterName", self->ctx.clusterName.get()) + .detail("ClusterName", self->clusterName) .detail("MetaclusterRegistration", metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); throw invalid_metacluster_operation(); } - if (metaclusterRegistrationEntry.get().name != self->ctx.clusterName.get()) { + if (metaclusterRegistrationEntry.get().name != self->clusterName) { TraceEvent(SevWarn, "CannotRemoveDataClusterWithNameMismatch") - .detail("ExpectedName", self->ctx.clusterName.get()) + .detail("ExpectedName", self->clusterName) .detail("MetaclusterRegistration", metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); throw invalid_metacluster_operation(); } - wait(updateDataCluster(self, tr)); + wait(updateDataCluster(self, tr, metaclusterRegistrationEntry.get().id)); wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); return Void(); @@ -1136,7 +1136,9 @@ struct RemoveClusterImpl { try { Future f = self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return updateDataCluster(self, tr); }, + [self = self](Reference tr) { + return updateDataCluster(self, tr, self->ctx.metaclusterRegistration.get().id); + }, RunOnDisconnectedCluster::True); if (self->forceRemove && self->dataClusterTimeout > 0) { @@ -1460,7 +1462,7 @@ struct RestoreClusterImpl { state KeyBackedRangeResult> tenants = wait(TenantMetadata::tenantMap().getRange(tr, {}, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); - for (auto t : tenants.results) { + for (auto const& t : 
tenants.results) { self->dataClusterTenantMap.emplace(t.first, t.second); self->dataClusterTenantNames.insert(t.second.tenantName); } @@ -1475,7 +1477,7 @@ struct RestoreClusterImpl { wait(ManagementClusterMetadata::tenantMetadata().tenantMap.getRange( tr, initialTenantId, {}, CLIENT_KNOBS->MAX_TENANTS_PER_CLUSTER)); - for (auto t : tenants.results) { + for (auto const& t : tenants.results) { self->mgmtClusterTenantMap.emplace(t.first, t.second); if (t.second.assignedCluster.present() && self->ctx.clusterName.get() == t.second.assignedCluster.get()) { self->mgmtClusterTenantSetForCurrentDataCluster.emplace(t.first); @@ -1579,7 +1581,8 @@ struct RestoreClusterImpl { self->mgmtClusterTenantMap.find(tenantEntry.id); // A data cluster tenant is not present on the management cluster - if (managementEntry == self->mgmtClusterTenantMap.end()) { + if (managementEntry == self->mgmtClusterTenantMap.end() || + managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get()) { wait(self->ctx.runDataClusterTransaction([tenantEntry = tenantEntry](Reference tr) { return TenantAPI::deleteTenantTransaction(tr, tenantEntry.id, ClusterType::METACLUSTER_DATA); })); @@ -1589,9 +1592,6 @@ struct RestoreClusterImpl { state TenantName tenantName = tenantEntry.tenantName; state TenantMapEntry managementTenant = managementEntry->second; - ASSERT(managementTenant.assignedCluster.present() && - managementTenant.assignedCluster.get() == self->ctx.clusterName.get()); - // Rename if (tenantName != managementTenant.tenantName) { state TenantName temporaryName; @@ -1718,10 +1718,15 @@ struct RestoreClusterImpl { ACTOR static Future addTenantBatchToManagementCluster(RestoreClusterImpl* self, Reference tr, std::vector tenants) { + Optional tenantIdPrefix = wait(TenantMetadata::tenantIdPrefix().get(tr)); + ASSERT(tenantIdPrefix.present()); + state std::vector> futures; - state int64_t maxId = 0; + state int64_t maxId = tenantIdPrefix.get() << 48; for (auto const& t : tenants) { - maxId = 
std::max(maxId, t.id); + if (TenantAPI::getTenantIdPrefix(t.id) == tenantIdPrefix.get()) { + maxId = std::max(maxId, t.id); + } futures.push_back(addTenantToManagementCluster(self, tr, t)); } @@ -1773,8 +1778,8 @@ struct RestoreClusterImpl { if (managementEntry == self->mgmtClusterTenantMap.end()) { tenantBatch.push_back(itr->second); } else if (managementEntry->second.tenantName != itr->second.tenantName || - managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get()) { - ASSERT(managementEntry->second.matchesConfiguration(itr->second)); + managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get() || + !managementEntry->second.matchesConfiguration(itr->second)) { self->messages.push_back( fmt::format("The tenant `{}' has the same ID {} as an existing tenant `{}' on cluster `{}'", printable(itr->second.tenantName), @@ -1924,7 +1929,7 @@ struct RestoreClusterImpl { wait(self->ctx.runDataClusterTransaction( [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); - // Fix any differences between the data cluster and the management cluster + // Add all tenants from the data cluster to the management cluster wait(addTenantsToManagementCluster(self)); // set restored cluster to ready state @@ -2088,11 +2093,11 @@ struct CreateTenantImpl { if (availableClusters.results.empty()) { throw metacluster_no_capacity(); } - for (auto clusterTuple : availableClusters.results) { + for (auto const& clusterTuple : availableClusters.results) { dataClusterNames.push_back(clusterTuple.getString(1)); } } - for (auto dataClusterName : dataClusterNames) { + for (auto const& dataClusterName : dataClusterNames) { dataClusterDbs.push_back(getAndOpenDatabase(tr, dataClusterName)); } wait(waitForAll(dataClusterDbs)); @@ -2115,7 +2120,7 @@ struct CreateTenantImpl { // Get the first cluster that was available state Optional chosenCluster; - for (auto f : clusterAvailabilityChecks) { + for (auto const& f : clusterAvailabilityChecks) { if 
(f.isReady()) { chosenCluster = f.get(); break; diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h index a0adbfd8d7..4d15a57e4d 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h @@ -219,7 +219,6 @@ private: std::set processedTenantGroups; for (auto [tenantId, entry] : managementMetadata.tenantMap) { ASSERT(entry.assignedCluster.present()); - ASSERT(TenantAPI::getTenantIdPrefix(tenantId) == managementMetadata.tenantIdPrefix.get()); // Each tenant should be assigned to the same cluster where it is stored in the cluster tenant index auto clusterItr = managementMetadata.clusterTenantMap.find(entry.assignedCluster.get()); @@ -333,7 +332,6 @@ private: TenantMapEntry const& metaclusterEntry = self->managementMetadata.tenantMap[tenantId]; ASSERT(!entry.assignedCluster.present()); ASSERT_EQ(entry.id, metaclusterEntry.id); - ASSERT(TenantAPI::getTenantIdPrefix(entry.id) == self->managementMetadata.tenantIdPrefix.get()); ASSERT(entry.tenantName == metaclusterEntry.tenantName); ASSERT_EQ(entry.tenantState, TenantState::READY); diff --git a/fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h b/fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h index c8414ac387..8b28c21ab6 100644 --- a/fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/TenantConsistency.actor.h @@ -153,7 +153,9 @@ private: for (auto [tenantId, tenantMapEntry] : metadata.tenantMap) { ASSERT_EQ(tenantId, tenantMapEntry.id); if (metadata.clusterType != ClusterType::METACLUSTER_DATA) { - ASSERT_LE(tenantId, metadata.lastTenantId); + if (TenantAPI::getTenantIdPrefix(tenantId) == TenantAPI::getTenantIdPrefix(metadata.lastTenantId)) { + ASSERT_LE(tenantId, metadata.lastTenantId); + } } 
ASSERT_EQ(metadata.tenantNameIndex[tenantMapEntry.tenantName], tenantId); diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index a46de60514..0b431b5bf0 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -469,7 +469,11 @@ struct MetaclusterRestoreWorkload : TestWorkload { ACTOR static Future restoreManagementCluster(MetaclusterRestoreWorkload* self) { TraceEvent("MetaclusterRestoreWorkloadRestoringManagementCluster"); - wait(success(MetaclusterAPI::createMetacluster(self->managementDb, "management_cluster"_sr, 0))); + wait(success(MetaclusterAPI::createMetacluster( + self->managementDb, + "management_cluster"_sr, + deterministicRandom()->randomInt(TenantAPI::TENANT_ID_PREFIX_MIN_VALUE, + TenantAPI::TENANT_ID_PREFIX_MAX_VALUE + 1)))); state std::map::iterator clusterItr; for (clusterItr = self->dataDbs.begin(); clusterItr != self->dataDbs.end(); ++clusterItr) { TraceEvent("MetaclusterRestoreWorkloadProcessDataCluster").detail("FromCluster", clusterItr->first); diff --git a/flow/Error.cpp b/flow/Error.cpp index 0f3455fcb3..41cbb419a3 100644 --- a/flow/Error.cpp +++ b/flow/Error.cpp @@ -27,10 +27,10 @@ bool g_crashOnError = false; -#define DEBUG_ERROR 1 +#define DEBUG_ERROR 0 #if DEBUG_ERROR -std::set debugErrorSet = std::set{ error_code_invalid_metacluster_operation }; +std::set debugErrorSet = std::set{ error_code_platform_error }; #define SHOULD_LOG_ERROR(x) (debugErrorSet.count(x) > 0) #endif From a4d3035f64d97159a7581a87d0579349197bcefc Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 7 Feb 2023 14:30:47 -0800 Subject: [PATCH 42/57] Enable RocksDB restarting tests Disable sharded rocks storage for downgrade tests where we Need to keep knob "shard_encode_location_metadata" so that downgrade tests can pass the second phase. 
--- fdbserver/SimulatedCluster.actor.cpp | 8 -------- tests/fast/BlobGranuleMoveVerifyCycle.toml | 2 +- tests/fast/BlobGranuleVerifyAtomicOps.toml | 2 +- tests/fast/BlobGranuleVerifyCycle.toml | 2 +- tests/fast/BlobGranuleVerifySmall.toml | 2 +- tests/fast/BlobGranuleVerifySmallClean.toml | 2 +- tests/rare/BlobGranuleRanges.toml | 2 +- .../ConfigureStorageMigrationTestRestart-1.toml | 2 +- .../to_7.1.0_until_7.2.0/CycleTestRestart-1.toml | 2 +- tests/slow/BlobGranuleCorrectness.toml | 2 +- tests/slow/BlobGranuleCorrectnessClean.toml | 2 +- tests/slow/BlobGranuleVerifyBalance.toml | 2 +- tests/slow/BlobGranuleVerifyBalanceClean.toml | 2 +- tests/slow/BlobGranuleVerifyLarge.toml | 2 +- tests/slow/BlobGranuleVerifyLargeClean.toml | 2 +- 15 files changed, 14 insertions(+), 22 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 3b4aa4ff48..e73a6365f8 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2666,14 +2666,6 @@ ACTOR void setupAndRun(std::string dataFolder, testConfig.storageEngineExcludeTypes.push_back(5); } - // The RocksDB storage engine does not support the restarting tests because you cannot consistently get a clean - // snapshot of the storage engine without a snapshotting file system. - // https://github.com/apple/foundationdb/issues/5155 - if (std::string_view(testFile).find("restarting") != std::string_view::npos) { - testConfig.storageEngineExcludeTypes.push_back(4); - testConfig.storageEngineExcludeTypes.push_back(5); - } - // The RocksDB engine is not always built with the rest of fdbserver. Don't try to use it if it is not included // in the build. 
if (!rocksDBEnabled) { diff --git a/tests/fast/BlobGranuleMoveVerifyCycle.toml b/tests/fast/BlobGranuleMoveVerifyCycle.toml index 4f1267aac7..5152fd65f4 100644 --- a/tests/fast/BlobGranuleMoveVerifyCycle.toml +++ b/tests/fast/BlobGranuleMoveVerifyCycle.toml @@ -3,7 +3,7 @@ testClass = "BlobGranule" blobGranulesEnabled = true allowDefaultTenant = false # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4] +# storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleMoveVerifyCycle' diff --git a/tests/fast/BlobGranuleVerifyAtomicOps.toml b/tests/fast/BlobGranuleVerifyAtomicOps.toml index 84e946c004..6d26499967 100644 --- a/tests/fast/BlobGranuleVerifyAtomicOps.toml +++ b/tests/fast/BlobGranuleVerifyAtomicOps.toml @@ -5,7 +5,7 @@ allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyAtomicOps' diff --git a/tests/fast/BlobGranuleVerifyCycle.toml b/tests/fast/BlobGranuleVerifyCycle.toml index 1cf056b87b..b3e4fdedcd 100644 --- a/tests/fast/BlobGranuleVerifyCycle.toml +++ b/tests/fast/BlobGranuleVerifyCycle.toml @@ -5,7 +5,7 @@ allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyCycle' diff --git a/tests/fast/BlobGranuleVerifySmall.toml b/tests/fast/BlobGranuleVerifySmall.toml index ba50b0fda6..42b37c4f04 100644 --- a/tests/fast/BlobGranuleVerifySmall.toml +++ b/tests/fast/BlobGranuleVerifySmall.toml @@ -5,7 +5,7 @@ allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifySmall' diff --git 
a/tests/fast/BlobGranuleVerifySmallClean.toml b/tests/fast/BlobGranuleVerifySmallClean.toml index ef957b9f53..8fb36517e6 100644 --- a/tests/fast/BlobGranuleVerifySmallClean.toml +++ b/tests/fast/BlobGranuleVerifySmallClean.toml @@ -2,7 +2,7 @@ blobGranulesEnabled = true allowDefaultTenant = false # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] testClass = "BlobGranule" [[test]] diff --git a/tests/rare/BlobGranuleRanges.toml b/tests/rare/BlobGranuleRanges.toml index aa38cdf5f8..85f7dd7fb7 100644 --- a/tests/rare/BlobGranuleRanges.toml +++ b/tests/rare/BlobGranuleRanges.toml @@ -4,7 +4,7 @@ allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleRanges' diff --git a/tests/restarting/to_7.1.0_until_7.2.0/ConfigureStorageMigrationTestRestart-1.toml b/tests/restarting/to_7.1.0_until_7.2.0/ConfigureStorageMigrationTestRestart-1.toml index fc63bdea57..50164841d1 100644 --- a/tests/restarting/to_7.1.0_until_7.2.0/ConfigureStorageMigrationTestRestart-1.toml +++ b/tests/restarting/to_7.1.0_until_7.2.0/ConfigureStorageMigrationTestRestart-1.toml @@ -3,7 +3,7 @@ extraMachineCountDC = 2 maxTLogVersion=6 disableHostname=true disableEncryption=true -storageEngineExcludeTypes=[3, 4] +storageEngineExcludeTypes=[3, 4, 5] tenantModes=['disabled'] [[knobs]] diff --git a/tests/restarting/to_7.1.0_until_7.2.0/CycleTestRestart-1.toml b/tests/restarting/to_7.1.0_until_7.2.0/CycleTestRestart-1.toml index db37256797..54f8a7994a 100644 --- a/tests/restarting/to_7.1.0_until_7.2.0/CycleTestRestart-1.toml +++ b/tests/restarting/to_7.1.0_until_7.2.0/CycleTestRestart-1.toml @@ -1,5 +1,5 @@ [configuration] -storageEngineExcludeTypes = [3] +storageEngineExcludeTypes = [3, 5] maxTLogVersion = 6 disableTss = true disableHostname = true diff --git 
a/tests/slow/BlobGranuleCorrectness.toml b/tests/slow/BlobGranuleCorrectness.toml index f651594d38..c5f5369359 100644 --- a/tests/slow/BlobGranuleCorrectness.toml +++ b/tests/slow/BlobGranuleCorrectness.toml @@ -5,7 +5,7 @@ tenantModes = ['optional', 'required'] injectTargetedSSRestart = true injectSSDelay = true # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] encryptModes = ['domain_aware', 'cluster_aware'] [[knobs]] diff --git a/tests/slow/BlobGranuleCorrectnessClean.toml b/tests/slow/BlobGranuleCorrectnessClean.toml index d6c83d63d0..22c5bd4dc0 100644 --- a/tests/slow/BlobGranuleCorrectnessClean.toml +++ b/tests/slow/BlobGranuleCorrectnessClean.toml @@ -3,7 +3,7 @@ blobGranulesEnabled = true allowDefaultTenant = false tenantModes = ['optional', 'required'] # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] encryptModes = ['domain_aware', 'cluster_aware'] [[knobs]] diff --git a/tests/slow/BlobGranuleVerifyBalance.toml b/tests/slow/BlobGranuleVerifyBalance.toml index 91f97d6d04..e610ff6299 100644 --- a/tests/slow/BlobGranuleVerifyBalance.toml +++ b/tests/slow/BlobGranuleVerifyBalance.toml @@ -4,7 +4,7 @@ allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyBalance' diff --git a/tests/slow/BlobGranuleVerifyBalanceClean.toml b/tests/slow/BlobGranuleVerifyBalanceClean.toml index 4ea976020e..5a5627f95f 100644 --- a/tests/slow/BlobGranuleVerifyBalanceClean.toml +++ b/tests/slow/BlobGranuleVerifyBalanceClean.toml @@ -2,7 +2,7 @@ blobGranulesEnabled = true allowDefaultTenant = false # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyBalanceClean' diff --git 
a/tests/slow/BlobGranuleVerifyLarge.toml b/tests/slow/BlobGranuleVerifyLarge.toml index 01aac91356..dffb8579c8 100644 --- a/tests/slow/BlobGranuleVerifyLarge.toml +++ b/tests/slow/BlobGranuleVerifyLarge.toml @@ -4,7 +4,7 @@ allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyLarge' diff --git a/tests/slow/BlobGranuleVerifyLargeClean.toml b/tests/slow/BlobGranuleVerifyLargeClean.toml index a7adc4243a..7f2f1ce423 100644 --- a/tests/slow/BlobGranuleVerifyLargeClean.toml +++ b/tests/slow/BlobGranuleVerifyLargeClean.toml @@ -2,7 +2,7 @@ blobGranulesEnabled = true allowDefaultTenant = false # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [4, 5] +# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyLargeClean' From 401b9c89186d1d0dfd9429bf615c67efed23f502 Mon Sep 17 00:00:00 2001 From: Ata E Husain Bohra Date: Tue, 14 Feb 2023 08:34:41 -0800 Subject: [PATCH 43/57] EaR: Helper routines to support configurable encryption (#9368) * EaR: Helper routines to support configurable encryption Description Add helper methods to BlobCipherEncryptHeaderRef enabling: 1. Extract 'IV' abstracting out underlying algorithm header 1. Extract 'cipherDetails' abstracting out underlying algorithm header Testing BlobCipherUnitTest & EncryptionOps are updated - 100K loop * EaR: Helper routines to support configurable encryption Description Add helper methods to BlobCipherEncryptHeaderRef enabling: 1. Extract 'IV' abstracting out underlying algorithm header 1. 
Extract 'cipherDetails' abstracting out underlying algorithm header Testing BlobCipherUnitTest & EncryptionOps are updated - 100K loop --- fdbclient/BlobCipher.cpp | 200 +++++++++++++++----- fdbclient/include/fdbclient/BlobCipher.h | 17 ++ fdbserver/workloads/EncryptionOps.actor.cpp | 18 ++ 3 files changed, 187 insertions(+), 48 deletions(-) diff --git a/fdbclient/BlobCipher.cpp b/fdbclient/BlobCipher.cpp index 23588a4c23..bea45205f4 100644 --- a/fdbclient/BlobCipher.cpp +++ b/fdbclient/BlobCipher.cpp @@ -60,6 +60,58 @@ #define BLOB_CIPHER_DEBUG false #define BLOB_CIPHER_SERIALIZATION_CHECKS false +namespace { +void validateEncryptHeaderFlagVersion(const int flagsVersion) { + ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); + + if (flagsVersion > CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION) { + TraceEvent("EncryptHeaderUnsupportedFlagVersion") + .detail("MaxSupportedVersion", CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION) + .detail("Version", flagsVersion); + throw not_implemented(); + } +} + +void validateEncryptHeaderAlgoHeaderVersion(const EncryptCipherMode cipherMode, + const EncryptAuthTokenMode authMode, + const EncryptAuthTokenAlgo authAlgo, + const int version) { + ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); + + if (cipherMode != ENCRYPT_CIPHER_MODE_AES_256_CTR) { + TraceEvent("EncryptHeaderUnsupportedEncryptCipherMode") + .detail("MaxSupportedVersion", CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION) + .detail("CipherMode", cipherMode); + throw not_implemented(); + } + + int maxSupportedVersion = -1; + if (authMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + maxSupportedVersion = CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_NO_AUTH_VERSION; + } else { + ASSERT_EQ(authMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); + + if (authAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA) { + maxSupportedVersion = CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_HMAC_SHA_AUTH_VERSION; + } else if (authAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC) { + maxSupportedVersion = 
CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_AES_CMAC_AUTH_VERSION; + } else { + // Unknown encryption authentication algo + } + } + + if (version > maxSupportedVersion || maxSupportedVersion == -1) { + TraceEvent("EncryptHeaderUnsupportedEncryptAuthToken") + .detail("CipherMode", cipherMode) + .detail("AuthMode", authMode) + .detail("AuthAlgo", authAlgo) + .detail("AlgoHeaderVersion", version) + .detail("MaxSsupportedVersion", maxSupportedVersion); + throw not_implemented(); + } +} +} // namespace + // BlobCipherEncryptHeaderRef uint32_t BlobCipherEncryptHeaderRef::getHeaderSize(const int flagVersion, @@ -91,62 +143,93 @@ uint32_t BlobCipherEncryptHeaderRef::getHeaderSize(const int flagVersion, return total; } +const uint8_t* BlobCipherEncryptHeaderRef::getIV() const { + ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); + + validateEncryptHeaderFlagVersion(flagsVersion); + ASSERT_EQ(flagsVersion, 1); + + BlobCipherEncryptHeaderFlagsV1 flags = std::get(this->flags); + + validateEncryptHeaderAlgoHeaderVersion((EncryptCipherMode)flags.encryptMode, + (EncryptAuthTokenMode)flags.authTokenMode, + (EncryptAuthTokenAlgo)flags.authTokenAlgo, + algoHeaderVersion); + ASSERT_EQ(algoHeaderVersion, 1); + + return std::visit([](auto& h) { return h.iv; }, algoHeader); +} + +template +inline constexpr bool always_false_v = false; + +const EncryptHeaderCipherDetails BlobCipherEncryptHeaderRef::getCipherDetails() const { + ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); + + validateEncryptHeaderFlagVersion(flagsVersion); + ASSERT_EQ(flagsVersion, 1); + + BlobCipherEncryptHeaderFlagsV1 flags = std::get(this->flags); + + validateEncryptHeaderAlgoHeaderVersion((EncryptCipherMode)flags.encryptMode, + (EncryptAuthTokenMode)flags.authTokenMode, + (EncryptAuthTokenAlgo)flags.authTokenAlgo, + algoHeaderVersion); + ASSERT_EQ(algoHeaderVersion, 1); + + // TODO: Replace with "Overload visitor pattern" someday. 
+ return std::visit( + [](auto&& h) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return EncryptHeaderCipherDetails(h.cipherTextDetails); + } else if constexpr (std::is_same_v> || + std::is_same_v>) { + return EncryptHeaderCipherDetails(h.cipherTextDetails, h.cipherHeaderDetails); + } else { + static_assert(always_false_v, "Unknown encryption authentication"); + } + }, + algoHeader); +} + void BlobCipherEncryptHeaderRef::validateEncryptionHeaderDetails(const BlobCipherDetails& textCipherDetails, const BlobCipherDetails& headerCipherDetails, const StringRef& ivRef) const { ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); - if (flagsVersion > CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION) { - TraceEvent("ValidateEncryptHeaderUnsupportedFlagVersion") - .detail("MaxSupportedVersion", CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION) - .detail("Version", flagsVersion); - throw not_implemented(); - } + validateEncryptHeaderFlagVersion(flagsVersion); + ASSERT_EQ(flagsVersion, 1); BlobCipherEncryptHeaderFlagsV1 flags = std::get(this->flags); + + validateEncryptHeaderAlgoHeaderVersion((EncryptCipherMode)flags.encryptMode, + (EncryptAuthTokenMode)flags.authTokenMode, + (EncryptAuthTokenAlgo)flags.authTokenAlgo, + algoHeaderVersion); + ASSERT_EQ(algoHeaderVersion, 1); + BlobCipherDetails persistedTextCipherDetails; BlobCipherDetails persistedHeaderCipherDetails; uint8_t* persistedIV = nullptr; - if (flags.authTokenMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { - if (algoHeaderVersion > CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_NO_AUTH_VERSION) { - TraceEvent("ValidateEncryptHeaderUnsupportedAlgoHeaderVersion") - .detail("AuthMode", "No-Auth") - .detail("MaxSupportedVersion", CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_NO_AUTH_VERSION) - .detail("Version", algoHeaderVersion); - throw not_implemented(); - } - persistedTextCipherDetails = std::get(this->algoHeader).cipherTextDetails; - persistedIV = (uint8_t*)(&std::get(this->algoHeader).iv[0]); - } else { - if 
(flags.authTokenAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA) { - if (algoHeaderVersion > CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_NO_AUTH_VERSION) { - TraceEvent("ValidateEncryptHeaderUnsupportedAlgoHeaderVersion") - .detail("AuthMode", "Hmac-Sha") - .detail("MaxSupportedVersion", CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_HMAC_SHA_AUTH_VERSION) - .detail("Version", algoHeaderVersion); - } - persistedTextCipherDetails = - std::get>(this->algoHeader).cipherTextDetails; - persistedHeaderCipherDetails = - std::get>(this->algoHeader).cipherHeaderDetails; - persistedIV = (uint8_t*)(&std::get>(this->algoHeader).iv[0]); - } else if (flags.authTokenAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC) { - if (algoHeaderVersion > CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_AES_CMAC_AUTH_VERSION) { - TraceEvent("ValidateEncryptHeaderUnsupportedAlgoHeaderVersion") - .detail("AuthMode", "Aes-Cmac") - .detail("MaxSupportedVersion", CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_AES_CMAC_AUTH_VERSION) - .detail("Version", algoHeaderVersion); - } - persistedTextCipherDetails = - std::get>(this->algoHeader).cipherTextDetails; - persistedHeaderCipherDetails = - std::get>(this->algoHeader).cipherHeaderDetails; - persistedIV = (uint8_t*)(&std::get>(this->algoHeader).iv[0]); - } else { - throw not_implemented(); - } - } + // TODO: Replace with "Overload visitor pattern" someday. 
+ return std::visit( + [&persistedTextCipherDetails, &persistedHeaderCipherDetails, &persistedIV](auto&& h) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + persistedTextCipherDetails = h.cipherTextDetails; + persistedIV = (uint8_t*)&h.iv[0]; + } else if constexpr (std::is_same_v> || + std::is_same_v>) { + persistedTextCipherDetails = h.cipherTextDetails; + persistedHeaderCipherDetails = h.cipherHeaderDetails; + persistedIV = (uint8_t*)&h.iv[0]; + } else { + static_assert(always_false_v, "Unknown encryption authentication"); + } + }, + algoHeader); // Validate encryption header 'cipherHeader' details sanity if (flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && @@ -1808,8 +1891,17 @@ void testConfigurableEncryptionHeaderNoAuthMode(const int minDomainId) { BlobCipherEncryptHeaderFlagsV1 flags = std::get(headerRef.flags); AesCtrNoAuthV1 noAuth = std::get(headerRef.algoHeader); - Standalone serHeaderRef = BlobCipherEncryptHeaderRef::toStringRef(headerRef); + const uint8_t* headerIV = headerRef.getIV(); + ASSERT_EQ(memcmp(&headerIV[0], &iv[0], AES_256_IV_LENGTH), 0); + + EncryptHeaderCipherDetails validateDetails = headerRef.getCipherDetails(); + ASSERT(validateDetails.textCipherDetails.isValid() && + validateDetails.textCipherDetails == + BlobCipherDetails(cipherKey->getDomainId(), cipherKey->getBaseCipherId(), cipherKey->getSalt())); + ASSERT(!validateDetails.headerCipherDetails.present()); + + Standalone serHeaderRef = BlobCipherEncryptHeaderRef::toStringRef(headerRef); BlobCipherEncryptHeaderRef validateHeader = BlobCipherEncryptHeaderRef::fromStringRef(serHeaderRef); BlobCipherEncryptHeaderFlagsV1 validateFlags = std::get(validateHeader.flags); ASSERT(validateFlags == flags); @@ -2093,8 +2185,20 @@ void testConfigurableEncryptionHeaderSingleAuthMode(int minDomainId) { BlobCipherEncryptHeaderFlagsV1 flags = std::get(headerRef.flags); AesCtrWithAuthV1 algoHeader = std::get>(headerRef.algoHeader); - Standalone serHeaderRef = 
BlobCipherEncryptHeaderRef::toStringRef(headerRef); + const uint8_t* headerIV = headerRef.getIV(); + ASSERT_EQ(memcmp(&headerIV[0], &iv[0], AES_256_IV_LENGTH), 0); + + EncryptHeaderCipherDetails validateDetails = headerRef.getCipherDetails(); + ASSERT(validateDetails.textCipherDetails.isValid() && + validateDetails.textCipherDetails == + BlobCipherDetails(cipherKey->getDomainId(), cipherKey->getBaseCipherId(), cipherKey->getSalt())); + ASSERT(validateDetails.headerCipherDetails.present() && validateDetails.headerCipherDetails.get().isValid() && + validateDetails.headerCipherDetails.get() == BlobCipherDetails(headerCipherKey->getDomainId(), + headerCipherKey->getBaseCipherId(), + headerCipherKey->getSalt())); + + Standalone serHeaderRef = BlobCipherEncryptHeaderRef::toStringRef(headerRef); BlobCipherEncryptHeaderRef validateHeader = BlobCipherEncryptHeaderRef::fromStringRef(serHeaderRef); BlobCipherEncryptHeaderFlagsV1 validateFlags = std::get(validateHeader.flags); ASSERT(validateFlags == flags); @@ -2105,7 +2209,7 @@ void testConfigurableEncryptionHeaderSingleAuthMode(int minDomainId) { ASSERT_EQ(memcmp(&iv[0], &validateAlgo.iv[0], AES_256_IV_LENGTH), 0); ASSERT_EQ(memcmp(&algoHeader.authToken[0], &validateAlgo.authToken[0], AuthTokenSize), 0); - TraceEvent("HmacShaHeaderSize") + TraceEvent("HeaderSize") .detail("Flags", sizeof(flags)) .detail("AlgoHeader", sizeof(algoHeader)) .detail("TotalHeader", serHeaderRef.size()); diff --git a/fdbclient/include/fdbclient/BlobCipher.h b/fdbclient/include/fdbclient/BlobCipher.h index 89a16f58e8..b1b8c231d4 100644 --- a/fdbclient/include/fdbclient/BlobCipher.h +++ b/fdbclient/include/fdbclient/BlobCipher.h @@ -174,6 +174,11 @@ struct BlobCipherDetails { } bool operator!=(const BlobCipherDetails& o) const { return !(*this == o); } + bool isValid() const { + return this->encryptDomainId != INVALID_ENCRYPT_DOMAIN_ID && + this->baseCipherId != INVALID_ENCRYPT_CIPHER_KEY_ID && this->salt != INVALID_ENCRYPT_RANDOM_SALT; + } + 
template void serialize(Ar& ar) { serializer(ar, encryptDomainId, baseCipherId, salt); @@ -333,6 +338,15 @@ struct AesCtrNoAuthV1 { } }; +struct EncryptHeaderCipherDetails { + BlobCipherDetails textCipherDetails; + Optional headerCipherDetails; + + EncryptHeaderCipherDetails(const BlobCipherDetails& tCipherDetails) : textCipherDetails(tCipherDetails) {} + EncryptHeaderCipherDetails(const BlobCipherDetails& tCipherDetails, const BlobCipherDetails& hCipherDetails) + : textCipherDetails(tCipherDetails), headerCipherDetails(hCipherDetails) {} +}; + struct BlobCipherEncryptHeaderRef { // Serializable fields @@ -460,6 +474,9 @@ struct BlobCipherEncryptHeaderRef { } } + const uint8_t* getIV() const; + const EncryptHeaderCipherDetails getCipherDetails() const; + void validateEncryptionHeaderDetails(const BlobCipherDetails& textCipherDetails, const BlobCipherDetails& headerCipherDetails, const StringRef& ivRef) const; diff --git a/fdbserver/workloads/EncryptionOps.actor.cpp b/fdbserver/workloads/EncryptionOps.actor.cpp index ff087c5894..c1a3e06f9f 100644 --- a/fdbserver/workloads/EncryptionOps.actor.cpp +++ b/fdbserver/workloads/EncryptionOps.actor.cpp @@ -315,6 +315,24 @@ struct EncryptionOpsWorkload : TestWorkload { auto end = std::chrono::high_resolution_clock::now(); // validate encrypted buffer size and contents (not matching with plaintext) + const uint8_t* headerIV = headerRef->getIV(); + ASSERT_EQ(memcmp(&headerIV[0], &iv[0], AES_256_IV_LENGTH), 0); + + EncryptHeaderCipherDetails validateDetails = headerRef->getCipherDetails(); + ASSERT(validateDetails.textCipherDetails.isValid() && + validateDetails.textCipherDetails == BlobCipherDetails(textCipherKey->getDomainId(), + textCipherKey->getBaseCipherId(), + textCipherKey->getSalt())); + if (authMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + ASSERT(!validateDetails.headerCipherDetails.present()); + } else { + ASSERT(validateDetails.headerCipherDetails.present() && + validateDetails.headerCipherDetails.get().isValid() 
&& + validateDetails.headerCipherDetails.get() == BlobCipherDetails(headerCipherKey->getDomainId(), + headerCipherKey->getBaseCipherId(), + headerCipherKey->getSalt())); + } + ASSERT_EQ(encrypted.size(), len); ASSERT_EQ(headerRef->flagsVersion, CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); ASSERT_NE(memcmp(encrypted.begin(), payload, len), 0); From 53f105eec5aa3766921b69c2196ffd9d787f3daa Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 14 Feb 2023 09:07:09 -0800 Subject: [PATCH 44/57] fix anyExisted when beginTenant==endTenant --- fdbserver/workloads/TenantManagementWorkload.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/workloads/TenantManagementWorkload.actor.cpp b/fdbserver/workloads/TenantManagementWorkload.actor.cpp index 0c687c6029..d838a653cb 100644 --- a/fdbserver/workloads/TenantManagementWorkload.actor.cpp +++ b/fdbserver/workloads/TenantManagementWorkload.actor.cpp @@ -800,6 +800,7 @@ struct TenantManagementWorkload : TestWorkload { if (!endTenant.present()) { tenants[beginTenant] = anyExists ? itr->second.tenant->id() : TenantInfo::INVALID_TENANT; } else if (endTenant.present()) { + anyExists = false; for (auto itr = self->createdTenants.lower_bound(beginTenant); itr != self->createdTenants.end() && itr->first < endTenant.get(); ++itr) { From d28f253182027aee464d8c86b05736d4d55b584c Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 14 Feb 2023 09:57:08 -0800 Subject: [PATCH 45/57] Save shard_encode_location_metadata knob value for restarting tests This is needed so that sharded rocks use consistent knob values. 
--- fdbserver/fdbserver.actor.cpp | 3 +++ fdbserver/workloads/SaveAndKill.actor.cpp | 1 + 2 files changed, 4 insertions(+) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 92516078db..a042b32280 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -2270,6 +2270,9 @@ int main(int argc, char* argv[]) { g_knobs.setKnob("encrypt_header_auth_token_algo", KnobValue::create((int)ini.GetLongValue( "META", "encryptHeaderAuthTokenAlgo", FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ALGO))); + g_knobs.setKnob( + "shard_encode_location_metadata", + KnobValue::create(ini.GetBoolValue("META", "enableShardEncodeLocationMetadata", false))); } setupAndRun(dataFolder, opts.testFile, opts.restarting, (isRestoring >= 1), opts.whitelistBinPaths); g_simulator->run(); diff --git a/fdbserver/workloads/SaveAndKill.actor.cpp b/fdbserver/workloads/SaveAndKill.actor.cpp index 9549c1fd25..ae2bb54362 100644 --- a/fdbserver/workloads/SaveAndKill.actor.cpp +++ b/fdbserver/workloads/SaveAndKill.actor.cpp @@ -84,6 +84,7 @@ struct SaveAndKillWorkload : TestWorkload { ini.SetBoolValue("META", "enableTLogEncryption", SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION); ini.SetBoolValue("META", "enableStorageServerEncryption", SERVER_KNOBS->ENABLE_STORAGE_SERVER_ENCRYPTION); ini.SetBoolValue("META", "enableBlobGranuleEncryption", SERVER_KNOBS->ENABLE_BLOB_GRANULE_ENCRYPTION); + ini.SetBoolValue("META", "enableShardEncodeLocationMetadata", SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA); ini.SetBoolValue("META", "encryptHeaderAuthTokenEnabled", FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED); ini.SetLongValue("META", "encryptHeaderAuthTokenAlgo", FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ALGO); From 99e8b95bf0ad822865fd8b75a1d0e2be5df19680 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 14 Feb 2023 10:36:29 -0800 Subject: [PATCH 46/57] Fix restarting to 7.2 tests for sharded rocks --- .../ConfigureStorageMigrationTestRestart-1.toml | 4 ---- 
tests/restarting/to_7.2.0_until_7.3.0/CycleTestRestart-1.toml | 4 ---- tests/restarting/to_7.3.0/CycleTestRestart-1.toml | 4 ---- 3 files changed, 12 deletions(-) diff --git a/tests/restarting/to_7.2.0_until_7.3.0/ConfigureStorageMigrationTestRestart-1.toml b/tests/restarting/to_7.2.0_until_7.3.0/ConfigureStorageMigrationTestRestart-1.toml index 3c35537b97..9e315bd45f 100644 --- a/tests/restarting/to_7.2.0_until_7.3.0/ConfigureStorageMigrationTestRestart-1.toml +++ b/tests/restarting/to_7.2.0_until_7.3.0/ConfigureStorageMigrationTestRestart-1.toml @@ -6,10 +6,6 @@ disableEncryption=true storageEngineExcludeTypes=[4] tenantModes=['disabled'] -[[knobs]] -# This can be removed once the lower bound of this downgrade test is a version that understands the new protocol -shard_encode_location_metadata = false - [[test]] testTitle = 'CloggedConfigureDatabaseTest' clearAfterTest = false diff --git a/tests/restarting/to_7.2.0_until_7.3.0/CycleTestRestart-1.toml b/tests/restarting/to_7.2.0_until_7.3.0/CycleTestRestart-1.toml index 8cd261a670..2d5877824c 100644 --- a/tests/restarting/to_7.2.0_until_7.3.0/CycleTestRestart-1.toml +++ b/tests/restarting/to_7.2.0_until_7.3.0/CycleTestRestart-1.toml @@ -5,10 +5,6 @@ disableHostname = true disableEncryption = true tenantModes=['disabled'] -[[knobs]] -# This can be removed once the lower bound of this downgrade test is a version that understands the new protocol -shard_encode_location_metadata = false - [[test]] testTitle = 'Clogged' clearAfterTest = false diff --git a/tests/restarting/to_7.3.0/CycleTestRestart-1.toml b/tests/restarting/to_7.3.0/CycleTestRestart-1.toml index 5970f9531a..98fc4f01ae 100644 --- a/tests/restarting/to_7.3.0/CycleTestRestart-1.toml +++ b/tests/restarting/to_7.3.0/CycleTestRestart-1.toml @@ -3,10 +3,6 @@ maxTLogVersion = 6 disableTss = true disableHostname = true -[[knobs]] -# This can be removed once the lower bound of this downgrade test is a version that understands the new protocol 
-shard_encode_location_metadata = false - [[test]] testTitle = 'Clogged' clearAfterTest = false From 7284e691fb998b4ccce68897df4fd699e32e3193 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 14 Feb 2023 12:28:55 -0800 Subject: [PATCH 47/57] Fix a few minor restore bugs and add a dry-run mode. Some improvements to the fdbcli output. --- fdbcli/MetaclusterCommands.actor.cpp | 162 ++++-- fdbcli/TenantCommands.actor.cpp | 7 +- fdbclient/Metacluster.cpp | 4 + fdbclient/Tenant.cpp | 4 + .../fdbclient/MetaclusterManagement.actor.h | 495 +++++++++++------- .../MetaclusterManagementWorkload.actor.cpp | 8 +- .../MetaclusterRestoreWorkload.actor.cpp | 62 ++- flow/include/flow/error_definitions.h | 1 + 8 files changed, 506 insertions(+), 237 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index 941a94cdb4..c7ef3e9aa6 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -179,9 +179,15 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector })); if (clusterType == ClusterType::METACLUSTER_DATA && !force) { - fmt::print("ERROR: cannot remove a data cluster directly. To remove a data cluster,\n" - "use the `remove' command on the management cluster. To force a data cluster\n" - "to forget its metacluster association without fully removing it, use FORCE.\n"); + if (tokens[2] == "FORCE"_sr) { + fmt::print("ERROR: a cluster name must be specified.\n"); + } else { + fmt::print("ERROR: cannot remove a data cluster directly. To remove a data cluster,\n" + "use the `remove' command on the management cluster. 
To force a data cluster\n" + "to forget its metacluster association without fully removing it, use FORCE.\n"); + } + + return false; } bool updatedDataCluster = @@ -189,9 +195,11 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); - fmt::print("WARNING: the data cluster could not be updated and still contains its\n" - "metacluster registration info. To finish removing it, FORCE remove the\n" - "data cluster directly."); + if (!updatedDataCluster) { + fmt::print("WARNING: the data cluster could not be updated and may still contains its\n" + "metacluster registration info. To finish removing it, FORCE remove the\n" + "data cluster directly.\n"); + } } else { ASSERT(updatedDataCluster); fmt::print("The cluster `{}' has removed its association with its metacluster.\n" @@ -201,34 +209,54 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector return true; } +void printRestoreUsage() { + fmt::print("Usage: metacluster restore [dryrun] connection_string=\n" + " [force_join_new_metacluster]\n\n"); + + fmt::print("Add a restored data cluster back to a metacluster.\n\n"); + + fmt::print("Use `dryrun' to report what changes a restore would make and whether any\n"); + fmt::print("failures would occur. Without `dryrun', the restore will modify the metacluster\n"); + fmt::print("with the changes required to perform the restore.\n\n"); + + fmt::print("Use `restore_known_data_cluster' to add back a restored copy of a data cluster\n"); + fmt::print("that the metacluster is already tracking. 
This mode should be used if only data\n"); + fmt::print("clusters are being restored, and any discrepancies between the management and\n"); + fmt::print("data clusters will be resolved using the management cluster metadata.\n"); + fmt::print("If `force_join_new_metacluster' is specified, the cluster will try to restore\n"); + fmt::print("to a different metacluster than it was originally registered to.\n\n"); + + fmt::print("Use `repopulate_from_data_cluster' to rebuild a lost management cluster from the\n"); + fmt::print("data clusters in a metacluster. This mode should be used if the management\n"); + fmt::print("cluster is being restored. If any data clusters are also being restored, the\n"); + fmt::print("oldest data clusters should be added first before any non-recovered data\n"); + fmt::print("clusters. Any conflicts arising between the added data cluster and existing data\n"); + fmt::print("will cause the restore to fail. Before repopulating a metacluster from a data\n"); + fmt::print("cluster, that data cluster needs to be detached from its prior metacluster using\n"); + fmt::print("the `metacluster remove' command.\n"); +} + // metacluster restore command ACTOR Future metaclusterRestoreCommand(Reference db, std::vector tokens) { - if (tokens.size() != 5) { - fmt::print("Usage: metacluster restore connection_string=\n" - "\n\n"); - - fmt::print("Add a restored data cluster back to a metacluster.\n\n"); - - fmt::print("Use `restore_known_data_cluster' to add back a restored copy of a data cluster\n"); - fmt::print("that the metacluster is already tracking. This mode should be used if only data\n"); - fmt::print("clusters are being restored, and any discrepancies between the management and\n"); - fmt::print("data clusters will be resolved using the management cluster metadata.\n\n"); - - fmt::print("Use `repopulate_from_data_cluster' to rebuild a lost management cluster from the\n"); - fmt::print("data clusters in a metacluster. 
This mode should be used if the management\n"); - fmt::print("cluster is being restored. If any data clusters are also being restored, the\n"); - fmt::print("oldest data clusters should be added first before any non-recovered data\n"); - fmt::print("clusters. Any conflicts arising between the added data cluster and existing data\n"); - fmt::print("will cause the restore to fail. Before repopulating a metacluster from a data\n"); - fmt::print("cluster, that data cluster needs to be detached from its prior metacluster using\n"); - fmt::print("the `metacluster remove' command.\n"); - + if (tokens.size() < 5 || tokens.size() > 7) { + printRestoreUsage(); return false; } + state bool dryRun = tokens[3] == "dryrun"_sr; + state bool forceJoin = tokens[tokens.size() - 1] == "force_join_new_metacluster"_sr; + + if (tokens.size() < 5 + (int)dryRun + (int)forceJoin) { + printRestoreUsage(); + return false; + } + + state ClusterName clusterName = tokens[2]; + state StringRef restoreType = tokens[tokens.size() - 1 - (int)forceJoin]; + // connection string DataClusterEntry defaultEntry; - auto config = parseClusterConfiguration(tokens, defaultEntry, 3, 4); + auto config = parseClusterConfiguration(tokens, defaultEntry, 3 + (int)dryRun, 3 + (int)dryRun + 1); if (!config.present()) { return false; } else if (!config.get().first.present()) { @@ -240,15 +268,24 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto state bool success = true; try { - if (tokens[4] == "restore_known_data_cluster"_sr) { - wait(MetaclusterAPI::restoreCluster( - db, tokens[2], config.get().first.get(), ApplyManagementClusterUpdates::True, &messages)); - - } else if (tokens[4] == "repopulate_from_data_cluster"_sr) { - wait(MetaclusterAPI::restoreCluster( - db, tokens[2], config.get().first.get(), ApplyManagementClusterUpdates::False, &messages)); + if (restoreType == "restore_known_data_cluster"_sr) { + wait(MetaclusterAPI::restoreCluster(db, + clusterName, + config.get().first.get(), + 
ApplyManagementClusterUpdates::True, + RestoreDryRun(dryRun), + ForceJoinNewMetacluster(forceJoin), + &messages)); + } else if (restoreType == "repopulate_from_data_cluster"_sr) { + wait(MetaclusterAPI::restoreCluster(db, + clusterName, + config.get().first.get(), + ApplyManagementClusterUpdates::False, + RestoreDryRun(dryRun), + ForceJoinNewMetacluster(forceJoin), + &messages)); } else { - fmt::print(stderr, "ERROR: unrecognized restore mode `{}'\n", printable(tokens[4])); + fmt::print(stderr, "ERROR: unrecognized restore mode `{}'\n", printable(restoreType)); success = false; } } catch (Error& e) { @@ -257,11 +294,7 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto } if (!messages.empty()) { - if (!success) { - fmt::print(stderr, "\n"); - } - - fmt::print(success ? stdout : stderr, "The restore reported the following messages:\n"); + fmt::print(success ? stdout : stderr, "\nThe restore reported the following messages:\n\n"); for (int i = 0; i < messages.size(); ++i) { fmt::print(success ? stdout : stderr, " {}. {}\n", i + 1, messages[i]); } @@ -272,7 +305,12 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto } if (success) { - fmt::print("The cluster `{}' has been restored\n", printable(tokens[2]).c_str()); + if (dryRun) { + fmt::print("The restore dry run completed successfully. 
To perform the restore, run the same command\n"); + fmt::print("without the `dryrun' argument.\n"); + } else { + fmt::print("The cluster `{}' has been restored\n", printable(clusterName).c_str()); + } } return success; @@ -376,6 +414,7 @@ ACTOR Future metaclusterGetCommand(Reference db, std::vector 1 && tokencmp(tokens[1], "restore")) { + if (tokens.size() == 3) { + const char* opts[] = { "dryrun", "connection_string=", nullptr }; + arrayGenerator(text, line, opts, lc); + } else { + bool dryrun = tokens[3] == "dryrun"_sr; + if (tokens.size() == 3 + (int)dryrun) { + const char* opts[] = { "connection_string=", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 4 + (int)dryrun) { + const char* opts[] = { "restore_known_data_cluster", "repopulate_from_data_cluster", nullptr }; + arrayGenerator(text, line, opts, lc); + } else if (tokens.size() == 5 + (int)dryrun) { + const char* opts[] = { "force_join_new_metacluster", nullptr }; + arrayGenerator(text, line, opts, lc); + } + } } } std::vector metaclusterHintGenerator(std::vector const& tokens, bool inArgument) { if (tokens.size() == 1) { return { "", "[ARGS]" }; - } else if (tokencmp(tokens[1], "create_experimental")) { - return { " " }; + } else if (tokencmp(tokens[1], "create_experimental") && tokens.size() < 4) { + static std::vector opts = { "", "" }; + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokencmp(tokens[1], "decommission")) { return {}; } else if (tokencmp(tokens[1], "register") && tokens.size() < 5) { @@ -559,11 +619,19 @@ std::vector metaclusterHintGenerator(std::vector const& } else { return {}; } - } else if (tokencmp(tokens[1], "restore") && tokens.size() < 5) { + } else if (tokencmp(tokens[1], "restore") && tokens.size() < 7) { static std::vector opts = { "", - "connection_string= ", - "" }; - return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + "[dryrun]", + "connection_string=", + "", + "[force_join_new_metacluster]" }; + 
if (tokens.size() < 4 || (tokens[3].size() <= 6 && "dryrun"_sr.startsWith(tokens[3]))) { + return std::vector(opts.begin() + tokens.size() - 2, opts.end()); + } else if (tokens.size() < 6) { + return std::vector(opts.begin() + tokens.size() - 1, opts.end()); + } else { + return {}; + } } else if (tokencmp(tokens[1], "configure")) { static std::vector opts = { "", "|connection_string=>" diff --git a/fdbcli/TenantCommands.actor.cpp b/fdbcli/TenantCommands.actor.cpp index b4577e6fc5..587f8d1e76 100644 --- a/fdbcli/TenantCommands.actor.cpp +++ b/fdbcli/TenantCommands.actor.cpp @@ -463,13 +463,15 @@ ACTOR Future tenantGetCommand(Reference db, std::vector tenantGetCommand(Reference db, std::vectormetaclusterRegistration.present() && !self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { - throw invalid_metacluster_operation(); + throw metacluster_mismatch(); } // If a cluster was specified, check that the cluster metadata is present. If so, load it and store @@ -341,11 +344,13 @@ struct MetaclusterOperationContext { static Future()(Reference()).getValue())> runDataClusterTransaction(MetaclusterOperationContext* self, Function func, - RunOnDisconnectedCluster runOnDisconnectedCluster) { + RunOnDisconnectedCluster runOnDisconnectedCluster, + RunOnMismatchedCluster runOnMismatchedCluster) { ASSERT(self->dataClusterDb); - ASSERT(self->dataClusterMetadata.present()); + ASSERT(runOnDisconnectedCluster || self->dataClusterMetadata.present()); ASSERT(self->metaclusterRegistration.present() && - self->metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA); + (runOnDisconnectedCluster || + self->metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA)); self->checkClusterState(); @@ -365,7 +370,9 @@ struct MetaclusterOperationContext { } else if (currentMetaclusterRegistration.get().clusterType != ClusterType::METACLUSTER_DATA) { throw cluster_removed(); } else if 
(!self->metaclusterRegistration.get().matches(currentMetaclusterRegistration.get())) { - throw cluster_removed(); + if (!runOnMismatchedCluster) { + throw cluster_removed(); + } } self->dataClusterIsRegistered = currentMetaclusterRegistration.present(); @@ -383,8 +390,9 @@ struct MetaclusterOperationContext { template Future()(Reference()).getValue())> runDataClusterTransaction(Function func, - RunOnDisconnectedCluster runOnDisconnectedCluster = RunOnDisconnectedCluster::False) { - return runDataClusterTransaction(this, func, runOnDisconnectedCluster); + RunOnDisconnectedCluster runOnDisconnectedCluster = RunOnDisconnectedCluster::False, + RunOnMismatchedCluster runOnMismatchedCluster = RunOnMismatchedCluster::False) { + return runDataClusterTransaction(this, func, runOnDisconnectedCluster, runOnMismatchedCluster); } ACTOR static Future updateClusterName(MetaclusterOperationContext* self, @@ -649,13 +657,14 @@ ACTOR template static Future registerInManagementCluster(Transaction tr, ClusterName clusterName, DataClusterEntry clusterEntry, - ClusterConnectionString connectionString) { + ClusterConnectionString connectionString, + RestoreDryRun restoreDryRun) { state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, clusterName)); if (dataClusterMetadata.present() && !dataClusterMetadata.get().matchesConfiguration(DataClusterMetadata(clusterEntry, connectionString))) { TraceEvent("RegisterClusterAlreadyExists").detail("ClusterName", clusterName); throw cluster_already_exists(); - } else if (!dataClusterMetadata.present()) { + } else if (!restoreDryRun && !dataClusterMetadata.present()) { clusterEntry.allocated = ClusterUsage(); if (clusterEntry.hasCapacity()) { @@ -664,14 +673,14 @@ static Future registerInManagementCluster(Transaction tr, } ManagementClusterMetadata::dataClusters().set(tr, clusterName, clusterEntry); ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, clusterName, connectionString); - } - TraceEvent("RegisteredDataCluster") 
- .detail("ClusterName", clusterName) - .detail("ClusterID", clusterEntry.id) - .detail("Capacity", clusterEntry.capacity) - .detail("Version", tr->getCommittedVersion()) - .detail("ConnectionString", connectionString.toString()); + TraceEvent("RegisteredDataCluster") + .detail("ClusterName", clusterName) + .detail("ClusterID", clusterEntry.id) + .detail("Capacity", clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", connectionString.toString()); + } return Void(); } @@ -869,8 +878,10 @@ struct RemoveClusterImpl { ClusterType clusterType, bool forceRemove, double dataClusterTimeout) - : ctx(db, Optional(), { DataClusterState::REMOVING, DataClusterState::RESTORING }), db(db), - clusterType(clusterType), clusterName(clusterName), forceRemove(forceRemove), + : ctx(db, + Optional(), + { DataClusterState::REGISTERING, DataClusterState::REMOVING, DataClusterState::RESTORING }), + db(db), clusterType(clusterType), clusterName(clusterName), forceRemove(forceRemove), dataClusterTimeout(dataClusterTimeout) {} // Returns false if the cluster is no longer present, or true if it is present and the removal should proceed. 
@@ -1098,7 +1109,7 @@ struct RemoveClusterImpl { .detail("ExpectedName", self->clusterName) .detail("MetaclusterRegistration", metaclusterRegistrationEntry.map(&MetaclusterRegistrationEntry::toString)); - throw invalid_metacluster_operation(); + throw metacluster_mismatch(); } wait(updateDataCluster(self, tr, metaclusterRegistrationEntry.get().id)); @@ -1318,6 +1329,8 @@ struct RestoreClusterImpl { ClusterName clusterName; ClusterConnectionString connectionString; ApplyManagementClusterUpdates applyManagementClusterUpdates; + RestoreDryRun restoreDryRun; + ForceJoinNewMetacluster forceJoinNewMetacluster; std::vector& messages; // Loaded from the data cluster @@ -1333,32 +1346,57 @@ struct RestoreClusterImpl { ClusterName clusterName, ClusterConnectionString connectionString, ApplyManagementClusterUpdates applyManagementClusterUpdates, + RestoreDryRun restoreDryRun, + ForceJoinNewMetacluster forceJoinNewMetacluster, std::vector& messages) : ctx(managementDb, {}, { DataClusterState::RESTORING }), clusterName(clusterName), connectionString(connectionString), applyManagementClusterUpdates(applyManagementClusterUpdates), - messages(messages) {} + restoreDryRun(restoreDryRun), forceJoinNewMetacluster(forceJoinNewMetacluster), messages(messages) {} // If restoring a data cluster, verify that it has a matching registration entry ACTOR static Future loadDataClusterRegistration(RestoreClusterImpl* self) { state Reference db = wait(openDatabase(self->connectionString)); + state Reference tr = db->createTransaction(); - Optional metaclusterRegistration = - wait(MetaclusterMetadata::metaclusterRegistration().get(db)); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Optional metaclusterRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); - if (!metaclusterRegistration.present()) { - throw invalid_data_cluster(); - } else if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get()) || - 
metaclusterRegistration.get().name != self->clusterName) { - TraceEvent(SevWarn, "MetaclusterRestoreClusterMismatch") - .detail("ExistingRegistration", metaclusterRegistration.get()) - .detail("ManagementClusterRegistration", self->ctx.metaclusterRegistration.get()); - throw cluster_already_exists(); + if (!metaclusterRegistration.present()) { + throw invalid_data_cluster(); + } else if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { + if (!self->forceJoinNewMetacluster) { + TraceEvent(SevWarn, "MetaclusterRestoreClusterMismatch") + .detail("ExistingRegistration", metaclusterRegistration.get()) + .detail("ManagementClusterRegistration", self->ctx.metaclusterRegistration.get()); + throw cluster_already_registered(); + } else if (!self->restoreDryRun) { + ASSERT(self->ctx.metaclusterRegistration.get().clusterType == ClusterType::METACLUSTER_DATA); + MetaclusterMetadata::metaclusterRegistration().set(tr, self->ctx.metaclusterRegistration.get()); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + } else { + self->messages.push_back(fmt::format("Move data cluster to new metacluster\n" + " original: {}\n" + " updated: {}", + metaclusterRegistration.get().toString(), + self->ctx.metaclusterRegistration.get().toString())); + } + } else if (metaclusterRegistration.get().name != self->clusterName) { + TraceEvent(SevWarn, "MetaclusterRestoreClusterNameMismatch") + .detail("ExistingName", metaclusterRegistration.get().name) + .detail("ManagementClusterRegistration", self->clusterName); + throw cluster_already_registered(); + } + + self->dataClusterId = metaclusterRegistration.get().id; + self->ctx.dataClusterDb = db; + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } } - - self->dataClusterId = metaclusterRegistration.get().id; - self->ctx.dataClusterDb = db; - - return Void(); } // If adding a data cluster to a restored management cluster, write a metacluster registration entry @@ -1387,8 
+1425,11 @@ struct RestoreClusterImpl { throw cluster_already_registered(); } - MetaclusterMetadata::metaclusterRegistration().set(tr, dataClusterEntry); - wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + if (!self->restoreDryRun) { + MetaclusterMetadata::metaclusterRegistration().set(tr, dataClusterEntry); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + } + break; } catch (Error& e) { wait(safeThreadFutureToFuture(tr->onError(e))); @@ -1403,15 +1444,14 @@ struct RestoreClusterImpl { DataClusterEntry updatedEntry = ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::RESTORING; - updateClusterMetadata( - tr, ctx.clusterName.get(), ctx.dataClusterMetadata.get(), connectionString, updatedEntry); + updateClusterMetadata(tr, clusterName, ctx.dataClusterMetadata.get(), connectionString, updatedEntry); // Remove this cluster from the cluster capacity index, but leave its configured capacity intact in the // cluster entry. This allows us to retain the configured capacity while preventing the cluster from // being used to allocate new tenant groups. DataClusterEntry noCapacityEntry = updatedEntry; noCapacityEntry.capacity.numTenantGroups = 0; - updateClusterCapacityIndex(tr, ctx.clusterName.get(), updatedEntry, noCapacityEntry); + updateClusterCapacityIndex(tr, clusterName, updatedEntry, noCapacityEntry); } TraceEvent("MarkedDataClusterRestoring").detail("Name", clusterName); @@ -1422,17 +1462,15 @@ struct RestoreClusterImpl { DataClusterEntry updatedEntry = ctx.dataClusterMetadata.get().entry; updatedEntry.clusterState = DataClusterState::READY; - updateClusterMetadata(tr, ctx.clusterName.get(), ctx.dataClusterMetadata.get(), {}, updatedEntry); + updateClusterMetadata(tr, clusterName, ctx.dataClusterMetadata.get(), {}, updatedEntry); // Add this cluster back to the cluster capacity index so that it can be assigned to again. 
DataClusterEntry noCapacityEntry = updatedEntry; noCapacityEntry.capacity.numTenantGroups = 0; - updateClusterCapacityIndex(tr, ctx.clusterName.get(), noCapacityEntry, updatedEntry); + updateClusterCapacityIndex(tr, clusterName, noCapacityEntry, updatedEntry); } - TraceEvent("MarkedDataClusterReady") - .detail("Name", ctx.clusterName.get()) - .detail("Version", tr->getCommittedVersion()); + TraceEvent("MarkedDataClusterReady").detail("Name", clusterName).detail("Version", tr->getCommittedVersion()); } ACTOR static Future markManagementTenantsAsError(RestoreClusterImpl* self, @@ -1479,7 +1517,7 @@ struct RestoreClusterImpl { for (auto const& t : tenants.results) { self->mgmtClusterTenantMap.emplace(t.first, t.second); - if (t.second.assignedCluster.present() && self->ctx.clusterName.get() == t.second.assignedCluster.get()) { + if (t.second.assignedCluster.present() && self->clusterName == t.second.assignedCluster.get()) { self->mgmtClusterTenantSetForCurrentDataCluster.emplace(t.first); } } @@ -1582,10 +1620,24 @@ struct RestoreClusterImpl { // A data cluster tenant is not present on the management cluster if (managementEntry == self->mgmtClusterTenantMap.end() || - managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get()) { - wait(self->ctx.runDataClusterTransaction([tenantEntry = tenantEntry](Reference tr) { - return TenantAPI::deleteTenantTransaction(tr, tenantEntry.id, ClusterType::METACLUSTER_DATA); - })); + managementEntry->second.assignedCluster.get() != self->clusterName) { + if (self->restoreDryRun) { + if (managementEntry == self->mgmtClusterTenantMap.end()) { + self->messages.push_back(fmt::format("Delete missing tenant `{}' with ID {} on data cluster", + printable(tenantEntry.tenantName), + tenantEntry.id)); + } else { + self->messages.push_back(fmt::format( + "Delete tenant `{}' with ID {} on data cluster because it is now located on the cluster `{}'", + printable(tenantEntry.tenantName), + tenantEntry.id, + 
printable(managementEntry->second.assignedCluster))); + } + } else { + wait(self->ctx.runDataClusterTransaction([tenantEntry = tenantEntry](Reference tr) { + return TenantAPI::deleteTenantTransaction(tr, tenantEntry.id, ClusterType::METACLUSTER_DATA); + })); + } return Optional>(); } else { @@ -1593,38 +1645,66 @@ struct RestoreClusterImpl { state TenantMapEntry managementTenant = managementEntry->second; // Rename - if (tenantName != managementTenant.tenantName) { + state bool renamed = tenantName != managementTenant.tenantName; + if (renamed) { state TenantName temporaryName; - if (self->dataClusterTenantNames.count(managementTenant.tenantName) > 0) { + state bool usingTemporaryName = self->dataClusterTenantNames.count(managementTenant.tenantName) > 0; + if (usingTemporaryName) { temporaryName = metaclusterTemporaryRenamePrefix.withSuffix(managementTenant.tenantName); } else { temporaryName = managementTenant.tenantName; } - wait(self->ctx.runDataClusterTransaction([self = self, - tenantName = tenantName, - temporaryName = temporaryName, - tenantEntry = tenantEntry, - managementTenant = - managementTenant](Reference tr) { - return renameTenant( - self, tr, tenantEntry.id, tenantName, temporaryName, managementTenant.configurationSequenceNum); - })); + if (self->restoreDryRun) { + self->messages.push_back(fmt::format("Rename tenant `{}' with ID {} to `{}' on data cluster{}", + printable(tenantEntry.tenantName), + tenantEntry.id, + printable(managementTenant.tenantName), + usingTemporaryName ? 
" via temporary name" : "")); + } else { + wait(self->ctx.runDataClusterTransaction( + [self = self, + tenantName = tenantName, + temporaryName = temporaryName, + tenantEntry = tenantEntry, + managementTenant = managementTenant](Reference tr) { + return renameTenant(self, + tr, + tenantEntry.id, + tenantName, + temporaryName, + managementTenant.configurationSequenceNum); + })); + // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the RENAMING + // state + } tenantName = temporaryName; - // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the RENAMING - // state } // Update configuration - if (!managementTenant.matchesConfiguration(tenantEntry) || + bool configurationChanged = !managementTenant.matchesConfiguration(tenantEntry); + if (configurationChanged || managementTenant.configurationSequenceNum != tenantEntry.configurationSequenceNum) { ASSERT(managementTenant.configurationSequenceNum >= tenantEntry.configurationSequenceNum); - wait(self->ctx.runDataClusterTransaction( - [self = self, managementTenant = managementTenant](Reference tr) { - return updateTenantConfiguration(self, tr, managementTenant.id, managementTenant); - })); - // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the - // UPDATING_CONFIGURATION state + if (self->restoreDryRun) { + // If this is an update to the internal sequence number only and we are also renaming the tenant, + // we don't need to report anything. The internal metadata update is (at least partially) caused + // by the rename in that case + if (configurationChanged || !renamed) { + self->messages.push_back( + fmt::format("Update tenant configuration for tenant `{}' with ID {} on data cluster{}", + printable(tenantEntry.tenantName), + tenantEntry.id, + configurationChanged ? 
"" : " (internal metadata only)")); + } + } else { + wait(self->ctx.runDataClusterTransaction( + [self = self, managementTenant = managementTenant](Reference tr) { + return updateTenantConfiguration(self, tr, managementTenant.id, managementTenant); + })); + // SOMEDAY: we could mark the tenant in the management cluster as READY if it is in the + // UPDATING_CONFIGURATION state + } } return std::make_pair(tenantName, managementTenant); @@ -1642,22 +1722,79 @@ struct RestoreClusterImpl { ++itr; } - state std::unordered_map::iterator renameItr = partiallyRenamedTenants.begin(); - while (renameItr != partiallyRenamedTenants.end()) { - wait(self->ctx.runDataClusterTransaction([self = self, renameItr = renameItr](Reference tr) { - return renameTenant(self, - tr, - renameItr->second.id, - renameItr->first, - renameItr->first.removePrefix(metaclusterTemporaryRenamePrefix), - renameItr->second.configurationSequenceNum); - })); - ++renameItr; + if (!self->restoreDryRun) { + state std::unordered_map::iterator renameItr = partiallyRenamedTenants.begin(); + while (renameItr != partiallyRenamedTenants.end()) { + wait(self->ctx.runDataClusterTransaction( + [self = self, renameItr = renameItr](Reference tr) { + return renameTenant(self, + tr, + renameItr->second.id, + renameItr->first, + renameItr->first.removePrefix(metaclusterTemporaryRenamePrefix), + renameItr->second.configurationSequenceNum); + })); + ++renameItr; + } } return Void(); } + ACTOR static Future processMissingTenants(RestoreClusterImpl* self) { + state std::unordered_set::iterator setItr = self->mgmtClusterTenantSetForCurrentDataCluster.begin(); + state std::vector missingTenants; + state int64_t missingTenantCount = 0; + while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { + int64_t tenantId = *setItr; + TenantMapEntry const& managementTenant = self->mgmtClusterTenantMap[tenantId]; + + // If a tenant is present on the management cluster and not on the data cluster, mark it in an error + // 
state unless it is already in certain states (e.g. REGISTERING, REMOVING) that allow the tenant to be + // missing on the data cluster + // + // SOMEDAY: this could optionally complete the partial operations (e.g. finish creating or removing the + // tenant) + if (self->dataClusterTenantMap.find(tenantId) == self->dataClusterTenantMap.end() && + managementTenant.tenantState != TenantState::REGISTERING && + managementTenant.tenantState != TenantState::REMOVING && + managementTenant.tenantState != TenantState::ERROR) { + if (self->restoreDryRun) { + self->messages.push_back(fmt::format("The tenant `{}' with ID {} is missing on the data cluster", + printable(managementTenant.tenantName), + tenantId)); + } else { + missingTenants.push_back(tenantId); + ++missingTenantCount; + if (missingTenants.size() == CLIENT_KNOBS->METACLUSTER_RESTORE_BATCH_SIZE) { + wait(self->ctx.runManagementTransaction( + [self = self, missingTenants = missingTenants](Reference tr) { + return markManagementTenantsAsError(self, tr, missingTenants); + })); + missingTenants.clear(); + } + } + } + ++setItr; + } + + if (!self->restoreDryRun && missingTenants.size() > 0) { + wait(self->ctx.runManagementTransaction( + [self = self, missingTenants = missingTenants](Reference tr) { + return markManagementTenantsAsError(self, tr, missingTenants); + })); + } + + // This is a best effort attempt to communicate the number of missing tenants. If a restore needs to be run + // twice and is interrupted in the middle of the first attempt to process missing tenants, we may not report + // a full count. 
+ if (missingTenantCount > 0) { + self->messages.push_back(fmt::format( + "The metacluster has {} tenants that are missing in the restored data cluster", missingTenantCount)); + } + return Void(); + } + // Returns true if the group needs to be created ACTOR static Future addTenantToManagementCluster(RestoreClusterImpl* self, Reference tr, @@ -1670,7 +1807,7 @@ struct RestoreClusterImpl { Optional existingEntry = wait(tryGetTenantTransaction(tr, tenantEntry.tenantName)); if (existingEntry.present()) { - if (existingEntry.get().assignedCluster == self->ctx.clusterName) { + if (existingEntry.get().assignedCluster == self->clusterName) { ASSERT(existingEntry.get().matchesConfiguration(tenantEntry)); // This is a retry, so return success return false; @@ -1682,22 +1819,24 @@ struct RestoreClusterImpl { } } - tenantEntry.tenantState = TenantState::READY; - tenantEntry.assignedCluster = self->ctx.clusterName; - ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantEntry.id, tenantEntry); - ManagementClusterMetadata::tenantMetadata().tenantNameIndex.set(tr, tenantEntry.tenantName, tenantEntry.id); + if (!self->restoreDryRun) { + tenantEntry.tenantState = TenantState::READY; + tenantEntry.assignedCluster = self->clusterName; + ManagementClusterMetadata::tenantMetadata().tenantMap.set(tr, tenantEntry.id, tenantEntry); + ManagementClusterMetadata::tenantMetadata().tenantNameIndex.set(tr, tenantEntry.tenantName, tenantEntry.id); - ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); - ManagementClusterMetadata::clusterTenantCount.atomicOp( - tr, tenantEntry.assignedCluster.get(), 1, MutationRef::AddValue); + ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp(tr, 1, MutationRef::AddValue); + ManagementClusterMetadata::clusterTenantCount.atomicOp( + tr, tenantEntry.assignedCluster.get(), 1, MutationRef::AddValue); - // Updated indexes to include the new tenant - 
ManagementClusterMetadata::clusterTenantIndex.insert( - tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantName, tenantEntry.id)); + // Updated indexes to include the new tenant + ManagementClusterMetadata::clusterTenantIndex.insert( + tr, Tuple::makeTuple(tenantEntry.assignedCluster.get(), tenantEntry.tenantName, tenantEntry.id)); + } wait(success(tenantGroupEntry)); - if (tenantGroupEntry.get().present() && tenantGroupEntry.get().get().assignedCluster != self->ctx.clusterName) { + if (tenantGroupEntry.get().present() && tenantGroupEntry.get().get().assignedCluster != self->clusterName) { self->messages.push_back( fmt::format("The tenant `{}' is part of a tenant group `{}' that already exists on cluster `{}'", printable(tenantEntry.tenantName), @@ -1706,11 +1845,13 @@ struct RestoreClusterImpl { throw invalid_tenant_configuration(); } - managementClusterAddTenantToGroup(tr, - tenantEntry, - &self->ctx.dataClusterMetadata.get(), - GroupAlreadyExists(tenantGroupEntry.get().present()), - IsRestoring::True); + if (!self->restoreDryRun) { + managementClusterAddTenantToGroup(tr, + tenantEntry, + &self->ctx.dataClusterMetadata.get(), + GroupAlreadyExists(tenantGroupEntry.get().present()), + IsRestoring::True); + } return !tenantGroupEntry.get().present(); } @@ -1746,39 +1887,43 @@ struct RestoreClusterImpl { numGroupsCreated += groupsCreated.size(); - if (numGroupsCreated > 0) { - state DataClusterMetadata clusterMetadata = wait(getClusterTransaction(tr, self->ctx.clusterName.get())); + if (!self->restoreDryRun) { + if (numGroupsCreated > 0) { + state DataClusterMetadata clusterMetadata = wait(getClusterTransaction(tr, self->clusterName)); - DataClusterEntry updatedEntry = clusterMetadata.entry; - updatedEntry.allocated.numTenantGroups += numGroupsCreated; - updateClusterMetadata(tr, - self->ctx.clusterName.get(), - clusterMetadata, - Optional(), - updatedEntry, - IsRestoring::True); + DataClusterEntry updatedEntry = clusterMetadata.entry; + 
updatedEntry.allocated.numTenantGroups += numGroupsCreated; + updateClusterMetadata(tr, + self->clusterName, + clusterMetadata, + Optional(), + updatedEntry, + IsRestoring::True); + } + + int64_t lastTenantId = + wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.getD(tr, Snapshot::False, 0)); + + ManagementClusterMetadata::tenantMetadata().lastTenantId.set(tr, std::max(lastTenantId, maxId)); + ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); } - int64_t lastTenantId = - wait(ManagementClusterMetadata::tenantMetadata().lastTenantId.getD(tr, Snapshot::False, 0)); - - ManagementClusterMetadata::tenantMetadata().lastTenantId.set(tr, std::max(lastTenantId, maxId)); - ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); - return Void(); } ACTOR static Future addTenantsToManagementCluster(RestoreClusterImpl* self) { state std::unordered_map::iterator itr; state std::vector tenantBatch; + state int64_t tenantsToAdd = 0; for (itr = self->dataClusterTenantMap.begin(); itr != self->dataClusterTenantMap.end(); ++itr) { state std::unordered_map::iterator managementEntry = self->mgmtClusterTenantMap.find(itr->second.id); if (managementEntry == self->mgmtClusterTenantMap.end()) { + ++tenantsToAdd; tenantBatch.push_back(itr->second); } else if (managementEntry->second.tenantName != itr->second.tenantName || - managementEntry->second.assignedCluster.get() != self->ctx.clusterName.get() || + managementEntry->second.assignedCluster.get() != self->clusterName || !managementEntry->second.matchesConfiguration(itr->second)) { self->messages.push_back( fmt::format("The tenant `{}' has the same ID {} as an existing tenant `{}' on cluster `{}'", @@ -1807,54 +1952,13 @@ struct RestoreClusterImpl { })); } - return Void(); - } - - ACTOR static Future processMissingTenants(RestoreClusterImpl* self) { - state std::unordered_set::iterator setItr = 
self->mgmtClusterTenantSetForCurrentDataCluster.begin(); - state std::vector missingTenants; - state int64_t missingTenantCount = 0; - while (setItr != self->mgmtClusterTenantSetForCurrentDataCluster.end()) { - int64_t tenantId = *setItr; - TenantMapEntry const& managementTenant = self->mgmtClusterTenantMap[tenantId]; - - // If a tenant is present on the management cluster and not on the data cluster, mark it in an error - // state unless it is already in certain states (e.g. REGISTERING, REMOVING) that allow the tenant to be - // missing on the data cluster - // - // SOMEDAY: this could optionally complete the partial operations (e.g. finish creating or removing the - // tenant) - if (self->dataClusterTenantMap.find(tenantId) == self->dataClusterTenantMap.end() && - managementTenant.tenantState != TenantState::REGISTERING && - managementTenant.tenantState != TenantState::REMOVING && - managementTenant.tenantState != TenantState::ERROR) { - missingTenants.push_back(tenantId); - ++missingTenantCount; - if (missingTenants.size() == CLIENT_KNOBS->METACLUSTER_RESTORE_BATCH_SIZE) { - wait(self->ctx.runManagementTransaction( - [self = self, missingTenants = missingTenants](Reference tr) { - return markManagementTenantsAsError(self, tr, missingTenants); - })); - missingTenants.clear(); - } - } - ++setItr; + if (self->restoreDryRun) { + self->messages.push_back( + fmt::format("Restore will add {} tenant(s) to the management cluster from the data cluster `{}'", + tenantsToAdd, + printable(self->clusterName))); } - if (missingTenants.size() > 0) { - wait(self->ctx.runManagementTransaction( - [self = self, missingTenants = missingTenants](Reference tr) { - return markManagementTenantsAsError(self, tr, missingTenants); - })); - } - - // This is a best effort attempt to communicate the number of missing tenants. If a restore needs to be run - // twice and is interrupted in the middle of the first attempt to process missing tenants, we may not report - // a full count. 
- if (missingTenantCount > 0) { - self->messages.push_back(fmt::format( - "The metacluster has {} tenants that are missing in the restored data cluster.", missingTenantCount)); - } return Void(); } @@ -1869,16 +1973,18 @@ struct RestoreClusterImpl { wait(loadDataClusterRegistration(self)); // set state to restoring - try { - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - self->markClusterRestoring(tr); - return Future(Void()); - })); - } catch (Error& e) { - // If the transaction retries after success or if we are trying a second time to restore the cluster, it - // will throw an error indicating that the restore has already started - if (e.code() != error_code_cluster_restoring) { - throw; + if (!self->restoreDryRun) { + try { + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + self->markClusterRestoring(tr); + return Future(Void()); + })); + } catch (Error& e) { + // If the transaction retries after success or if we are trying a second time to restore the cluster, it + // will throw an error indicating that the restore has already started + if (e.code() != error_code_cluster_restoring) { + throw; + } } } @@ -1887,7 +1993,9 @@ struct RestoreClusterImpl { // get all the tenant information from the newly registered data cluster wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); + [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); }, + RunOnDisconnectedCluster::False, + RunOnMismatchedCluster(self->restoreDryRun && self->forceJoinNewMetacluster))); // Fix any differences between the data cluster and the management cluster wait(reconcileTenants(self)); @@ -1896,10 +2004,12 @@ struct RestoreClusterImpl { wait(processMissingTenants(self)); // set restored cluster to ready state - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - self->markClusterAsReady(tr); - return Future(Void()); - })); + if 
(!self->restoreDryRun) { + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + self->markClusterAsReady(tr); + return Future(Void()); + })); + } return Void(); } @@ -1912,31 +2022,41 @@ struct RestoreClusterImpl { DataClusterEntry entry; entry.id = self->dataClusterId; entry.clusterState = DataClusterState::RESTORING; - return registerInManagementCluster(tr, self->clusterName, entry, self->connectionString); + return registerInManagementCluster( + tr, self->clusterName, entry, self->connectionString, self->restoreDryRun); })); // Write a metacluster registration entry in the data cluster wait(writeDataClusterRegistration(self)); - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - return self->ctx.setCluster(tr, self->clusterName); - })); + if (!self->restoreDryRun) { + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + return self->ctx.setCluster(tr, self->clusterName); + })); + } // get all the tenants in the metacluster wait(getAllTenantsFromManagementCluster(self)); + if (self->restoreDryRun) { + wait(store(self->ctx.dataClusterDb, openDatabase(self->connectionString))); + } + // get all the tenant information from the newly registered data cluster wait(self->ctx.runDataClusterTransaction( - [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); })); + [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); }, + RunOnDisconnectedCluster(self->restoreDryRun))); // Add all tenants from the data cluster to the management cluster wait(addTenantsToManagementCluster(self)); // set restored cluster to ready state - wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - self->markClusterAsReady(tr); - return Future(Void()); - })); + if (!self->restoreDryRun) { + wait(self->ctx.runManagementTransaction([self = self](Reference tr) { + self->markClusterAsReady(tr); + return Future(Void()); + })); + } return Void(); } @@ -1955,8 +2075,11 @@ Future 
restoreCluster(Reference db, ClusterName name, ClusterConnectionString connectionString, ApplyManagementClusterUpdates applyManagementClusterUpdates, + RestoreDryRun restoreDryRun, + ForceJoinNewMetacluster forceJoinNewMetacluster, std::vector* messages) { - state RestoreClusterImpl impl(db, name, connectionString, applyManagementClusterUpdates, *messages); + state RestoreClusterImpl impl( + db, name, connectionString, applyManagementClusterUpdates, restoreDryRun, forceJoinNewMetacluster, *messages); wait(impl.run()); return Void(); } diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 8b38701003..4551d03b95 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -275,6 +275,8 @@ struct MetaclusterManagementWorkload : TestWorkload { ACTOR static Future restoreCluster(MetaclusterManagementWorkload* self) { state ClusterName clusterName = self->chooseClusterName(); state DataClusterData* dataDb = &self->dataDbs[clusterName]; + state bool dryRun = deterministicRandom()->coinflip(); + state bool forceJoin = deterministicRandom()->coinflip(); state std::vector messages; try { @@ -284,6 +286,8 @@ struct MetaclusterManagementWorkload : TestWorkload { clusterName, dataDb->db->getConnectionRecord()->getConnectionString(), ApplyManagementClusterUpdates::True, + RestoreDryRun(dryRun), + ForceJoinNewMetacluster(forceJoin), &messages); Optional result = wait(timeout(restoreFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { @@ -292,7 +296,9 @@ struct MetaclusterManagementWorkload : TestWorkload { } ASSERT(dataDb->registered); - dataDb->detached = false; + if (!dryRun) { + dataDb->detached = false; + } } catch (Error& e) { if (e.code() == error_code_cluster_not_found) { ASSERT(!dataDb->registered); diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp 
b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index 0b431b5bf0..aca0212428 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -238,6 +238,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { Database dataDb, std::string backupUrl, bool addToMetacluster, + ForceJoinNewMetacluster forceJoinNewMetacluster, MetaclusterRestoreWorkload* self) { state FileBackupAgent backupAgent; state Standalone> backupRanges; @@ -259,10 +260,27 @@ struct MetaclusterRestoreWorkload : TestWorkload { state std::vector messages; if (addToMetacluster) { TraceEvent("MetaclusterRestoreWorkloadAddClusterToMetacluster").detail("ClusterName", clusterName); + if (deterministicRandom()->coinflip()) { + TraceEvent("MetaclusterRestoreWorkloadAddClusterToMetaclusterDryRun") + .detail("ClusterName", clusterName); + wait(MetaclusterAPI::restoreCluster(self->managementDb, + clusterName, + dataDb->getConnectionRecord()->getConnectionString(), + ApplyManagementClusterUpdates::True, + RestoreDryRun::True, + forceJoinNewMetacluster, + &messages)); + TraceEvent("MetaclusterRestoreWorkloadAddClusterToMetaclusterDryRunDone") + .detail("ClusterName", clusterName); + messages.clear(); + } + wait(MetaclusterAPI::restoreCluster(self->managementDb, clusterName, dataDb->getConnectionRecord()->getConnectionString(), ApplyManagementClusterUpdates::True, + RestoreDryRun::False, + forceJoinNewMetacluster, &messages)); TraceEvent("MetaclusterRestoreWorkloadRestoreComplete").detail("ClusterName", clusterName); } @@ -497,11 +515,34 @@ struct MetaclusterRestoreWorkload : TestWorkload { .detail("FromCluster", clusterItr->first) .detail("TenantCollisions", collisions.first.size()); + if (deterministicRandom()->coinflip()) { + TraceEvent("MetaclusterRestoreWorkloadRestoreManagementClusterDryRun") + .detail("FromCluster", clusterItr->first) + .detail("TenantCollisions", collisions.first.size()); + + 
wait(MetaclusterAPI::restoreCluster( + self->managementDb, + clusterItr->first, + clusterItr->second.db->getConnectionRecord()->getConnectionString(), + ApplyManagementClusterUpdates::False, + RestoreDryRun::True, + ForceJoinNewMetacluster(deterministicRandom()->coinflip()), + &messages)); + + TraceEvent("MetaclusterRestoreWorkloadRestoreManagementClusterDryRunDone") + .detail("FromCluster", clusterItr->first) + .detail("TenantCollisions", collisions.first.size()); + + messages.clear(); + } + wait(MetaclusterAPI::restoreCluster( self->managementDb, clusterItr->first, clusterItr->second.db->getConnectionRecord()->getConnectionString(), ApplyManagementClusterUpdates::False, + RestoreDryRun::False, + ForceJoinNewMetacluster(deterministicRandom()->coinflip()), &messages)); ASSERT(collisions.first.empty() && collisions.second.empty()); @@ -842,14 +883,31 @@ struct MetaclusterRestoreWorkload : TestWorkload { std::vector> restores; for (auto [cluster, backupUrl] : backups) { - restores.push_back(restoreDataCluster( - cluster, self->dataDbs[cluster].db, backupUrl.get(), !self->recoverManagementCluster, self)); + restores.push_back(restoreDataCluster(cluster, + self->dataDbs[cluster].db, + backupUrl.get(), + !self->recoverManagementCluster, + ForceJoinNewMetacluster(deterministicRandom()->coinflip()), + self)); } wait(waitForAll(restores)); if (self->recoverManagementCluster) { wait(restoreManagementCluster(self)); + + if (deterministicRandom()->coinflip()) { + std::vector> secondRestores; + for (auto [cluster, backupUrl] : backups) { + secondRestores.push_back(restoreDataCluster(cluster, + self->dataDbs[cluster].db, + backupUrl.get(), + true, + ForceJoinNewMetacluster::True, + self)); + } + wait(waitForAll(secondRestores)); + } } return Void(); diff --git a/flow/include/flow/error_definitions.h b/flow/include/flow/error_definitions.h index bffddafe62..b42aefb2f1 100755 --- a/flow/include/flow/error_definitions.h +++ b/flow/include/flow/error_definitions.h @@ -270,6 +270,7 
@@ ERROR( tenant_creation_permanently_failed, 2168, "The tenant creation did not co ERROR( cluster_removed, 2169, "The cluster is being removed from the metacluster" ) ERROR( cluster_restoring, 2170, "The cluster is being restored to the metacluster" ) ERROR( invalid_data_cluster, 2171, "The data cluster being restored has no record of its metacluster" ) +ERROR( metacluster_mismatch, 2172, "The cluster does not have the expected name or is associated with a different metacluster" ) // 2200 - errors from bindings and official APIs ERROR( api_version_unset, 2200, "API version is not set" ) From a93407303f37c2c9b4fc41cd3154044409bb4af2 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 14 Feb 2023 12:39:56 -0800 Subject: [PATCH 48/57] Include missing tenants in the restore output if their state is already an error state --- .../fdbclient/MetaclusterManagement.actor.h | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 1b6871a5d5..9345ca22fc 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1757,21 +1757,25 @@ struct RestoreClusterImpl { // tenant) if (self->dataClusterTenantMap.find(tenantId) == self->dataClusterTenantMap.end() && managementTenant.tenantState != TenantState::REGISTERING && - managementTenant.tenantState != TenantState::REMOVING && - managementTenant.tenantState != TenantState::ERROR) { + managementTenant.tenantState != TenantState::REMOVING) { if (self->restoreDryRun) { self->messages.push_back(fmt::format("The tenant `{}' with ID {} is missing on the data cluster", printable(managementTenant.tenantName), tenantId)); } else { - missingTenants.push_back(tenantId); + // Tenants in an error state that aren't on the data cluster count as missing tenants. 
This will + // include tenants we previously marked as missing, and as new errors are added it could include + // other tenants ++missingTenantCount; - if (missingTenants.size() == CLIENT_KNOBS->METACLUSTER_RESTORE_BATCH_SIZE) { - wait(self->ctx.runManagementTransaction( - [self = self, missingTenants = missingTenants](Reference tr) { - return markManagementTenantsAsError(self, tr, missingTenants); - })); - missingTenants.clear(); + if (managementTenant.tenantState != TenantState::ERROR) { + missingTenants.push_back(tenantId); + if (missingTenants.size() == CLIENT_KNOBS->METACLUSTER_RESTORE_BATCH_SIZE) { + wait(self->ctx.runManagementTransaction([self = self, missingTenants = missingTenants]( + Reference tr) { + return markManagementTenantsAsError(self, tr, missingTenants); + })); + missingTenants.clear(); + } } } } @@ -1785,9 +1789,6 @@ struct RestoreClusterImpl { })); } - // This is a best effort attempt to communicate the number of missing tenants. If a restore needs to be run - // twice and is interrupted in the middle of the first attempt to process missing tenants, we may not report - // a full count. 
if (missingTenantCount > 0) { self->messages.push_back(fmt::format( "The metacluster has {} tenants that are missing in the restored data cluster", missingTenantCount)); From 87a71049df684d820b34001cf2a02d9186d49f0b Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 14 Feb 2023 12:52:39 -0800 Subject: [PATCH 49/57] Fix one more test toml spec --- .../to_7.3.0/ConfigureStorageMigrationTestRestart-1.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/restarting/to_7.3.0/ConfigureStorageMigrationTestRestart-1.toml b/tests/restarting/to_7.3.0/ConfigureStorageMigrationTestRestart-1.toml index b6010a92c8..b00389af57 100644 --- a/tests/restarting/to_7.3.0/ConfigureStorageMigrationTestRestart-1.toml +++ b/tests/restarting/to_7.3.0/ConfigureStorageMigrationTestRestart-1.toml @@ -5,10 +5,6 @@ disableHostname=true storageEngineExcludeTypes=[4] tenantModes=['disabled'] -[[knobs]] -# This can be removed once the lower bound of this downgrade test is a version that understands the new protocol -shard_encode_location_metadata = false - [[test]] testTitle = 'CloggedConfigureDatabaseTest' clearAfterTest = false From fe18c87ac639a69ed2e3e9ccd4e3e8198cc43f48 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 14 Feb 2023 13:05:51 -0800 Subject: [PATCH 50/57] EaR: commit proxy fetch additional cipher keys post-resolution (#9308) Commit proxy needs to fetch additional cipher keys post-resolution, since tenant ids for raw access requests and cross-tenant clear ranges are calculated after resolution. 
--- fdbclient/BlobCipher.cpp | 22 ++-- fdbclient/BlobGranuleFiles.cpp | 8 +- fdbclient/FileBackupAgent.actor.cpp | 9 +- fdbclient/include/fdbclient/BlobCipher.h | 5 + .../include/fdbclient/CommitTransaction.h | 45 ++++--- .../fdbclient/GetEncryptCipherKeys.actor.h | 23 +++- fdbserver/ApplyMetadataMutation.cpp | 20 ++- fdbserver/BackupWorker.actor.cpp | 4 +- fdbserver/CommitProxyServer.actor.cpp | 124 +++++++++--------- fdbserver/RestoreLoader.actor.cpp | 4 +- fdbserver/StorageCache.actor.cpp | 4 +- fdbserver/storageserver.actor.cpp | 8 +- fdbserver/workloads/EncryptionOps.actor.cpp | 11 +- flow/include/flow/EncryptUtils.h | 12 ++ 14 files changed, 176 insertions(+), 123 deletions(-) diff --git a/fdbclient/BlobCipher.cpp b/fdbclient/BlobCipher.cpp index bea45205f4..1653c505a4 100644 --- a/fdbclient/BlobCipher.cpp +++ b/fdbclient/BlobCipher.cpp @@ -294,6 +294,7 @@ BlobCipherMetrics::BlobCipherMetrics() FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SKETCH_ACCURACY), counterSets({ CounterSet(cc, "TLog"), + CounterSet(cc, "TLogPostResolution"), CounterSet(cc, "KVMemory"), CounterSet(cc, "KVRedwood"), CounterSet(cc, "BlobGranule"), @@ -308,6 +309,8 @@ std::string toString(BlobCipherMetrics::UsageType type) { switch (type) { case BlobCipherMetrics::UsageType::TLOG: return "TLog"; + case BlobCipherMetrics::UsageType::TLOG_POST_RESOLUTION: + return "TLogPostResolution"; case BlobCipherMetrics::UsageType::KV_MEMORY: return "KVMemory"; case BlobCipherMetrics::UsageType::KV_REDWOOD: @@ -804,7 +807,9 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference void EncryptBlobCipherAes265Ctr::init() { ASSERT(textCipherKey.isValid()); - ASSERT(headerCipherKey.isValid()); + if (FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED) { + ASSERT(headerCipherKey.isValid()); + } if (!isEncryptHeaderAuthTokenDetailsValid(authTokenMode, authTokenAlgo)) { TraceEvent(SevWarn, "InvalidAuthTokenDetails") @@ -1023,14 +1028,15 @@ Reference 
EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte ASSERT(isEncryptHeaderAuthTokenDetailsValid(authTokenMode, authTokenAlgo)); // Populate cipherText encryption-key details - header->cipherTextDetails.baseCipherId = textCipherKey->getBaseCipherId(); - header->cipherTextDetails.encryptDomainId = textCipherKey->getDomainId(); - header->cipherTextDetails.salt = textCipherKey->getSalt(); + header->cipherTextDetails = textCipherKey->details(); // Populate header encryption-key details - // TODO: HeaderCipherKey is not necessary if AuthTokenMode == NONE - header->cipherHeaderDetails.encryptDomainId = headerCipherKey->getDomainId(); - header->cipherHeaderDetails.baseCipherId = headerCipherKey->getBaseCipherId(); - header->cipherHeaderDetails.salt = headerCipherKey->getSalt(); + if (authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + header->cipherHeaderDetails = headerCipherKey->details(); + } else { + header->cipherHeaderDetails.encryptDomainId = INVALID_ENCRYPT_DOMAIN_ID; + header->cipherHeaderDetails.baseCipherId = INVALID_ENCRYPT_CIPHER_KEY_ID; + header->cipherHeaderDetails.salt = INVALID_ENCRYPT_RANDOM_SALT; + } memcpy(&header->iv[0], &iv[0], AES_256_IV_LENGTH); diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index d05ccfac66..07fc44ae2d 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -220,9 +220,7 @@ void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys, const BlobCipherEncryptHeader& header, const StringRef& ivRef) { // Validate encryption header 'cipherHeader' details sanity - if (!(header.cipherHeaderDetails.baseCipherId == eKeys.headerCipherKey->getBaseCipherId() && - header.cipherHeaderDetails.encryptDomainId == eKeys.headerCipherKey->getDomainId() && - header.cipherHeaderDetails.salt == eKeys.headerCipherKey->getSalt())) { + if (header.cipherHeaderDetails.isValid() && header.cipherHeaderDetails != eKeys.headerCipherKey->details()) { TraceEvent(SevError, 
"EncryptionHeader_CipherHeaderMismatch") .detail("HeaderDomainId", eKeys.headerCipherKey->getDomainId()) .detail("ExpectedHeaderDomainId", header.cipherHeaderDetails.encryptDomainId) @@ -233,9 +231,7 @@ void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys, throw encrypt_header_metadata_mismatch(); } // Validate encryption header 'cipherText' details sanity - if (!(header.cipherTextDetails.baseCipherId == eKeys.textCipherKey->getBaseCipherId() && - header.cipherTextDetails.encryptDomainId == eKeys.textCipherKey->getDomainId() && - header.cipherTextDetails.salt == eKeys.textCipherKey->getSalt())) { + if (!header.cipherTextDetails.isValid() || header.cipherTextDetails != eKeys.textCipherKey->details()) { TraceEvent(SevError, "EncryptionHeader_CipherTextMismatch") .detail("TextDomainId", eKeys.textCipherKey->getDomainId()) .detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId) diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index ac3f096dc0..ecb4480d3b 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -579,9 +579,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { Reference textCipherKey, BlobCipherEncryptHeader& header) { // Validate encryption header 'cipherHeader' details - if (!(header.cipherHeaderDetails.baseCipherId == headerCipherKey->getBaseCipherId() && - header.cipherHeaderDetails.encryptDomainId == headerCipherKey->getDomainId() && - header.cipherHeaderDetails.salt == headerCipherKey->getSalt())) { + if (header.cipherHeaderDetails.isValid() && header.cipherHeaderDetails != headerCipherKey->details()) { TraceEvent(SevWarn, "EncryptionHeader_CipherHeaderMismatch") .detail("HeaderDomainId", headerCipherKey->getDomainId()) .detail("ExpectedHeaderDomainId", header.cipherHeaderDetails.encryptDomainId) @@ -593,9 +591,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { } // Validate encryption text 'cipherText' 
details sanity - if (!(header.cipherTextDetails.baseCipherId == textCipherKey->getBaseCipherId() && - header.cipherTextDetails.encryptDomainId == textCipherKey->getDomainId() && - header.cipherTextDetails.salt == textCipherKey->getSalt())) { + if (!header.cipherTextDetails.isValid() || header.cipherTextDetails != textCipherKey->details()) { TraceEvent(SevWarn, "EncryptionHeader_CipherTextMismatch") .detail("TextDomainId", textCipherKey->getDomainId()) .detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId) @@ -614,7 +610,6 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { Arena* arena) { Reference const> dbInfo = cx->clientInfo; TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::RESTORE)); - ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid()); validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header); DecryptBlobCipherAes256Ctr decryptor( cipherKeys.cipherTextKey, cipherKeys.cipherHeaderKey, header.iv, BlobCipherMetrics::BACKUP); diff --git a/fdbclient/include/fdbclient/BlobCipher.h b/fdbclient/include/fdbclient/BlobCipher.h index b1b8c231d4..9d3245b07c 100644 --- a/fdbclient/include/fdbclient/BlobCipher.h +++ b/fdbclient/include/fdbclient/BlobCipher.h @@ -76,6 +76,7 @@ public: // Order of this enum has to match initializer of counterSets. enum UsageType : int { TLOG = 0, + TLOG_POST_RESOLUTION, KV_MEMORY, KV_REDWOOD, BLOB_GRANULE, @@ -169,6 +170,8 @@ struct BlobCipherDetails { const EncryptCipherRandomSalt& random) : encryptDomainId(dId), baseCipherId(bId), salt(random) {} + bool isValid() const { return encryptDomainId != INVALID_ENCRYPT_DOMAIN_ID; } + bool operator==(const BlobCipherDetails& o) const { return encryptDomainId == o.encryptDomainId && baseCipherId == o.baseCipherId && salt == o.salt; } @@ -621,6 +624,8 @@ public: return now() + INetwork::TIME_EPS >= expireAtTS ? 
true : false; } + BlobCipherDetails details() const { return BlobCipherDetails{ encryptDomainId, baseCipherId, randomSalt }; } + void reset(); private: diff --git a/fdbclient/include/fdbclient/CommitTransaction.h b/fdbclient/include/fdbclient/CommitTransaction.h index 28d5a55234..c714b1f3ac 100644 --- a/fdbclient/include/fdbclient/CommitTransaction.h +++ b/fdbclient/include/fdbclient/CommitTransaction.h @@ -169,17 +169,23 @@ struct MutationRef { Arena& arena, BlobCipherMetrics::UsageType usageType) const { ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID); - auto textCipherItr = cipherKeys.find(domainId); - auto headerCipherItr = cipherKeys.find(ENCRYPT_HEADER_DOMAIN_ID); - ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); - ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + auto getCipherKey = [&](const EncryptCipherDomainId& domainId) { + auto iter = cipherKeys.find(domainId); + ASSERT(iter != cipherKeys.end() && iter->second.isValid()); + return iter->second; + }; + Reference textCipherKey = getCipherKey(domainId); + Reference headerCipherKey; + if (FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED) { + headerCipherKey = getCipherKey(ENCRYPT_HEADER_DOMAIN_ID); + } uint8_t iv[AES_256_IV_LENGTH] = { 0 }; deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); BinaryWriter bw(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); bw << *this; EncryptBlobCipherAes265Ctr cipher( - textCipherItr->second, - headerCipherItr->second, + textCipherKey, + headerCipherKey, iv, AES_256_IV_LENGTH, getEncryptAuthTokenMode(EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE), @@ -217,27 +223,24 @@ struct MutationRef { Arena& arena, BlobCipherMetrics::UsageType usageType, StringRef* buf = nullptr) const { - const BlobCipherEncryptHeader* header = encryptionHeader(); - auto textCipherItr = cipherKeys.find(header->cipherTextDetails); - auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails); - 
ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); - ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); - TextAndHeaderCipherKeys textAndHeaderKeys; - textAndHeaderKeys.cipherHeaderKey = headerCipherItr->second; - textAndHeaderKeys.cipherTextKey = textCipherItr->second; + TextAndHeaderCipherKeys textAndHeaderKeys = getCipherKeys(cipherKeys); return decrypt(textAndHeaderKeys, arena, usageType, buf); } TextAndHeaderCipherKeys getCipherKeys( - const std::unordered_map>& cipherKeys) { + const std::unordered_map>& cipherKeys) const { const BlobCipherEncryptHeader* header = encryptionHeader(); - auto textCipherItr = cipherKeys.find(header->cipherTextDetails); - auto headerCipherItr = cipherKeys.find(header->cipherHeaderDetails); - ASSERT(textCipherItr != cipherKeys.end() && textCipherItr->second.isValid()); - ASSERT(headerCipherItr != cipherKeys.end() && headerCipherItr->second.isValid()); + auto getCipherKey = [&](const BlobCipherDetails& details) -> Reference { + if (!details.isValid()) { + return {}; + } + auto iter = cipherKeys.find(details); + ASSERT(iter != cipherKeys.end() && iter->second.isValid()); + return iter->second; + }; TextAndHeaderCipherKeys textAndHeaderKeys; - textAndHeaderKeys.cipherHeaderKey = headerCipherItr->second; - textAndHeaderKeys.cipherTextKey = textCipherItr->second; + textAndHeaderKeys.cipherHeaderKey = getCipherKey(header->cipherHeaderDetails); + textAndHeaderKeys.cipherTextKey = getCipherKey(header->cipherTextDetails); return textAndHeaderKeys; } diff --git a/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h index 84f4e2432d..6e51c82394 100644 --- a/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h +++ b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h @@ -296,6 +296,7 @@ ACTOR template Future getLatestEncryptCipherKeysForDomain(Reference const> db, EncryptCipherDomainId domainId, 
BlobCipherMetrics::UsageType usageType) { + // TODO: Do not fetch header cipher key if authentication is diabled. std::unordered_set domainIds = { domainId, ENCRYPT_HEADER_DOMAIN_ID }; std::unordered_map> cipherKeys = wait(getLatestEncryptCipherKeys(db, domainIds, usageType)); @@ -317,15 +318,23 @@ ACTOR template Future getEncryptCipherKeys(Reference const> db, BlobCipherEncryptHeader header, BlobCipherMetrics::UsageType usageType) { - std::unordered_set cipherDetails{ header.cipherTextDetails, header.cipherHeaderDetails }; + std::unordered_set cipherDetails{ header.cipherTextDetails }; + if (header.cipherHeaderDetails.isValid()) { + cipherDetails.insert(header.cipherHeaderDetails); + } std::unordered_map> cipherKeys = wait(getEncryptCipherKeys(db, cipherDetails, usageType)); - ASSERT(cipherKeys.count(header.cipherTextDetails) > 0); - ASSERT(cipherKeys.count(header.cipherHeaderDetails) > 0); - TextAndHeaderCipherKeys result{ cipherKeys.at(header.cipherTextDetails), - cipherKeys.at(header.cipherHeaderDetails) }; - ASSERT(result.cipherTextKey.isValid()); - ASSERT(result.cipherHeaderKey.isValid()); + TextAndHeaderCipherKeys result; + auto setCipherKey = [&](const BlobCipherDetails& details, Reference& cipherKey) { + if (!details.isValid()) { + return; + } + auto iter = cipherKeys.find(details); + ASSERT(iter != cipherKeys.end() && iter->second.isValid()); + cipherKey = iter->second; + }; + setCipherKey(header.cipherTextDetails, result.cipherTextKey); + setCipherKey(header.cipherHeaderDetails, result.cipherHeaderKey); return result; } diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index e91e84105e..e119174e5b 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -87,7 +87,15 @@ public: storageCache(&proxyCommitData_.storageCache), tag_popped(&proxyCommitData_.tag_popped), tssMapping(&proxyCommitData_.tssMapping), tenantMap(&proxyCommitData_.tenantMap), 
tenantNameIndex(&proxyCommitData_.tenantNameIndex), initialCommit(initialCommit_), - provisionalCommitProxy(provisionalCommitProxy_) {} + provisionalCommitProxy(provisionalCommitProxy_) { + if (encryptMode.isEncryptionEnabled()) { + ASSERT(cipherKeys != nullptr); + ASSERT(cipherKeys->count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) > 0); + if (FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED) { + ASSERT(cipherKeys->count(ENCRYPT_HEADER_DOMAIN_ID)); + } + } + } ApplyMetadataMutationsImpl(const SpanContext& spanContext_, ResolverData& resolverData_, @@ -98,7 +106,15 @@ public: cipherKeys(cipherKeys_), encryptMode(encryptMode), txnStateStore(resolverData_.txnStateStore), toCommit(resolverData_.toCommit), confChange(resolverData_.confChanges), logSystem(resolverData_.logSystem), popVersion(resolverData_.popVersion), keyInfo(resolverData_.keyInfo), storageCache(resolverData_.storageCache), - initialCommit(resolverData_.initialCommit), forResolver(true) {} + initialCommit(resolverData_.initialCommit), forResolver(true) { + if (encryptMode.isEncryptionEnabled()) { + ASSERT(cipherKeys != nullptr); + ASSERT(cipherKeys->count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID) > 0); + if (FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED) { + ASSERT(cipherKeys->count(ENCRYPT_HEADER_DOMAIN_ID)); + } + } + } private: // The following variables are incoming parameters diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 18e32c824b..0716129dee 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -107,7 +107,9 @@ struct VersionedMessage { reader >> m; const BlobCipherEncryptHeader* header = m.encryptionHeader(); cipherDetails.insert(header->cipherTextDetails); - cipherDetails.insert(header->cipherHeaderDetails); + if (header->cipherHeaderDetails.isValid()) { + cipherDetails.insert(header->cipherHeaderDetails); + } } } }; diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 5e92bcb537..cd269366ce 
100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -999,10 +999,11 @@ ACTOR Future getResolution(CommitBatchContext* self) { // Fetch cipher keys if needed. state Future>> getCipherKeys; if (pProxyCommitData->encryptMode.isEncryptionEnabled()) { - static const std::unordered_set defaultDomainIds = { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, - ENCRYPT_HEADER_DOMAIN_ID, - FDB_DEFAULT_ENCRYPT_DOMAIN_ID }; - std::unordered_set encryptDomainIds = defaultDomainIds; + std::unordered_set encryptDomainIds = { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, + FDB_DEFAULT_ENCRYPT_DOMAIN_ID }; + if (FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED) { + encryptDomainIds.insert(ENCRYPT_HEADER_DOMAIN_ID); + } // For cluster aware encryption only the default domain id is needed if (pProxyCommitData->encryptMode.mode == EncryptionAtRestMode::DOMAIN_AWARE) { for (int t = 0; t < trs.size(); t++) { @@ -1010,18 +1011,6 @@ ACTOR Future getResolution(CommitBatchContext* self) { int64_t tenantId = tenantInfo.tenantId; if (tenantId != TenantInfo::INVALID_TENANT) { encryptDomainIds.emplace(tenantId); - } else { - // Optimization: avoid enumerating mutations if cluster only serves default encryption domains - if (pProxyCommitData->tenantMap.size() > 0) { - for (auto m : trs[t].transaction.mutations) { - EncryptCipherDomainId domainId = getEncryptDetailsFromMutationRef(pProxyCommitData, m); - encryptDomainIds.emplace(domainId); - } - } else { - // Ensure default encryption domain-ids are present. - ASSERT_EQ(encryptDomainIds.count(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID), 1); - ASSERT_EQ(encryptDomainIds.count(FDB_DEFAULT_ENCRYPT_DOMAIN_ID), 1); - } } } } @@ -1151,10 +1140,14 @@ TEST_CASE("/CommitProxy/SplitRange/LowerBoundTenantId") { // t1_end), ... 
[tn_begin, b); The references are allocated on arena; std::vector splitClearRangeByTenant(Arena& arena, const MutationRef& mutation, - const std::map& tenantMap) { + const std::map& tenantMap, + std::vector* tenantIds = nullptr) { std::vector results; auto it = lowerBoundTenantId(mutation.param1, tenantMap); while (it != tenantMap.end()) { + if (tenantIds != nullptr) { + tenantIds->push_back(it->first); + } KeyRef tPrefix = TenantAPI::idToPrefix(arena, it->first); if (tPrefix >= mutation.param2) { break; @@ -1298,8 +1291,9 @@ size_t processClearRangeMutation(Arena& arena, MutationRef& mutation, int mutationIdx, int& newMutationSize, - std::vector>>& idxSplitMutations) { - std::vector newClears = splitClearRangeByTenant(arena, mutation, tenantMap); + std::vector>>& idxSplitMutations, + std::vector* tenantIds = nullptr) { + std::vector newClears = splitClearRangeByTenant(arena, mutation, tenantMap, tenantIds); if (newClears.size() == 1) { mutation = newClears[0]; } else if (newClears.size() > 1) { @@ -1377,10 +1371,12 @@ TEST_CASE("/CommitProxy/SplitRange/replaceRawClearRanges") { Error validateAndProcessTenantAccess(Arena& arena, VectorRef& mutations, ProxyCommitData* const pProxyCommitData, + std::unordered_set& rawAccessTenantIds, Optional debugId = Optional(), const char* context = "") { bool changeTenant = false; bool writeNormalKey = false; + std::vector tids; // tenant ids accessed by the raw access transaction std::vector>> idxSplitMutations; int newMutationSize = mutations.size(); @@ -1392,7 +1388,7 @@ Error validateAndProcessTenantAccess(Arena& arena, if (mutation.type == MutationRef::ClearRange) { int newClearSize = processClearRangeMutation( - arena, pProxyCommitData->tenantMap, mutation, i, newMutationSize, idxSplitMutations); + arena, pProxyCommitData->tenantMap, mutation, i, newMutationSize, idxSplitMutations, &tids); if (debugId.present()) { DisabledTraceEvent(SevDebug, "SplitTenantClearRange", pProxyCommitData->dbgid) @@ -1435,13 +1431,21 @@ Error 
validateAndProcessTenantAccess(Arena& arena, .detail("Reason", "Tenant change and normal key write in same transaction"); return illegal_tenant_access(); } + if (tenantId.present()) { + ASSERT(tenantId.get() != TenantInfo::INVALID_TENANT); + tids.push_back(tenantId.get()); + } } + rawAccessTenantIds.insert(tids.begin(), tids.end()); replaceRawClearRanges(arena, mutations, idxSplitMutations, newMutationSize); return success(); } -Error validateAndProcessTenantAccess(CommitTransactionRequest& tr, ProxyCommitData* const pProxyCommitData) { +// If the validation success, return the list of tenant Ids refered by the transaction via tenantIds. +Error validateAndProcessTenantAccess(CommitTransactionRequest& tr, + ProxyCommitData* const pProxyCommitData, + std::unordered_set& rawAccessTenantIds) { bool isValid = checkTenantNoWait(pProxyCommitData, tr.tenantInfo.tenantId, "Commit", true); if (!isValid) { return tenant_not_found(); @@ -1449,11 +1453,18 @@ Error validateAndProcessTenantAccess(CommitTransactionRequest& tr, ProxyCommitDa // only do the mutation check when the transaction use raw_access option and the tenant mode is required if (pProxyCommitData->getTenantMode() != TenantMode::REQUIRED || tr.tenantInfo.hasTenant()) { + if (tr.tenantInfo.hasTenant()) { + rawAccessTenantIds.insert(tr.tenantInfo.tenantId); + } return success(); } - return validateAndProcessTenantAccess( - tr.arena, tr.transaction.mutations, pProxyCommitData, tr.debugID, "validateAndProcessTenantAccess"); + return validateAndProcessTenantAccess(tr.arena, + tr.transaction.mutations, + pProxyCommitData, + rawAccessTenantIds, + tr.debugID, + "validateAndProcessTenantAccess"); } // Compute and apply "metadata" effects of each other proxy's most recent batch @@ -1587,13 +1598,14 @@ void determineCommittedTransactions(CommitBatchContext* self) { // This first pass through committed transactions deals with "metadata" effects (modifications of txnStateStore, changes // to storage servers' responsibilities) 
ACTOR Future applyMetadataToCommittedTransactions(CommitBatchContext* self) { - auto pProxyCommitData = self->pProxyCommitData; + state ProxyCommitData* const pProxyCommitData = self->pProxyCommitData; + state std::unordered_set rawAccessTenantIds; auto& trs = self->trs; int t; for (t = 0; t < trs.size() && !self->forceRecovery; t++) { if (self->committed[t] == ConflictBatch::TransactionCommitted && (!self->locked || trs[t].isLockAware())) { - Error e = validateAndProcessTenantAccess(trs[t], pProxyCommitData); + Error e = validateAndProcessTenantAccess(trs[t], pProxyCommitData, rawAccessTenantIds); if (e.code() != error_code_success) { trs[t].reply.sendError(e); self->committed[t] = ConflictBatch::TransactionTenantFailure; @@ -1605,8 +1617,7 @@ ACTOR Future applyMetadataToCommittedTransactions(CommitBatchContext* self pProxyCommitData->logSystem, trs[t].transaction.mutations, SERVER_KNOBS->PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ? nullptr : &self->toCommit, - pProxyCommitData->encryptMode.isEncryptionEnabled() ? &self->cipherKeys - : nullptr, + &self->cipherKeys, pProxyCommitData->encryptMode, self->forceRecovery, self->commitVersion, @@ -1660,6 +1671,23 @@ ACTOR Future applyMetadataToCommittedTransactions(CommitBatchContext* self ASSERT(false); // ChangeCoordinatorsRequest should always throw } + // If there are raw access requests or cross-tenant boundary clear ranges in the batch, tenant ids for those + // requests are availalbe only after resolution. We need to fetch additional cipher keys for these requests. 
+ if (pProxyCommitData->encryptMode == EncryptionAtRestMode::DOMAIN_AWARE && !rawAccessTenantIds.empty()) { + std::unordered_set extraDomainIds; + for (auto tenantId : rawAccessTenantIds) { + if (self->cipherKeys.count(tenantId) == 0) { + extraDomainIds.insert(tenantId); + } + } + if (!extraDomainIds.empty()) { + std::unordered_map> extraCipherKeys = + wait(getLatestEncryptCipherKeys( + pProxyCommitData->db, extraDomainIds, BlobCipherMetrics::TLOG_POST_RESOLUTION)); + self->cipherKeys.insert(extraCipherKeys.begin(), extraCipherKeys.end()); + } + } + return Void(); } @@ -1690,30 +1718,6 @@ ACTOR Future writeMutationEncryptedMutation(CommitBatchCont return encryptedMutation; } -ACTOR Future writeMutationFetchEncryptKey(CommitBatchContext* self, - int64_t tenantId, - const MutationRef* mutation, - Arena* arena) { - - state EncryptCipherDomainId domainId = tenantId; - state MutationRef encryptedMutation; - - static_assert(TenantInfo::INVALID_TENANT == INVALID_ENCRYPT_DOMAIN_ID); - ASSERT(self->pProxyCommitData->encryptMode.isEncryptionEnabled()); - ASSERT_NE((MutationRef::Type)mutation->type, MutationRef::Type::ClearRange); - - domainId = getEncryptDetailsFromMutationRef(self->pProxyCommitData, *mutation); - Reference cipherKey = - wait(getLatestEncryptCipherKey(self->pProxyCommitData->db, domainId, BlobCipherMetrics::TLOG)); - self->cipherKeys[domainId] = cipherKey; - - CODE_PROBE(true, "Raw access mutation encryption", probe::decoration::rare); - ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID); - encryptedMutation = mutation->encrypt(self->cipherKeys, domainId, *arena, BlobCipherMetrics::TLOG); - self->toCommit.writeTypedMessage(encryptedMutation); - return encryptedMutation; -} - Future writeMutation(CommitBatchContext* self, int64_t domainId, const MutationRef* mutation, @@ -1753,14 +1757,10 @@ Future writeMutation(CommitBatchContext* self, } else { if (domainId == INVALID_ENCRYPT_DOMAIN_ID) { domainId = getEncryptDetailsFromMutationRef(self->pProxyCommitData, 
*mutation); - if (self->cipherKeys.find(domainId) == self->cipherKeys.end()) { - return writeMutationFetchEncryptKey(self, domainId, mutation, arena); - } - CODE_PROBE(true, "Raw access mutation encryption"); } - ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID); + ASSERT(self->cipherKeys.count(domainId) > 0); encryptedMutation = mutation->encrypt(self->cipherKeys, domainId, *arena, BlobCipherMetrics::TLOG); } ASSERT(encryptedMutation.isEncrypted()); @@ -3219,13 +3219,11 @@ ACTOR Future processCompleteTransactionStateRequest(TransactionStateResolv tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key); } - state std::unordered_map> cipherKeys; + state std::unordered_map> systemCipherKeys; if (pContext->pCommitData->encryptMode.isEncryptionEnabled()) { - static const std::unordered_set metadataDomainIds = { SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, - ENCRYPT_HEADER_DOMAIN_ID }; - std::unordered_map> cks = - wait(getLatestEncryptCipherKeys(pContext->pCommitData->db, metadataDomainIds, BlobCipherMetrics::TLOG)); - cipherKeys = cks; + std::unordered_map> cks = wait(getLatestEncryptCipherKeys( + pContext->pCommitData->db, ENCRYPT_CIPHER_SYSTEM_DOMAINS, BlobCipherMetrics::TLOG)); + systemCipherKeys = cks; } loop { @@ -3294,7 +3292,7 @@ ACTOR Future processCompleteTransactionStateRequest(TransactionStateResolv Reference(), mutations, /* pToCommit= */ nullptr, - pContext->pCommitData->encryptMode.isEncryptionEnabled() ? 
&cipherKeys : nullptr, + &systemCipherKeys, pContext->pCommitData->encryptMode, confChanges, /* version= */ 0, diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index f97cf7f9ef..3102e273a1 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -375,8 +375,10 @@ ACTOR static Future _decryptMutation(MutationRef mutation, Database Reference const> dbInfo = cx->clientInfo; state const BlobCipherEncryptHeader* header = mutation.encryptionHeader(); std::unordered_set cipherDetails; - cipherDetails.insert(header->cipherHeaderDetails); cipherDetails.insert(header->cipherTextDetails); + if (header->cipherHeaderDetails.isValid()) { + cipherDetails.insert(header->cipherHeaderDetails); + } std::unordered_map> getCipherKeysResult = wait(getEncryptCipherKeys(dbInfo, cipherDetails, BlobCipherMetrics::BACKUP)); return mutation.decrypt(getCipherKeysResult, *arena, BlobCipherMetrics::BACKUP); diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index 1b574dab88..a097b5e414 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -1927,7 +1927,9 @@ ACTOR Future pullAsyncData(StorageCacheData* data) { if (!cipherKeys.present()) { const BlobCipherEncryptHeader* header = msg.encryptionHeader(); cipherDetails.insert(header->cipherTextDetails); - cipherDetails.insert(header->cipherHeaderDetails); + if (header->cipherHeaderDetails.isValid()) { + cipherDetails.insert(header->cipherHeaderDetails); + } collectingCipherKeys = true; } else { msg = msg.decrypt(cipherKeys.get(), cloneReader.arena(), BlobCipherMetrics::TLOG); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index d5a7a31dcf..0b68c51d77 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -3022,7 +3022,9 @@ ACTOR Future> getChangeFeedMutations(Stor if (m.isEncrypted()) { const BlobCipherEncryptHeader* header = m.encryptionHeader(); 
cipherDetails.insert(header->cipherTextDetails); - cipherDetails.insert(header->cipherHeaderDetails); + if (header->cipherHeaderDetails.isValid()) { + cipherDetails.insert(header->cipherHeaderDetails); + } } } } @@ -9257,7 +9259,9 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { if (!cipherKeys.present()) { const BlobCipherEncryptHeader* header = msg.encryptionHeader(); cipherDetails.insert(header->cipherTextDetails); - cipherDetails.insert(header->cipherHeaderDetails); + if (header->cipherHeaderDetails.isValid()) { + cipherDetails.insert(header->cipherHeaderDetails); + } collectingCipherKeys = true; } else { msg = msg.decrypt(cipherKeys.get(), eager.arena, BlobCipherMetrics::TLOG); diff --git a/fdbserver/workloads/EncryptionOps.actor.cpp b/fdbserver/workloads/EncryptionOps.actor.cpp index c1a3e06f9f..7ccb5c1a72 100644 --- a/fdbserver/workloads/EncryptionOps.actor.cpp +++ b/fdbserver/workloads/EncryptionOps.actor.cpp @@ -352,9 +352,12 @@ struct EncryptionOpsWorkload : TestWorkload { Reference cipherKey = getEncryptionKey(header.cipherTextDetails.encryptDomainId, header.cipherTextDetails.baseCipherId, header.cipherTextDetails.salt); - Reference headerCipherKey = getEncryptionKey(header.cipherHeaderDetails.encryptDomainId, - header.cipherHeaderDetails.baseCipherId, - header.cipherHeaderDetails.salt); + Reference headerCipherKey; + if (header.flags.authTokenMode != EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { + headerCipherKey = getEncryptionKey(header.cipherHeaderDetails.encryptDomainId, + header.cipherHeaderDetails.baseCipherId, + header.cipherHeaderDetails.salt); + } ASSERT(cipherKey.isValid()); ASSERT(cipherKey->isEqual(orgCipherKey)); ASSERT(headerCipherKey.isValid() || @@ -407,7 +410,7 @@ struct EncryptionOpsWorkload : TestWorkload { Reference cipherKey = getEncryptionKey(textCipherDetails.encryptDomainId, textCipherDetails.baseCipherId, textCipherDetails.salt); Reference headerCipherKey = - headerCipherDetails.encryptDomainId 
== INVALID_ENCRYPT_DOMAIN_ID + !headerCipherDetails.isValid() ? Reference() // no authentication mode cipher header-key is not needed : getEncryptionKey( headerCipherDetails.encryptDomainId, headerCipherDetails.baseCipherId, headerCipherDetails.salt); diff --git a/flow/include/flow/EncryptUtils.h b/flow/include/flow/EncryptUtils.h index 528d3fbb50..bcf5133d12 100644 --- a/flow/include/flow/EncryptUtils.h +++ b/flow/include/flow/EncryptUtils.h @@ -28,6 +28,7 @@ #include #include #include +#include constexpr const int AUTH_TOKEN_HMAC_SHA_SIZE = 32; constexpr const int AUTH_TOKEN_AES_CMAC_SIZE = 16; @@ -46,6 +47,17 @@ constexpr const EncryptCipherBaseKeyId INVALID_ENCRYPT_CIPHER_KEY_ID = 0; constexpr const EncryptCipherRandomSalt INVALID_ENCRYPT_RANDOM_SALT = 0; +static const std::unordered_set ENCRYPT_CIPHER_SYSTEM_DOMAINS = { + SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, + ENCRYPT_HEADER_DOMAIN_ID +}; + +static const std::unordered_set ENCRYPT_CIPHER_DETAULT_DOMAINS = { + SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, + ENCRYPT_HEADER_DOMAIN_ID, + FDB_DEFAULT_ENCRYPT_DOMAIN_ID, +}; + typedef enum { ENCRYPT_CIPHER_MODE_NONE = 0, ENCRYPT_CIPHER_MODE_AES_256_CTR = 1, From cc3536e48ba428de8e25300194824d9c159f917c Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 14 Feb 2023 13:43:41 -0800 Subject: [PATCH 51/57] EaR: fix BlobCipher.h build failure --- fdbclient/include/fdbclient/BlobCipher.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdbclient/include/fdbclient/BlobCipher.h b/fdbclient/include/fdbclient/BlobCipher.h index 9d3245b07c..c82bcf5776 100644 --- a/fdbclient/include/fdbclient/BlobCipher.h +++ b/fdbclient/include/fdbclient/BlobCipher.h @@ -170,8 +170,6 @@ struct BlobCipherDetails { const EncryptCipherRandomSalt& random) : encryptDomainId(dId), baseCipherId(bId), salt(random) {} - bool isValid() const { return encryptDomainId != INVALID_ENCRYPT_DOMAIN_ID; } - bool operator==(const BlobCipherDetails& o) const { return encryptDomainId == o.encryptDomainId && baseCipherId == 
o.baseCipherId && salt == o.salt; } From d99fb2ee028c8ff09d9626552c5922062b35cca7 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 14 Feb 2023 14:53:14 -0800 Subject: [PATCH 52/57] Remove storageEngineExcludeTypes from blob tests I.e., enable rocksDB for these tests. --- tests/fast/BlobGranuleMoveVerifyCycle.toml | 2 -- tests/fast/BlobGranuleVerifyAtomicOps.toml | 2 -- tests/fast/BlobGranuleVerifyCycle.toml | 2 -- tests/fast/BlobGranuleVerifySmall.toml | 2 -- tests/fast/BlobGranuleVerifySmallClean.toml | 2 -- tests/rare/BlobGranuleRanges.toml | 2 -- tests/slow/BlobGranuleCorrectness.toml | 2 -- tests/slow/BlobGranuleCorrectnessClean.toml | 2 -- tests/slow/BlobGranuleVerifyBalance.toml | 2 -- tests/slow/BlobGranuleVerifyBalanceClean.toml | 2 -- tests/slow/BlobGranuleVerifyLarge.toml | 2 -- tests/slow/BlobGranuleVerifyLargeClean.toml | 2 -- 12 files changed, 24 deletions(-) diff --git a/tests/fast/BlobGranuleMoveVerifyCycle.toml b/tests/fast/BlobGranuleMoveVerifyCycle.toml index 5152fd65f4..421f3003c5 100644 --- a/tests/fast/BlobGranuleMoveVerifyCycle.toml +++ b/tests/fast/BlobGranuleMoveVerifyCycle.toml @@ -2,8 +2,6 @@ testClass = "BlobGranule" blobGranulesEnabled = true allowDefaultTenant = false -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4] [[test]] testTitle = 'BlobGranuleMoveVerifyCycle' diff --git a/tests/fast/BlobGranuleVerifyAtomicOps.toml b/tests/fast/BlobGranuleVerifyAtomicOps.toml index 6d26499967..a9756043e9 100644 --- a/tests/fast/BlobGranuleVerifyAtomicOps.toml +++ b/tests/fast/BlobGranuleVerifyAtomicOps.toml @@ -4,8 +4,6 @@ blobGranulesEnabled = true allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyAtomicOps' diff --git a/tests/fast/BlobGranuleVerifyCycle.toml b/tests/fast/BlobGranuleVerifyCycle.toml index b3e4fdedcd..d5db339a34 100644 --- 
a/tests/fast/BlobGranuleVerifyCycle.toml +++ b/tests/fast/BlobGranuleVerifyCycle.toml @@ -4,8 +4,6 @@ blobGranulesEnabled = true allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyCycle' diff --git a/tests/fast/BlobGranuleVerifySmall.toml b/tests/fast/BlobGranuleVerifySmall.toml index 42b37c4f04..c761441d8b 100644 --- a/tests/fast/BlobGranuleVerifySmall.toml +++ b/tests/fast/BlobGranuleVerifySmall.toml @@ -4,8 +4,6 @@ blobGranulesEnabled = true allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifySmall' diff --git a/tests/fast/BlobGranuleVerifySmallClean.toml b/tests/fast/BlobGranuleVerifySmallClean.toml index 8fb36517e6..2b6e9f8ea2 100644 --- a/tests/fast/BlobGranuleVerifySmallClean.toml +++ b/tests/fast/BlobGranuleVerifySmallClean.toml @@ -1,8 +1,6 @@ [configuration] blobGranulesEnabled = true allowDefaultTenant = false -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] testClass = "BlobGranule" [[test]] diff --git a/tests/rare/BlobGranuleRanges.toml b/tests/rare/BlobGranuleRanges.toml index 85f7dd7fb7..3ae8e10876 100644 --- a/tests/rare/BlobGranuleRanges.toml +++ b/tests/rare/BlobGranuleRanges.toml @@ -3,8 +3,6 @@ blobGranulesEnabled = true allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleRanges' diff --git a/tests/slow/BlobGranuleCorrectness.toml b/tests/slow/BlobGranuleCorrectness.toml index c5f5369359..bc44be7fc0 100644 --- a/tests/slow/BlobGranuleCorrectness.toml +++ b/tests/slow/BlobGranuleCorrectness.toml @@ -4,8 +4,6 @@ allowDefaultTenant = false tenantModes = ['optional', 'required'] injectTargetedSSRestart 
= true injectSSDelay = true -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] encryptModes = ['domain_aware', 'cluster_aware'] [[knobs]] diff --git a/tests/slow/BlobGranuleCorrectnessClean.toml b/tests/slow/BlobGranuleCorrectnessClean.toml index 22c5bd4dc0..cfdab46df5 100644 --- a/tests/slow/BlobGranuleCorrectnessClean.toml +++ b/tests/slow/BlobGranuleCorrectnessClean.toml @@ -2,8 +2,6 @@ blobGranulesEnabled = true allowDefaultTenant = false tenantModes = ['optional', 'required'] -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] encryptModes = ['domain_aware', 'cluster_aware'] [[knobs]] diff --git a/tests/slow/BlobGranuleVerifyBalance.toml b/tests/slow/BlobGranuleVerifyBalance.toml index e610ff6299..57956ef0f1 100644 --- a/tests/slow/BlobGranuleVerifyBalance.toml +++ b/tests/slow/BlobGranuleVerifyBalance.toml @@ -3,8 +3,6 @@ blobGranulesEnabled = true allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyBalance' diff --git a/tests/slow/BlobGranuleVerifyBalanceClean.toml b/tests/slow/BlobGranuleVerifyBalanceClean.toml index 5a5627f95f..e3636cd9de 100644 --- a/tests/slow/BlobGranuleVerifyBalanceClean.toml +++ b/tests/slow/BlobGranuleVerifyBalanceClean.toml @@ -1,8 +1,6 @@ [configuration] blobGranulesEnabled = true allowDefaultTenant = false -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyBalanceClean' diff --git a/tests/slow/BlobGranuleVerifyLarge.toml b/tests/slow/BlobGranuleVerifyLarge.toml index dffb8579c8..219724a66f 100644 --- a/tests/slow/BlobGranuleVerifyLarge.toml +++ b/tests/slow/BlobGranuleVerifyLarge.toml @@ -3,8 +3,6 @@ blobGranulesEnabled = true allowDefaultTenant = false injectTargetedSSRestart = true injectSSDelay = true -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes 
= [4, 5] [[test]] testTitle = 'BlobGranuleVerifyLarge' diff --git a/tests/slow/BlobGranuleVerifyLargeClean.toml b/tests/slow/BlobGranuleVerifyLargeClean.toml index 7f2f1ce423..981bd75869 100644 --- a/tests/slow/BlobGranuleVerifyLargeClean.toml +++ b/tests/slow/BlobGranuleVerifyLargeClean.toml @@ -1,8 +1,6 @@ [configuration] blobGranulesEnabled = true allowDefaultTenant = false -# FIXME: re-enable rocks at some point -# storageEngineExcludeTypes = [4, 5] [[test]] testTitle = 'BlobGranuleVerifyLargeClean' From bf85c9f8afffe3e3edc052fd85655075fd5c156f Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Tue, 14 Feb 2023 16:46:09 -0800 Subject: [PATCH 53/57] Backup Mutation Log Separates Tenant Map Modifications During Restore (#9292) mutation log separates tenant map modifications --- fdbclient/BackupAgentBase.actor.cpp | 152 ++++++++++++++---- fdbclient/TenantManagement.actor.cpp | 10 ++ .../fdbclient/TenantManagement.actor.h | 1 + fdbserver/CommitProxyServer.actor.cpp | 19 +-- 4 files changed, 136 insertions(+), 46 deletions(-) diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index 5c7ffda624..7dd2f25ab7 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -23,6 +23,7 @@ #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BlobCipher.h" +#include "fdbclient/CommitProxyInterface.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/GetEncryptCipherKeys.actor.h" @@ -290,10 +291,28 @@ std::pair decodeBKMutationLogKey(Key key) { bigEndian32(*(int32_t*)(key.begin() + backupLogPrefixBytes + sizeof(UID) + sizeof(uint8_t) + sizeof(int64_t)))); } +void _addResult(bool* tenantMapChanging, + VectorRef* result, + int* mutationSize, + Arena* arena, + MutationRef logValue, + KeyRangeRef tenantMapRange) { + *tenantMapChanging = *tenantMapChanging || TenantAPI::tenantMapChanging(logValue, tenantMapRange); + result->push_back_deep(*arena, logValue); + 
*mutationSize += logValue.expectedSize(); +} + +/* + This actor is responsible for taking an original transaction which was added to the backup mutation log (represented + by "value" parameter), breaking it up into the individual MutationRefs (that constitute the transaction), decrypting + each mutation (if needed) and adding/removing prefixes from the mutations. The final mutations are then added to the + "result" vector alongside their encrypted counterparts (which is added to the "encryptedResult" vector) +*/ ACTOR static Future decodeBackupLogValue(Arena* arena, VectorRef* result, VectorRef>* encryptedResult, int* mutationSize, + bool* tenantMapChanging, Standalone value, Key addPrefix, Key removePrefix, @@ -325,6 +344,7 @@ ACTOR static Future decodeBackupLogValue(Arena* arena, state int originalOffset = offset; state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx)); + state KeyRangeRef tenantMapRange = TenantMetadata::tenantMap().subspace; while (consumed < totalBytes) { uint32_t type = 0; @@ -410,8 +430,7 @@ ACTOR static Future decodeBackupLogValue(Arena* arena, logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); } logValue.param2 = addPrefix == StringRef() ? 
allKeys.end : strinc(addPrefix, tempArena); - result->push_back_deep(*arena, logValue); - *mutationSize += logValue.expectedSize(); + _addResult(tenantMapChanging, result, mutationSize, arena, logValue, tenantMapRange); } else { logValue.param1 = std::max(r.range().begin, range.begin); logValue.param2 = minKey; @@ -423,8 +442,7 @@ ACTOR static Future decodeBackupLogValue(Arena* arena, logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); logValue.param2 = logValue.param2.withPrefix(addPrefix, tempArena); } - result->push_back_deep(*arena, logValue); - *mutationSize += logValue.expectedSize(); + _addResult(tenantMapChanging, result, mutationSize, arena, logValue, tenantMapRange); } if (originalLogValue.param1 == logValue.param1 && originalLogValue.param2 == logValue.param2) { encryptedResult->push_back_deep(*arena, encryptedLogValue); @@ -443,8 +461,7 @@ ACTOR static Future decodeBackupLogValue(Arena* arena, if (addPrefix.size()) { logValue.param1 = logValue.param1.withPrefix(addPrefix, tempArena); } - result->push_back_deep(*arena, logValue); - *mutationSize += logValue.expectedSize(); + _addResult(tenantMapChanging, result, mutationSize, arena, logValue, tenantMapRange); // If we did not remove/add prefixes to the mutation then keep the original encrypted mutation so we // do not have to re-encrypt unnecessarily if (originalLogValue.param1 == logValue.param1 && originalLogValue.param2 == logValue.param2) { @@ -695,6 +712,41 @@ Future readCommitted(Database cx, cx, results, Void(), lock, range, groupBy, Terminator::True, AccessSystemKeys::True, LockAware::True); } +ACTOR Future sendCommitTransactionRequest(CommitTransactionRequest req, + Key uid, + Version newBeginVersion, + Key rangeBegin, + NotifiedVersion* committedVersion, + int* totalBytes, + int* mutationSize, + PromiseStream> addActor, + FlowLock* commitLock, + PublicRequestStream commit) { + Key applyBegin = uid.withPrefix(applyMutationsBeginRange.begin); + Key versionKey = 
BinaryWriter::toValue(newBeginVersion, Unversioned()); + Key rangeEnd = getApplyKey(newBeginVersion, uid); + + // mutations and encrypted mutations (and their relationship) is described in greater detail in the defenition of + // CommitTransactionRef in CommitTransaction.h + req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::SetValue, applyBegin, versionKey)); + req.transaction.encryptedMutations.push_back_deep(req.arena, Optional()); + req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(applyBegin)); + req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::ClearRange, rangeBegin, rangeEnd)); + req.transaction.encryptedMutations.push_back_deep(req.arena, Optional()); + req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(rangeBegin)); + + // The commit request contains no read conflict ranges, so regardless of what read version we + // choose, it's impossible for us to get a transaction_too_old error back, and it's impossible + // for our transaction to be aborted due to conflicts. 
+ req.transaction.read_snapshot = committedVersion->get(); + req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; + + *totalBytes += *mutationSize; + wait(commitLock->take(TaskPriority::DefaultYield, *mutationSize)); + addActor.send(commitLock->releaseWhen(success(commit.getReply(req)), *mutationSize)); + return Void(); +} + ACTOR Future kvMutationLogToTransactions(Database cx, PromiseStream results, Reference lock, @@ -717,20 +769,26 @@ ACTOR Future kvMutationLogToTransactions(Database cx, state CommitTransactionRequest req; state Version newBeginVersion = invalidVersion; state int mutationSize = 0; + state bool tenantMapChanging = false; loop { try { state RCGroup group = waitNext(results.getFuture()); + state CommitTransactionRequest curReq; lock->release(group.items.expectedSize()); + state int curBatchMutationSize = 0; + tenantMapChanging = false; BinaryWriter bw(Unversioned()); for (int i = 0; i < group.items.size(); ++i) { bw.serializeBytes(group.items[i].value); } + // Parse a single transaction from the backup mutation log Standalone value = bw.toValue(); - wait(decodeBackupLogValue(&req.arena, - &req.transaction.mutations, - &req.transaction.encryptedMutations, - &mutationSize, + wait(decodeBackupLogValue(&curReq.arena, + &curReq.transaction.mutations, + &curReq.transaction.encryptedMutations, + &curBatchMutationSize, + &tenantMapChanging, value, addPrefix, removePrefix, @@ -739,8 +797,48 @@ ACTOR Future kvMutationLogToTransactions(Database cx, cx, tenantMap, provisionalProxy)); + + // A single call to decodeBackupLogValue (above) will only parse mutations from a single transaction, + // however in the code below we batch the results across several calls to decodeBackupLogValue and send + // it in one big CommitTransactionRequest (so one CTR contains mutations from multiple transactions). + // Generally, this would be fine since the mutations in the log are ordered (and thus so are the results + // after calling decodeBackupLogValue). 
However in the CommitProxy we do not allow mutations which + // change the tenant map to appear alongside regular normalKey mutations in a single + // CommitTransactionRequest. Thus the code below will immediately send any mutations accumulated thus + // far if the latest call to decodeBackupLogValue contained a transaction which changed the tenant map + // (before processing the mutations which caused the tenant map to change). + if (tenantMapChanging && req.transaction.mutations.size()) { + // If the tenantMap is changing send the previous CommitTransactionRequest to the CommitProxy + TraceEvent("MutationLogRestoreTenantMapChanging").detail("BeginVersion", newBeginVersion); + CODE_PROBE(true, "mutation log tenant map changing"); + wait(sendCommitTransactionRequest(req, + uid, + newBeginVersion, + rangeBegin, + committedVersion, + &totalBytes, + &mutationSize, + addActor, + commitLock, + commit)); + req = CommitTransactionRequest(); + mutationSize = 0; + } + + state int i; + for (i = 0; i < curReq.transaction.mutations.size(); i++) { + req.transaction.mutations.push_back_deep(req.arena, curReq.transaction.mutations[i]); + req.transaction.encryptedMutations.push_back_deep(req.arena, + curReq.transaction.encryptedMutations[i]); + } + mutationSize += curBatchMutationSize; newBeginVersion = group.groupKey + 1; - if (mutationSize >= CLIENT_KNOBS->BACKUP_LOG_WRITE_BATCH_MAX_SIZE) { + + // At this point if the tenant map changed we would have already sent any normalKey mutations + // accumulated thus far, so all that's left to do is to send all the mutations in the offending + // transaction that changed the tenant map. This is necessary so that we don't batch these tenant map + // mutations with future normalKey mutations (which will result in the same problem discussed above). 
+ if (tenantMapChanging || mutationSize >= CLIENT_KNOBS->BACKUP_LOG_WRITE_BATCH_MAX_SIZE) { break; } } catch (Error& e) { @@ -756,28 +854,16 @@ ACTOR Future kvMutationLogToTransactions(Database cx, throw; } } - - Key applyBegin = uid.withPrefix(applyMutationsBeginRange.begin); - Key versionKey = BinaryWriter::toValue(newBeginVersion, Unversioned()); - Key rangeEnd = getApplyKey(newBeginVersion, uid); - - req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::SetValue, applyBegin, versionKey)); - req.transaction.encryptedMutations.push_back_deep(req.arena, Optional()); - req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(applyBegin)); - req.transaction.mutations.push_back_deep(req.arena, MutationRef(MutationRef::ClearRange, rangeBegin, rangeEnd)); - req.transaction.encryptedMutations.push_back_deep(req.arena, Optional()); - req.transaction.write_conflict_ranges.push_back_deep(req.arena, singleKeyRange(rangeBegin)); - - // The commit request contains no read conflict ranges, so regardless of what read version we - // choose, it's impossible for us to get a transaction_too_old error back, and it's impossible - // for our transaction to be aborted due to conflicts. 
- req.transaction.read_snapshot = committedVersion->get(); - req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; - - totalBytes += mutationSize; - wait(commitLock->take(TaskPriority::DefaultYield, mutationSize)); - addActor.send(commitLock->releaseWhen(success(commit.getReply(req)), mutationSize)); - + wait(sendCommitTransactionRequest(req, + uid, + newBeginVersion, + rangeBegin, + committedVersion, + &totalBytes, + &mutationSize, + addActor, + commitLock, + commit)); if (endOfStream) { return totalBytes; } diff --git a/fdbclient/TenantManagement.actor.cpp b/fdbclient/TenantManagement.actor.cpp index 955adbf200..9c1b81d247 100644 --- a/fdbclient/TenantManagement.actor.cpp +++ b/fdbclient/TenantManagement.actor.cpp @@ -67,6 +67,16 @@ int64_t extractTenantIdFromKeyRef(StringRef s) { return TenantAPI::prefixToId(prefix, EnforceValidTenantId::False); } +bool tenantMapChanging(MutationRef const& mutation, KeyRangeRef const& tenantMapRange) { + if (isSingleKeyMutation((MutationRef::Type)mutation.type) && mutation.param1.startsWith(tenantMapRange.begin)) { + return true; + } else if (mutation.type == MutationRef::ClearRange && + tenantMapRange.intersects(KeyRangeRef(mutation.param1, mutation.param2))) { + return true; + } + return false; +} + // validates whether the lastTenantId and the nextTenantId share the same 2 byte prefix bool nextTenantIdPrefixMatches(int64_t lastTenantId, int64_t nextTenantId) { if (getTenantIdPrefix(nextTenantId) != getTenantIdPrefix(lastTenantId)) { diff --git a/fdbclient/include/fdbclient/TenantManagement.actor.h b/fdbclient/include/fdbclient/TenantManagement.actor.h index 768ad06844..f48997a0cc 100644 --- a/fdbclient/include/fdbclient/TenantManagement.actor.h +++ b/fdbclient/include/fdbclient/TenantManagement.actor.h @@ -126,6 +126,7 @@ Future checkTenantMode(Transaction tr, ClusterType expectedClusterType) { TenantMode tenantModeForClusterType(ClusterType clusterType, TenantMode tenantMode); int64_t 
extractTenantIdFromMutation(MutationRef m); int64_t extractTenantIdFromKeyRef(StringRef s); +bool tenantMapChanging(MutationRef const& mutation, KeyRangeRef const& tenantMapRange); bool nextTenantIdPrefixMatches(int64_t lastTenantId, int64_t nextTenantId); int64_t getMaxAllowableTenantId(int64_t curTenantId); int64_t getTenantIdPrefix(int64_t tenantId); diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index cd269366ce..fe93531364 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1070,17 +1070,6 @@ bool validTenantAccess(MutationRef m, std::map const& tenan return true; } -inline bool tenantMapChanging(MutationRef const& mutation) { - const KeyRangeRef tenantMapRange = TenantMetadata::tenantMap().subspace; - if (isSingleKeyMutation((MutationRef::Type)mutation.type) && mutation.param1.startsWith(tenantMapRange.begin)) { - return true; - } else if (mutation.type == MutationRef::ClearRange && - tenantMapRange.intersects(KeyRangeRef(mutation.param1, mutation.param2))) { - return true; - } - return false; -} - // return an iterator to the first tenantId whose idToPrefix(id) >= prefix[0..8] in lexicographic order. 
If no such id, // return tenantMap.end() inline auto lowerBoundTenantId(const StringRef& prefix, const std::map& tenantMap) { @@ -1380,11 +1369,12 @@ Error validateAndProcessTenantAccess(Arena& arena, std::vector>> idxSplitMutations; int newMutationSize = mutations.size(); + KeyRangeRef tenantMapRange = TenantMetadata::tenantMap().subspace; for (int i = 0; i < mutations.size(); ++i) { auto& mutation = mutations[i]; Optional tenantId; bool validAccess = true; - changeTenant = changeTenant || tenantMapChanging(mutation); + changeTenant = changeTenant || TenantAPI::tenantMapChanging(mutation, tenantMapRange); if (mutation.type == MutationRef::ClearRange) { int newClearSize = processClearRangeMutation( @@ -1471,6 +1461,7 @@ Error validateAndProcessTenantAccess(CommitTransactionRequest& tr, void applyMetadataEffect(CommitBatchContext* self) { bool initialState = self->isMyFirstBatch; self->firstStateMutations = self->isMyFirstBatch; + KeyRangeRef tenantMapRange = TenantMetadata::tenantMap().subspace; for (int versionIndex = 0; versionIndex < self->resolution[0].stateMutations.size(); versionIndex++) { // pProxyCommitData->logAdapter->setNextVersion( ??? 
); << Ideally we would be telling the log adapter that the // pushes in this commit will be in the version at which these state mutations were committed by another proxy, @@ -1492,7 +1483,9 @@ void applyMetadataEffect(CommitBatchContext* self) { // fail transaction if it contain both of tenant changes and normal key writing auto& mutations = self->resolution[0].stateMutations[versionIndex][transactionIndex].mutations; committed = - tenantIds.get().empty() || std::none_of(mutations.begin(), mutations.end(), tenantMapChanging); + tenantIds.get().empty() || std::none_of(mutations.begin(), mutations.end(), [&](MutationRef m) { + return TenantAPI::tenantMapChanging(m, tenantMapRange); + }); // check if all tenant ids are valid if committed == true committed = committed && From c27cdcd2df0aecc5973829c0542858d4f380850f Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 14 Feb 2023 18:27:50 -0800 Subject: [PATCH 54/57] Fix arm nightly tests by skipping "until" restarting tests if no old binaries (#9362) * Fix the joshua bug for restarting tests with until keyword * Remove unnecessary changes * Remove unnecessary changes * Remove unnecessary changes * Add comments, remove debugging symbols --- contrib/TestHarness2/test_harness/run.py | 28 +++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/contrib/TestHarness2/test_harness/run.py b/contrib/TestHarness2/test_harness/run.py index 3f6f02cc36..33108784ed 100644 --- a/contrib/TestHarness2/test_harness/run.py +++ b/contrib/TestHarness2/test_harness/run.py @@ -58,7 +58,7 @@ class StatFetcher: class TestPicker: - def __init__(self, test_dir: Path): + def __init__(self, test_dir: Path, binaries: OrderedDict[Version, Path]): if not test_dir.exists(): raise RuntimeError("{} is neither a directory nor a file".format(test_dir)) self.include_files_regex = re.compile(config.include_test_files) @@ -69,6 +69,7 @@ class TestPicker: self.tests: OrderedDict[str, TestDescription] = 
collections.OrderedDict() self.restart_test: Pattern = re.compile(r".*-\d+\.(txt|toml)") self.follow_test: Pattern = re.compile(r".*-[2-9]\d*\.(txt|toml)") + self.old_binaries: OrderedDict[Version, Path] = binaries for subdir in self.test_dir.iterdir(): if subdir.is_dir() and subdir.name in config.test_dirs: @@ -85,6 +86,10 @@ class TestPicker: else: self.fetch_stats() + if not self.tests: + raise Exception( + "No tests to run! Please check if tests are included/excluded incorrectly or old binaries are missing for restarting tests") + def add_time(self, test_file: Path, run_time: int, out: SummaryTree) -> None: # getting the test name is fairly inefficient. But since we only have 100s of tests, I won't bother test_name: str | None = None @@ -132,6 +137,23 @@ class TestPicker: or self.exclude_files_regex.search(str(path)) is not None ): return + # Skip restarting tests that do not have old binaries in the given version range + # In particular, this is only for restarting tests with the "until" keyword, + # since without "until", it will at least run with the current binary. 
+ if is_restarting_test(path): + candidates: List[Path] = [] + dirs = path.parent.parts + version_expr = dirs[-1].split("_") + if (version_expr[0] == "from" or version_expr[0] == "to") and len(version_expr) == 4 and version_expr[2] == "until": + max_version = Version.parse(version_expr[3]) + min_version = Version.parse(version_expr[1]) + for ver, binary in self.old_binaries.items(): + if min_version <= ver < max_version: + candidates.append(binary) + if not len(candidates): + # No valid old binary found + return + with path.open("r") as f: test_name: str | None = None test_class: str | None = None @@ -263,7 +285,7 @@ class OldBinaries: max_version = Version.parse(version_expr[3]) candidates: List[Path] = [] for ver, binary in self.binaries.items(): - if min_version <= ver <= max_version: + if min_version <= ver < max_version: candidates.append(binary) if len(candidates) == 0: return config.binary @@ -474,7 +496,7 @@ class TestRunner: self.cluster_file: str | None = None self.fdb_app_dir: str | None = None self.binary_chooser = OldBinaries() - self.test_picker = TestPicker(self.test_path) + self.test_picker = TestPicker(self.test_path, self.binary_chooser.binaries) def backup_sim_dir(self, seed: int): temp_dir = config.run_dir / str(self.uid) From 3d882a99c55309de8e8d323094a459c338e495c4 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 14 Feb 2023 20:19:27 -0800 Subject: [PATCH 55/57] EaR: Refactor encryption header std::variant serializer and versioning (#9345) Changes: 1. Make binary serializer natively support `std::variant`. Serialize size is 1 byte (the type index, i.e. `std::variant::index()`), plus the serialize size of the actual type stored in the `std::variant`. Update `BlobCipherEncryptHeaderRef` to use the `std::variant` binary serializer 3. Remove `flagsVersion` and `algoHeaderVersion` from `BlobCipherEncryptHeaderRef`. The former is replaced by `flags.index() + 1`, and the latter is moved into each of the algorithm-specific sub-headers. 
Each sub-header types will have nesting version-specific subtypes to handle serialization of that specific version (e.g. for `AesCtrNoAuth` it has a `AesCtrNoAuthV1` subtype). --- fdbclient/BlobCipher.cpp | 400 +++++++++----------- fdbclient/BlobGranuleFiles.cpp | 10 +- fdbclient/include/fdbclient/BlobCipher.h | 306 ++++++++------- fdbserver/workloads/EncryptionOps.actor.cpp | 34 +- flow/include/flow/serialize.h | 31 ++ 5 files changed, 378 insertions(+), 403 deletions(-) diff --git a/fdbclient/BlobCipher.cpp b/fdbclient/BlobCipher.cpp index 1653c505a4..f51e0e318a 100644 --- a/fdbclient/BlobCipher.cpp +++ b/fdbclient/BlobCipher.cpp @@ -40,6 +40,7 @@ #include "flow/Trace.h" #include "flow/UnitTest.h" #include "flow/xxhash.h" +#include "include/fdbclient/BlobCipher.h" #include #include @@ -115,29 +116,33 @@ void validateEncryptHeaderAlgoHeaderVersion(const EncryptCipherMode cipherMode, // BlobCipherEncryptHeaderRef uint32_t BlobCipherEncryptHeaderRef::getHeaderSize(const int flagVersion, - const int authAlgoVersion, + const int algoVersion, const EncryptCipherMode cipherMode, const EncryptAuthTokenMode authMode, const EncryptAuthTokenAlgo authAlgo) { - uint32_t total = sizeof(uint16_t) * 2; // sizeof(flagVersion + algoHeaderVersion) if (flagVersion != 1) { + TraceEvent("BlobCipherGetHeaderSizeInvalidFlagVersion").detail("FlagVersion", flagVersion); + throw not_implemented(); + } + if (algoVersion != 1) { + TraceEvent("BlobCipherGetHeaderSizeInvalidAlgoVersion").detail("AlgoVersion", algoVersion); throw not_implemented(); } - total += sizeof(BlobCipherEncryptHeaderFlagsV1); + uint32_t total = sizeof(BlobCipherEncryptHeaderFlagsV1) + 2; // 2 bytes of std::variant index if (cipherMode != ENCRYPT_CIPHER_MODE_AES_256_CTR) { throw not_implemented(); } if (authMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { - total += sizeof(AesCtrNoAuthV1); + total += AesCtrNoAuth::getSize(); } else { if (authAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA) { - total += 
sizeof(AesCtrWithAuthV1); + total += AesCtrWithHmac::getSize(); } else { ASSERT_EQ(authAlgo, ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); - total += sizeof(AesCtrWithAuthV1); + total += AesCtrWithCmac::getSize(); } } return total; @@ -146,18 +151,18 @@ uint32_t BlobCipherEncryptHeaderRef::getHeaderSize(const int flagVersion, const uint8_t* BlobCipherEncryptHeaderRef::getIV() const { ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); - validateEncryptHeaderFlagVersion(flagsVersion); - ASSERT_EQ(flagsVersion, 1); + validateEncryptHeaderFlagVersion(flagsVersion()); + ASSERT_EQ(flagsVersion(), 1); BlobCipherEncryptHeaderFlagsV1 flags = std::get(this->flags); validateEncryptHeaderAlgoHeaderVersion((EncryptCipherMode)flags.encryptMode, (EncryptAuthTokenMode)flags.authTokenMode, (EncryptAuthTokenAlgo)flags.authTokenAlgo, - algoHeaderVersion); - ASSERT_EQ(algoHeaderVersion, 1); + algoHeaderVersion()); + ASSERT_EQ(algoHeaderVersion(), 1); - return std::visit([](auto& h) { return h.iv; }, algoHeader); + return std::visit([](auto& h) { return h.v1.iv; }, algoHeader); } template @@ -166,26 +171,25 @@ inline constexpr bool always_false_v = false; const EncryptHeaderCipherDetails BlobCipherEncryptHeaderRef::getCipherDetails() const { ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); - validateEncryptHeaderFlagVersion(flagsVersion); - ASSERT_EQ(flagsVersion, 1); + validateEncryptHeaderFlagVersion(flagsVersion()); + ASSERT_EQ(flagsVersion(), 1); BlobCipherEncryptHeaderFlagsV1 flags = std::get(this->flags); validateEncryptHeaderAlgoHeaderVersion((EncryptCipherMode)flags.encryptMode, (EncryptAuthTokenMode)flags.authTokenMode, (EncryptAuthTokenAlgo)flags.authTokenAlgo, - algoHeaderVersion); - ASSERT_EQ(algoHeaderVersion, 1); + algoHeaderVersion()); + ASSERT_EQ(algoHeaderVersion(), 1); // TODO: Replace with "Overload visitor pattern" someday. 
return std::visit( [](auto&& h) { using T = std::decay_t; - if constexpr (std::is_same_v) { - return EncryptHeaderCipherDetails(h.cipherTextDetails); - } else if constexpr (std::is_same_v> || - std::is_same_v>) { - return EncryptHeaderCipherDetails(h.cipherTextDetails, h.cipherHeaderDetails); + if constexpr (std::is_same_v) { + return EncryptHeaderCipherDetails(h.v1.cipherTextDetails); + } else if constexpr (std::is_same_v || std::is_same_v) { + return EncryptHeaderCipherDetails(h.v1.cipherTextDetails, h.v1.cipherHeaderDetails); } else { static_assert(always_false_v, "Unknown encryption authentication"); } @@ -198,16 +202,16 @@ void BlobCipherEncryptHeaderRef::validateEncryptionHeaderDetails(const BlobCiphe const StringRef& ivRef) const { ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); - validateEncryptHeaderFlagVersion(flagsVersion); - ASSERT_EQ(flagsVersion, 1); + validateEncryptHeaderFlagVersion(flagsVersion()); + ASSERT_EQ(flagsVersion(), 1); BlobCipherEncryptHeaderFlagsV1 flags = std::get(this->flags); validateEncryptHeaderAlgoHeaderVersion((EncryptCipherMode)flags.encryptMode, (EncryptAuthTokenMode)flags.authTokenMode, (EncryptAuthTokenAlgo)flags.authTokenAlgo, - algoHeaderVersion); - ASSERT_EQ(algoHeaderVersion, 1); + algoHeaderVersion()); + ASSERT_EQ(algoHeaderVersion(), 1); BlobCipherDetails persistedTextCipherDetails; BlobCipherDetails persistedHeaderCipherDetails; @@ -217,14 +221,13 @@ void BlobCipherEncryptHeaderRef::validateEncryptionHeaderDetails(const BlobCiphe return std::visit( [&persistedTextCipherDetails, &persistedHeaderCipherDetails, &persistedIV](auto&& h) { using T = std::decay_t; - if constexpr (std::is_same_v) { - persistedTextCipherDetails = h.cipherTextDetails; - persistedIV = (uint8_t*)&h.iv[0]; - } else if constexpr (std::is_same_v> || - std::is_same_v>) { - persistedTextCipherDetails = h.cipherTextDetails; - persistedHeaderCipherDetails = h.cipherHeaderDetails; - persistedIV = (uint8_t*)&h.iv[0]; + if constexpr (std::is_same_v) { 
+ persistedTextCipherDetails = h.v1.cipherTextDetails; + persistedIV = (uint8_t*)&h.v1.iv[0]; + } else if constexpr (std::is_same_v || std::is_same_v) { + persistedTextCipherDetails = h.v1.cipherTextDetails; + persistedHeaderCipherDetails = h.v1.cipherHeaderDetails; + persistedIV = (uint8_t*)&h.v1.iv[0]; } else { static_assert(always_false_v, "Unknown encryption authentication"); } @@ -829,72 +832,66 @@ void EncryptBlobCipherAes265Ctr::init() { } } -template +template void EncryptBlobCipherAes265Ctr::setCipherAlgoHeaderWithAuthV1(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeaderFlagsV1& flags, - BlobCipherEncryptHeaderRef* headerRef, - Arena& arena) { + BlobCipherEncryptHeaderRef* headerRef) { // Construct algorithm specific details except 'authToken', serialize the details into 'headerRef' to allow // authToken generation - AesCtrWithAuthV1 algoHeader( + AesCtrWithAuthV1 algoHeader( BlobCipherDetails(textCipherKey->getDomainId(), textCipherKey->getBaseCipherId(), textCipherKey->getSalt()), BlobCipherDetails( headerCipherKey->getDomainId(), headerCipherKey->getBaseCipherId(), headerCipherKey->getSalt()), iv, - AES_256_IV_LENGTH, - arena); - headerRef->algoHeader = algoHeader; + AES_256_IV_LENGTH); + headerRef->algoHeader = AesCtrWithAuth(algoHeader); // compute the authentication token Standalone serialized = BlobCipherEncryptHeaderRef::toStringRef(*headerRef); - uint8_t computed[AuthTokenSize]; + uint8_t computed[Params::authTokenSize]; computeAuthToken({ { ciphertext, ciphertextLen }, { serialized.begin(), serialized.size() } }, headerCipherKey->rawCipher(), AES_256_KEY_LENGTH, &computed[0], (EncryptAuthTokenAlgo)flags.authTokenAlgo, AUTH_TOKEN_MAX_SIZE); - memcpy(&algoHeader.authToken[0], &computed[0], AuthTokenSize); + memcpy(&algoHeader.authToken[0], &computed[0], Params::authTokenSize); // Populate headerRef algorithm specific header details headerRef->algoHeader = algoHeader; } void 
EncryptBlobCipherAes265Ctr::setCipherAlgoHeaderNoAuthV1(const BlobCipherEncryptHeaderFlagsV1& flags, - BlobCipherEncryptHeaderRef* headerRef, - Arena& arena) { + BlobCipherEncryptHeaderRef* headerRef) { ASSERT_EQ(flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE); AesCtrNoAuthV1 aesCtrNoAuth( BlobCipherDetails(textCipherKey->getDomainId(), textCipherKey->getBaseCipherId(), textCipherKey->getSalt()), iv, - AES_256_IV_LENGTH, - arena); - headerRef->algoHeader = aesCtrNoAuth; + AES_256_IV_LENGTH); + headerRef->algoHeader = AesCtrNoAuth(aesCtrNoAuth); } void EncryptBlobCipherAes265Ctr::setCipherAlgoHeaderV1(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeaderFlagsV1& flags, - BlobCipherEncryptHeaderRef* headerRef, - Arena& arena) { + BlobCipherEncryptHeaderRef* headerRef) { ASSERT_EQ(1, getEncryptAlgoHeaderVersion((EncryptAuthTokenMode)flags.authTokenMode, (EncryptAuthTokenAlgo)flags.authTokenAlgo)); if (flags.authTokenMode == EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { - setCipherAlgoHeaderNoAuthV1(flags, headerRef, arena); + setCipherAlgoHeaderNoAuthV1(flags, headerRef); } else if (flags.authTokenAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC) { - setCipherAlgoHeaderWithAuthV1(ciphertext, ciphertextLen, flags, headerRef, arena); + setCipherAlgoHeaderWithAuthV1(ciphertext, ciphertextLen, flags, headerRef); } else { ASSERT_EQ(flags.authTokenAlgo, ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA); - setCipherAlgoHeaderWithAuthV1(ciphertext, ciphertextLen, flags, headerRef, arena); + setCipherAlgoHeaderWithAuthV1(ciphertext, ciphertextLen, flags, headerRef); } } void EncryptBlobCipherAes265Ctr::updateEncryptHeaderFlagsV1(BlobCipherEncryptHeaderRef* headerRef, - BlobCipherEncryptHeaderFlagsV1* flags, - Arena& arena) { + BlobCipherEncryptHeaderFlagsV1* flags) { // Populate encryption header flags details flags->encryptMode = ENCRYPT_CIPHER_MODE_AES_256_CTR; @@ -905,22 +902,18 @@ void 
EncryptBlobCipherAes265Ctr::updateEncryptHeaderFlagsV1(BlobCipherEncryptHea void EncryptBlobCipherAes265Ctr::updateEncryptHeader(const uint8_t* ciphertext, const int ciphertextLen, - BlobCipherEncryptHeaderRef* headerRef, - Arena& arena) { + BlobCipherEncryptHeaderRef* headerRef) { ASSERT_LE(CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION, std::numeric_limits::max()); - - headerRef->flagsVersion = CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION; + ASSERT_EQ(1, CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); // update header flags - ASSERT_EQ(headerRef->flagsVersion, 1); BlobCipherEncryptHeaderFlagsV1 flags; - updateEncryptHeaderFlagsV1(headerRef, &flags, arena); + updateEncryptHeaderFlagsV1(headerRef, &flags); // update cipher algo header int algoHeaderVersion = getEncryptAlgoHeaderVersion(authTokenMode, authTokenAlgo); ASSERT_EQ(algoHeaderVersion, 1); - headerRef->algoHeaderVersion = algoHeaderVersion; - setCipherAlgoHeaderV1(ciphertext, ciphertextLen, flags, headerRef, arena); + setCipherAlgoHeaderV1(ciphertext, ciphertextLen, flags, headerRef); } StringRef EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plaintext, @@ -960,7 +953,7 @@ StringRef EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plaintext, // Ensure encryption header authToken details sanity ASSERT(isEncryptHeaderAuthTokenDetailsValid(authTokenMode, authTokenAlgo)); - updateEncryptHeader(ciphertext, plaintextLen, headerRef, arena); + updateEncryptHeader(ciphertext, plaintextLen, headerRef); if (CLIENT_KNOBS->ENABLE_ENCRYPTION_CPU_TIME_LOGGING) { BlobCipherMetrics::counters(usageType).encryptCPUTimeNS += int64_t((timer_monotonic() - startTime) * 1e9); } @@ -1100,27 +1093,26 @@ DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference } } -template +template void DecryptBlobCipherAes256Ctr::validateAuthTokenV1(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeaderFlagsV1& flags, const BlobCipherEncryptHeaderRef& headerRef) { ASSERT_EQ(flags.encryptMode, 
ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); - ASSERT_LE(AuthTokenSize, AUTH_TOKEN_MAX_SIZE); + ASSERT_LE(Params::authTokenSize, AUTH_TOKEN_MAX_SIZE); Arena tmpArena; - uint8_t persited[AuthTokenSize]; - uint8_t computed[AuthTokenSize]; + uint8_t persited[Params::authTokenSize]; + uint8_t computed[Params::authTokenSize]; // prepare the payload {cipherText + encryptionHeader} // ensure the 'authToken' is reset before computing the 'authentication token' BlobCipherEncryptHeaderRef headerRefCopy = BlobCipherEncryptHeaderRef(headerRef); - AesCtrWithAuthV1 algoHeaderCopy = - std::get>(headerRefCopy.algoHeader); + AesCtrWithAuth algoHeaderCopy = std::get>(headerRefCopy.algoHeader); // preserve the 'persisted' token for future validation before reseting the field - memcpy(&persited[0], &algoHeaderCopy.authToken[0], AuthTokenSize); - memset(&algoHeaderCopy.authToken[0], 0, AuthTokenSize); + memcpy(&persited[0], &algoHeaderCopy.v1.authToken[0], Params::authTokenSize); + memset(&algoHeaderCopy.v1.authToken[0], 0, Params::authTokenSize); headerRefCopy.algoHeader = algoHeaderCopy; Standalone serializedHeader = BlobCipherEncryptHeaderRef::toStringRef(headerRefCopy); @@ -1131,12 +1123,12 @@ void DecryptBlobCipherAes256Ctr::validateAuthTokenV1(const uint8_t* ciphertext, (EncryptAuthTokenAlgo)flags.authTokenAlgo, AUTH_TOKEN_MAX_SIZE); - if (memcmp(&persited[0], &computed[0], AuthTokenSize) != 0) { + if (memcmp(&persited[0], &computed[0], Params::authTokenSize) != 0) { TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch") - .detail("HeaderFlagsVersion", headerRef.flagsVersion) + .detail("HeaderFlagsVersion", headerRef.flagsVersion()) .detail("HeaderMode", flags.encryptMode) - .detail("SingleAuthToken", StringRef(tmpArena, persited, AuthTokenSize)) - .detail("ComputedSingleAuthToken", StringRef(tmpArena, computed, AuthTokenSize)); + .detail("SingleAuthToken", StringRef(tmpArena, persited, Params::authTokenSize)) + .detail("ComputedSingleAuthToken", StringRef(tmpArena, 
computed, Params::authTokenSize)); CODE_PROBE(flags.authTokenAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, "ConfigurableEncryption: AuthToken value mismatch - HMAC_SHA auth token generation"); @@ -1155,10 +1147,10 @@ void DecryptBlobCipherAes256Ctr::validateHeaderSingleAuthTokenV1(const uint8_t* // ensure the 'authToken' is reset before computing the 'authentication token' if (flags.authTokenAlgo == EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC) { - validateAuthTokenV1(ciphertext, ciphertextLen, flags, headerRef); + validateAuthTokenV1(ciphertext, ciphertextLen, flags, headerRef); } else { ASSERT_EQ(flags.authTokenAlgo, EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA); - validateAuthTokenV1(ciphertext, ciphertextLen, flags, headerRef); + validateAuthTokenV1(ciphertext, ciphertextLen, flags, headerRef); } } @@ -1199,9 +1191,9 @@ void DecryptBlobCipherAes256Ctr::validateEncryptHeader(const uint8_t* ciphertext const BlobCipherEncryptHeaderRef& headerRef, EncryptAuthTokenMode* authTokenMode, EncryptAuthTokenAlgo* authTokenAlgo) { - if (headerRef.flagsVersion > CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION) { + if (headerRef.flagsVersion() != 1) { TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeader") - .detail("HeaderVersion", headerRef.flagsVersion) + .detail("HeaderVersion", headerRef.flagsVersion()) .detail("MaxSupportedVersion", CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); CODE_PROBE(true, "ConfigurableEncryption: Encryption header version unsupported"); @@ -1209,12 +1201,12 @@ void DecryptBlobCipherAes256Ctr::validateEncryptHeader(const uint8_t* ciphertext throw encrypt_header_metadata_mismatch(); } - if (headerRef.flagsVersion != 1) { + if (headerRef.flagsVersion() != 1) { throw not_implemented(); } BlobCipherEncryptHeaderFlagsV1 flags = std::get(headerRef.flags); - validateEncryptHeaderFlagsV1(headerRef.flagsVersion, flags); + validateEncryptHeaderFlagsV1(headerRef.flagsVersion(), flags); validateAuthTokensV1(ciphertext, 
ciphertextLen, flags, headerRef); *authTokenMode = (EncryptAuthTokenMode)flags.authTokenMode; @@ -1278,8 +1270,7 @@ StringRef DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphertext, void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext, const int ciphertextLen, - const BlobCipherEncryptHeader& header, - Arena& arena) { + const BlobCipherEncryptHeader& header) { // prepare the payload {cipherText + encryptionHeader} // ensure the 'authToken' is reset before computing the 'authentication token' BlobCipherEncryptHeader headerCopy; @@ -1302,8 +1293,7 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciph TraceEvent(SevWarn, "BlobCipherVerifyEncryptBlobHeaderAuthTokenMismatch") .detail("HeaderVersion", header.flags.headerVersion) .detail("HeaderMode", header.flags.encryptMode) - .detail("SingleAuthToken", - StringRef(arena, &header.singleAuthToken.authToken[0], AUTH_TOKEN_MAX_SIZE).toString()) + .detail("SingleAuthToken", StringRef(&header.singleAuthToken.authToken[0], AUTH_TOKEN_MAX_SIZE).toString()) .detail("ComputedSingleAuthToken", StringRef(computed, AUTH_TOKEN_MAX_SIZE)); CODE_PROBE(header.flags.authTokenAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA, @@ -1317,10 +1307,9 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciph void DecryptBlobCipherAes256Ctr::verifyAuthTokens(const uint8_t* ciphertext, const int ciphertextLen, - const BlobCipherEncryptHeader& header, - Arena& arena) { + const BlobCipherEncryptHeader& header) { ASSERT_EQ(header.flags.authTokenMode, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); - verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header, arena); + verifyHeaderSingleAuthToken(ciphertext, ciphertextLen, header); authTokensValidationDone = true; } @@ -1365,7 +1354,7 @@ Reference DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert Reference decrypted = makeReference(allocSize, arena); if (header.flags.authTokenMode 
!= EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { - verifyAuthTokens(ciphertext, ciphertextLen, header, arena); + verifyAuthTokens(ciphertext, ciphertextLen, header); ASSERT(authTokensValidationDone); } @@ -1799,34 +1788,31 @@ void testConfigurableEncryptionAesCtrNoAuthV1Ser(const int minDomainId) { BlobCipherEncryptHeaderFlagsV1 flags = BlobCipherEncryptHeaderFlagsV1( ENCRYPT_CIPHER_MODE_AES_256_CTR, ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE, ENCRYPT_HEADER_AUTH_TOKEN_ALGO_NONE); - size += sizeof(BlobCipherEncryptHeaderFlagsV1); - size += sizeof(uint16_t) * 2; - - headerRef.flagsVersion = CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION; - headerRef.algoHeaderVersion = CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_NO_AUTH_VERSION; + size += sizeof(BlobCipherEncryptHeaderFlagsV1) + 2; headerRef.flags = flags; - AesCtrNoAuthV1 noAuth; - noAuth.cipherTextDetails = BlobCipherDetails(1, 2, 23); - deterministicRandom()->randomBytes(&noAuth.iv[0], AES_256_IV_LENGTH); - Standalone serAlgo = AesCtrNoAuthV1::toStringRef(noAuth, arena); + AesCtrNoAuth noAuth; + noAuth.v1.cipherTextDetails = BlobCipherDetails(1, 2, 23); + deterministicRandom()->randomBytes(&noAuth.v1.iv[0], AES_256_IV_LENGTH); + Standalone serAlgo = AesCtrNoAuth::toStringRef(noAuth); ASSERT_EQ(serAlgo.size(), sizeof(noAuth)); - size += sizeof(noAuth); + size += AesCtrNoAuth::getSize(); headerRef.algoHeader = noAuth; Standalone serHeader = BlobCipherEncryptHeaderRef::toStringRef(headerRef); ASSERT_EQ(serHeader.size(), size); ASSERT_EQ(size, - BlobCipherEncryptHeaderRef::getHeaderSize(headerRef.flagsVersion, - headerRef.algoHeaderVersion, + BlobCipherEncryptHeaderRef::getHeaderSize(headerRef.flagsVersion(), + headerRef.algoHeaderVersion(), (EncryptCipherMode)flags.encryptMode, (EncryptAuthTokenMode)flags.authTokenMode, (EncryptAuthTokenAlgo)flags.authTokenAlgo)); } -template +template void testConfigurableEncryptionAesCtrWithAuthSer(const int minDomainId) { + constexpr bool isHmac = std::is_same_v; 
ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); Arena arena; @@ -1836,32 +1822,27 @@ void testConfigurableEncryptionAesCtrWithAuthSer(const int minDomainId) { BlobCipherEncryptHeaderFlagsV1 flags = BlobCipherEncryptHeaderFlagsV1( ENCRYPT_CIPHER_MODE_AES_256_CTR, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE, - AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE ? ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA - : ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); - size += (sizeof(BlobCipherEncryptHeaderFlagsV1) + 2 * sizeof(uint16_t)); + isHmac ? ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA : ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); + size += sizeof(BlobCipherEncryptHeaderFlagsV1) + 2; headerRef.flags = flags; - headerRef.flagsVersion = CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION; - headerRef.algoHeaderVersion = AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE - ? CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_HMAC_SHA_AUTH_VERSION - : CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_AES_CMAC_AUTH_VERSION; - AesCtrWithAuthV1 withAuth; - withAuth.cipherTextDetails = BlobCipherDetails(1, 2, 23); - withAuth.cipherHeaderDetails = BlobCipherDetails(ENCRYPT_HEADER_DOMAIN_ID, 2, 23); - deterministicRandom()->randomBytes(&withAuth.iv[0], AES_256_IV_LENGTH); - deterministicRandom()->randomBytes(&withAuth.authToken[0], AuthTokenSize); - Standalone serAlgo = AesCtrWithAuthV1::toStringRef(withAuth, arena); + AesCtrWithAuth withAuth; + withAuth.v1.cipherTextDetails = BlobCipherDetails(1, 2, 23); + withAuth.v1.cipherHeaderDetails = BlobCipherDetails(ENCRYPT_HEADER_DOMAIN_ID, 2, 23); + deterministicRandom()->randomBytes(&withAuth.v1.iv[0], AES_256_IV_LENGTH); + deterministicRandom()->randomBytes(&withAuth.v1.authToken[0], Params::authTokenSize); + Standalone serAlgo = AesCtrWithAuth::toStringRef(withAuth); ASSERT_EQ(serAlgo.size(), sizeof(withAuth)); - size += sizeof(withAuth); + size += AesCtrWithAuth::getSize(); headerRef.algoHeader = withAuth; Standalone serHeader = BlobCipherEncryptHeaderRef::toStringRef(headerRef); 
ASSERT_EQ(serHeader.size(), size); ASSERT_EQ(size, - BlobCipherEncryptHeaderRef::getHeaderSize(headerRef.flagsVersion, - headerRef.algoHeaderVersion, + BlobCipherEncryptHeaderRef::getHeaderSize(headerRef.flagsVersion(), + headerRef.algoHeaderVersion(), (EncryptCipherMode)flags.encryptMode, (EncryptAuthTokenMode)flags.authTokenMode, (EncryptAuthTokenAlgo)flags.authTokenAlgo)); @@ -1896,7 +1877,7 @@ void testConfigurableEncryptionHeaderNoAuthMode(const int minDomainId) { encryptor.encrypt(&orgData[0], bufLen, &headerRef, arena); BlobCipherEncryptHeaderFlagsV1 flags = std::get(headerRef.flags); - AesCtrNoAuthV1 noAuth = std::get(headerRef.algoHeader); + AesCtrNoAuth noAuth = std::get(headerRef.algoHeader); const uint8_t* headerIV = headerRef.getIV(); ASSERT_EQ(memcmp(&headerIV[0], &iv[0], AES_256_IV_LENGTH), 0); @@ -1912,13 +1893,13 @@ void testConfigurableEncryptionHeaderNoAuthMode(const int minDomainId) { BlobCipherEncryptHeaderFlagsV1 validateFlags = std::get(validateHeader.flags); ASSERT(validateFlags == flags); - AesCtrNoAuthV1 validateAlgo = std::get(validateHeader.algoHeader); - ASSERT(validateAlgo.cipherTextDetails == noAuth.cipherTextDetails); - ASSERT_EQ(memcmp(&validateAlgo.iv[0], &noAuth.iv[0], AES_256_IV_LENGTH), 0); + AesCtrNoAuth validateAlgo = std::get(validateHeader.algoHeader); + ASSERT(validateAlgo.v1.cipherTextDetails == noAuth.v1.cipherTextDetails); + ASSERT_EQ(memcmp(&validateAlgo.v1.iv[0], &noAuth.v1.iv[0], AES_256_IV_LENGTH), 0); TraceEvent("NoAuthHeaderSize") .detail("Flags", sizeof(flags)) - .detail("AlgoHeader", sizeof(noAuth)) + .detail("AlgoHeader", noAuth.getSize()) .detail("TotalHeader", serHeaderRef.size()); TraceEvent("TestConfigurableEncryptionHeader").detail("Mode", "No-Auth"); @@ -1953,45 +1934,30 @@ void testConfigurableEncryptionNoAuthMode(const int minDomainId) { StringRef encryptedBuf = encryptor.encrypt(&orgData[0], bufLen, &headerRef, arena); // validate header version details - AesCtrNoAuthV1 noAuth = 
std::get(headerRef.algoHeader); - Reference tCipherKeyKey = cipherKeyCache->getCipherKey( - noAuth.cipherTextDetails.encryptDomainId, noAuth.cipherTextDetails.baseCipherId, noAuth.cipherTextDetails.salt); + AesCtrNoAuth noAuth = std::get(headerRef.algoHeader); + Reference tCipherKeyKey = cipherKeyCache->getCipherKey(noAuth.v1.cipherTextDetails.encryptDomainId, + noAuth.v1.cipherTextDetails.baseCipherId, + noAuth.v1.cipherTextDetails.salt); ASSERT(tCipherKeyKey->isEqual(cipherKey)); DecryptBlobCipherAes256Ctr decryptor( - tCipherKeyKey, Reference(), &noAuth.iv[0], BlobCipherMetrics::TEST); + tCipherKeyKey, Reference(), &noAuth.v1.iv[0], BlobCipherMetrics::TEST); StringRef decryptedBuf = decryptor.decrypt(encryptedBuf.begin(), encryptedBuf.size(), headerRef, arena); ASSERT_EQ(decryptedBuf.size(), bufLen); ASSERT_EQ(memcmp(decryptedBuf.begin(), &orgData[0], bufLen), 0); TraceEvent("BlobCipherTestEncryptDecryptDone") - .detail("HeaderFlagsVersion", headerRef.flagsVersion) - .detail("AlgoHeaderVersion", headerRef.algoHeaderVersion) + .detail("HeaderFlagsVersion", headerRef.flagsVersion()) + .detail("AlgoHeaderVersion", headerRef.algoHeaderVersion()) .detail("HeaderEncryptMode", ENCRYPT_CIPHER_MODE_AES_256_CTR) .detail("HeaderEncryptAuthTokenMode", ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) .detail("HeaderEncryptAuthTokenAlgo", ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) - .detail("DomainId", noAuth.cipherTextDetails.encryptDomainId) - .detail("BaseCipherId", noAuth.cipherTextDetails.baseCipherId) - .detail("Salt", noAuth.cipherTextDetails.salt); - - // induce encryption header corruption - headerVersion corrupted - BlobCipherEncryptHeaderRef corruptedHeaderRef = BlobCipherEncryptHeaderRef(headerRef); - corruptedHeaderRef.flagsVersion += 1; - try { - encryptedBuf = encryptor.encrypt(&orgData[0], bufLen, &headerRef, arena); - DecryptBlobCipherAes256Ctr decryptor( - tCipherKeyKey, Reference(), &iv[0], BlobCipherMetrics::TEST); - decryptedBuf = decryptor.decrypt(encryptedBuf.begin(), 
bufLen, corruptedHeaderRef, arena); - ASSERT(false); // error expected - } catch (Error& e) { - if (e.code() != error_code_encrypt_header_metadata_mismatch) { - throw; - } - TraceEvent("ConfigurableEncryptionNoAuthCorruptFlagsDone"); - } + .detail("DomainId", noAuth.v1.cipherTextDetails.encryptDomainId) + .detail("BaseCipherId", noAuth.v1.cipherTextDetails.baseCipherId) + .detail("Salt", noAuth.v1.cipherTextDetails.salt); // induce encryption header corruption - encryptionMode corrupted - corruptedHeaderRef = BlobCipherEncryptHeaderRef(headerRef); + BlobCipherEncryptHeaderRef corruptedHeaderRef = BlobCipherEncryptHeaderRef(headerRef); BlobCipherEncryptHeaderFlagsV1 corruptedFlags = std::get(headerRef.flags); corruptedFlags.encryptMode += 1; corruptedHeaderRef.flags = corruptedFlags; @@ -2029,12 +1995,12 @@ void testConfigurableEncryptionNoAuthMode(const int minDomainId) { // validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_SINGLE // HMAC_SHA authToken algorithm -template +template void testSingleAuthMode(const int minDomainId) { - const std::string authAlgoStr = AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE ? "HMAC-SHA" : "AES-CMAC"; - const EncryptAuthTokenAlgo authAlgo = AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE - ? EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA - : EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC; + constexpr bool isHmac = std::is_same_v; + const std::string authAlgoStr = isHmac ? "HMAC-SHA" : "AES-CMAC"; + const EncryptAuthTokenAlgo authAlgo = isHmac ? 
EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA + : EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC; TraceEvent("BlobCipherTestSingleAuthTokenStart").detail("Mode", authAlgoStr); @@ -2074,7 +2040,8 @@ void testSingleAuthMode(const int minDomainId) { .detail("HeaderEncryptAuthTokenAlgo", header.flags.authTokenAlgo) .detail("DomainId", header.cipherTextDetails.encryptDomainId) .detail("BaseCipherId", header.cipherTextDetails.baseCipherId) - .detail("HeaderAuthToken", StringRef(arena, &header.singleAuthToken.authToken[0], AuthTokenSize).toString()); + .detail("HeaderAuthToken", + StringRef(arena, &header.singleAuthToken.authToken[0], Params::authTokenSize).toString()); Reference tCipherKeyKey = cipherKeyCache->getCipherKey( header.cipherTextDetails.encryptDomainId, header.cipherTextDetails.baseCipherId, header.cipherTextDetails.salt); @@ -2128,7 +2095,7 @@ void testSingleAuthMode(const int minDomainId) { memcpy(reinterpret_cast(&headerCopy), reinterpret_cast(&header), sizeof(BlobCipherEncryptHeader)); - int hIdx = deterministicRandom()->randomInt(0, AuthTokenSize - 1); + int hIdx = deterministicRandom()->randomInt(0, Params::authTokenSize - 1); headerCopy.singleAuthToken.authToken[hIdx] += 1; try { DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, header.iv, BlobCipherMetrics::TEST); @@ -2158,12 +2125,12 @@ void testSingleAuthMode(const int minDomainId) { TraceEvent("BlobCipherTestSingleAuthTokenEnd").detail("Mode", authAlgoStr); } -template +template void testConfigurableEncryptionHeaderSingleAuthMode(int minDomainId) { + constexpr bool isHmac = std::is_same_v; ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); - TraceEvent("TestEncryptionHeaderStart") - .detail("Mode", AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE ? "HMAC_SHA" : "AES-CMAC"); + TraceEvent("TestEncryptionHeaderStart").detail("Mode", isHmac ? 
"HMAC_SHA" : "AES-CMAC"); Reference cipherKeyCache = BlobCipherKeyCache::getInstance(); @@ -2182,7 +2149,7 @@ void testConfigurableEncryptionHeaderSingleAuthMode(int minDomainId) { iv, AES_256_IV_LENGTH, EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE, - AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE + std::is_same_v ? EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA : EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC, BlobCipherMetrics::TEST); @@ -2190,7 +2157,7 @@ void testConfigurableEncryptionHeaderSingleAuthMode(int minDomainId) { encryptor.encrypt(&orgData[0], bufLen, &headerRef, arena); BlobCipherEncryptHeaderFlagsV1 flags = std::get(headerRef.flags); - AesCtrWithAuthV1 algoHeader = std::get>(headerRef.algoHeader); + AesCtrWithAuth algoHeader = std::get>(headerRef.algoHeader); const uint8_t* headerIV = headerRef.getIV(); ASSERT_EQ(memcmp(&headerIV[0], &iv[0], AES_256_IV_LENGTH), 0); @@ -2209,31 +2176,29 @@ void testConfigurableEncryptionHeaderSingleAuthMode(int minDomainId) { BlobCipherEncryptHeaderFlagsV1 validateFlags = std::get(validateHeader.flags); ASSERT(validateFlags == flags); - AesCtrWithAuthV1 validateAlgo = std::get>(validateHeader.algoHeader); - ASSERT(validateAlgo.cipherTextDetails == algoHeader.cipherTextDetails); - ASSERT(validateAlgo.cipherHeaderDetails == algoHeader.cipherHeaderDetails); - ASSERT_EQ(memcmp(&iv[0], &validateAlgo.iv[0], AES_256_IV_LENGTH), 0); - ASSERT_EQ(memcmp(&algoHeader.authToken[0], &validateAlgo.authToken[0], AuthTokenSize), 0); + AesCtrWithAuth validateAlgo = std::get>(validateHeader.algoHeader); + ASSERT(validateAlgo.v1.cipherTextDetails == algoHeader.v1.cipherTextDetails); + ASSERT(validateAlgo.v1.cipherHeaderDetails == algoHeader.v1.cipherHeaderDetails); + ASSERT_EQ(memcmp(&iv[0], &validateAlgo.v1.iv[0], AES_256_IV_LENGTH), 0); + ASSERT_EQ(memcmp(&algoHeader.v1.authToken[0], &validateAlgo.v1.authToken[0], Params::authTokenSize), 0); TraceEvent("HeaderSize") .detail("Flags", 
sizeof(flags)) - .detail("AlgoHeader", sizeof(algoHeader)) + .detail("AlgoHeader", algoHeader.getSize()) .detail("TotalHeader", serHeaderRef.size()); - TraceEvent("TestEncryptionHeaderEnd") - .detail("Mode", AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE ? "HMAC_SHA" : "AES-CMAC"); + TraceEvent("TestEncryptionHeaderEnd").detail("Mode", isHmac ? "HMAC_SHA" : "AES-CMAC"); } // validate basic encrypt followed by decrypt operation for AUTH_TOKEN_MODE_SINGLE -template +template void testConfigurableEncryptionSingleAuthMode(const int minDomainId) { - const std::string authAlgoStr = AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE ? "HMAC-SHA" : "AES-CMAC"; - const EncryptAuthTokenAlgo authAlgo = AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE - ? EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA - : EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC; - const int algoHeaderVersion = AuthTokenSize == AUTH_TOKEN_HMAC_SHA_SIZE - ? CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_HMAC_SHA_AUTH_VERSION - : CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_AES_CMAC_AUTH_VERSION; + constexpr bool isHmac = std::is_same_v; + const std::string authAlgoStr = isHmac ? "HMAC-SHA" : "AES-CMAC"; + const EncryptAuthTokenAlgo authAlgo = isHmac ? EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA + : EncryptAuthTokenAlgo::ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC; + const int algoHeaderVersion = isHmac ? 
CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_HMAC_SHA_AUTH_VERSION + : CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_AES_CMAC_AUTH_VERSION; ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); @@ -2263,8 +2228,8 @@ void testConfigurableEncryptionSingleAuthMode(const int minDomainId) { ASSERT_EQ(encryptedBuf.size(), bufLen); ASSERT_NE(memcmp(&orgData[0], encryptedBuf.begin(), bufLen), 0); - ASSERT_EQ(headerRef.flagsVersion, CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); - ASSERT_EQ(headerRef.algoHeaderVersion, algoHeaderVersion); + ASSERT_EQ(headerRef.flagsVersion(), CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); + ASSERT_EQ(headerRef.algoHeaderVersion(), algoHeaderVersion); // validate flags BlobCipherEncryptHeaderFlagsV1 flags = std::get(headerRef.flags); @@ -2273,61 +2238,46 @@ void testConfigurableEncryptionSingleAuthMode(const int minDomainId) { ASSERT_EQ(flags.authTokenAlgo, authAlgo); // validate IV - AesCtrWithAuthV1 withAuth = std::get>(headerRef.algoHeader); - ASSERT_EQ(memcmp(&iv[0], &withAuth.iv[0], AES_256_IV_LENGTH), 0); + AesCtrWithAuth withAuth = std::get>(headerRef.algoHeader); + ASSERT_EQ(memcmp(&iv[0], &withAuth.v1.iv[0], AES_256_IV_LENGTH), 0); ASSERT_NE(memcmp(&orgData[0], encryptedBuf.begin(), bufLen), 0); // validate cipherKey details - ASSERT_EQ(withAuth.cipherTextDetails.encryptDomainId, cipherKey->getDomainId()); - ASSERT_EQ(withAuth.cipherTextDetails.baseCipherId, cipherKey->getBaseCipherId()); - ASSERT_EQ(withAuth.cipherTextDetails.salt, cipherKey->getSalt()); - ASSERT_EQ(withAuth.cipherHeaderDetails.encryptDomainId, headerCipherKey->getDomainId()); - ASSERT_EQ(withAuth.cipherHeaderDetails.baseCipherId, headerCipherKey->getBaseCipherId()); - ASSERT_EQ(withAuth.cipherHeaderDetails.salt, headerCipherKey->getSalt()); + ASSERT_EQ(withAuth.v1.cipherTextDetails.encryptDomainId, cipherKey->getDomainId()); + ASSERT_EQ(withAuth.v1.cipherTextDetails.baseCipherId, cipherKey->getBaseCipherId()); + ASSERT_EQ(withAuth.v1.cipherTextDetails.salt, cipherKey->getSalt()); + 
ASSERT_EQ(withAuth.v1.cipherHeaderDetails.encryptDomainId, headerCipherKey->getDomainId()); + ASSERT_EQ(withAuth.v1.cipherHeaderDetails.baseCipherId, headerCipherKey->getBaseCipherId()); + ASSERT_EQ(withAuth.v1.cipherHeaderDetails.salt, headerCipherKey->getSalt()); - Reference tCipherKeyKey = cipherKeyCache->getCipherKey(withAuth.cipherTextDetails.encryptDomainId, - withAuth.cipherTextDetails.baseCipherId, - withAuth.cipherTextDetails.salt); - Reference hCipherKey = cipherKeyCache->getCipherKey(withAuth.cipherHeaderDetails.encryptDomainId, - withAuth.cipherHeaderDetails.baseCipherId, - withAuth.cipherHeaderDetails.salt); + Reference tCipherKeyKey = cipherKeyCache->getCipherKey(withAuth.v1.cipherTextDetails.encryptDomainId, + withAuth.v1.cipherTextDetails.baseCipherId, + withAuth.v1.cipherTextDetails.salt); + Reference hCipherKey = cipherKeyCache->getCipherKey(withAuth.v1.cipherHeaderDetails.encryptDomainId, + withAuth.v1.cipherHeaderDetails.baseCipherId, + withAuth.v1.cipherHeaderDetails.salt); ASSERT(tCipherKeyKey->isEqual(cipherKey)); ASSERT(hCipherKey->isEqual(headerCipherKey)); - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &withAuth.iv[0], BlobCipherMetrics::TEST); + DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &withAuth.v1.iv[0], BlobCipherMetrics::TEST); StringRef decryptedBuf = decryptor.decrypt(encryptedBuf.begin(), bufLen, headerRef, arena); ASSERT_EQ(decryptedBuf.size(), bufLen); ASSERT_EQ(memcmp(decryptedBuf.begin(), &orgData[0], bufLen), 0); TraceEvent("BlobCipherTestEncryptDecryptDone") - .detail("HeaderFlagsVersion", headerRef.flagsVersion) - .detail("AlgoHeaderVersion", headerRef.algoHeaderVersion) + .detail("HeaderFlagsVersion", headerRef.flagsVersion()) + .detail("AlgoHeaderVersion", headerRef.algoHeaderVersion()) .detail("HeaderEncryptMode", flags.encryptMode) .detail("HeaderEncryptAuthTokenMode", flags.authTokenMode) .detail("HeaderEncryptAuthTokenAlgo", flags.authTokenAlgo) - .detail("TextDomainId", 
withAuth.cipherTextDetails.encryptDomainId) - .detail("TextBaseCipherId", withAuth.cipherTextDetails.baseCipherId) - .detail("TextSalt", withAuth.cipherTextDetails.salt) - .detail("HeaderDomainId", withAuth.cipherHeaderDetails.encryptDomainId) - .detail("HeaderBaseCipherId", withAuth.cipherHeaderDetails.baseCipherId) - .detail("HeaderSalt", withAuth.cipherHeaderDetails.salt); - - // induce encryption header corruption - headerVersion corrupted - BlobCipherEncryptHeaderRef corruptedHeaderRef = BlobCipherEncryptHeaderRef(headerRef); - corruptedHeaderRef.flagsVersion += 1; - try { - encryptedBuf = encryptor.encrypt(&orgData[0], bufLen, &headerRef, arena); - DecryptBlobCipherAes256Ctr decryptor(tCipherKeyKey, hCipherKey, &iv[0], BlobCipherMetrics::TEST); - decryptedBuf = decryptor.decrypt(encryptedBuf.begin(), bufLen, corruptedHeaderRef, arena); - ASSERT(false); // error expected - } catch (Error& e) { - if (e.code() != error_code_encrypt_header_metadata_mismatch) { - throw; - } - TraceEvent("ConfigurableEncryptionCorruptFlagsDone").detail("Mode", authAlgoStr); - } + .detail("TextDomainId", withAuth.v1.cipherTextDetails.encryptDomainId) + .detail("TextBaseCipherId", withAuth.v1.cipherTextDetails.baseCipherId) + .detail("TextSalt", withAuth.v1.cipherTextDetails.salt) + .detail("HeaderDomainId", withAuth.v1.cipherHeaderDetails.encryptDomainId) + .detail("HeaderBaseCipherId", withAuth.v1.cipherHeaderDetails.baseCipherId) + .detail("HeaderSalt", withAuth.v1.cipherHeaderDetails.salt); // induce encryption header corruption - encryptionMode corrupted - corruptedHeaderRef = BlobCipherEncryptHeaderRef(headerRef); + BlobCipherEncryptHeaderRef corruptedHeaderRef = BlobCipherEncryptHeaderRef(headerRef); BlobCipherEncryptHeaderFlagsV1 corruptedFlags = std::get(headerRef.flags); corruptedFlags.encryptMode += 1; corruptedHeaderRef.flags = corruptedFlags; @@ -2407,20 +2357,20 @@ TEST_CASE("/blobCipher") { testConfigurableEncryptionBlobCipherHeaderFlagsV1Ser(); 
testConfigurableEncryptionAesCtrNoAuthV1Ser(minDomainId); - testConfigurableEncryptionAesCtrWithAuthSer(minDomainId); - testConfigurableEncryptionAesCtrWithAuthSer(minDomainId); + testConfigurableEncryptionAesCtrWithAuthSer(minDomainId); + testConfigurableEncryptionAesCtrWithAuthSer(minDomainId); testConfigurableEncryptionHeaderNoAuthMode(minDomainId); - testConfigurableEncryptionHeaderSingleAuthMode(minDomainId); - testConfigurableEncryptionHeaderSingleAuthMode(minDomainId); + testConfigurableEncryptionHeaderSingleAuthMode(minDomainId); + testConfigurableEncryptionHeaderSingleAuthMode(minDomainId); testNoAuthMode(minDomainId); - testSingleAuthMode(minDomainId); - testSingleAuthMode(minDomainId); + testSingleAuthMode(minDomainId); + testSingleAuthMode(minDomainId); testConfigurableEncryptionNoAuthMode(minDomainId); - testConfigurableEncryptionSingleAuthMode(minDomainId); - testConfigurableEncryptionSingleAuthMode(minDomainId); + testConfigurableEncryptionSingleAuthMode(minDomainId); + testConfigurableEncryptionSingleAuthMode(minDomainId); testKeyCacheCleanup(minDomainId, maxDomainId); return Void(); diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index 07fc44ae2d..8f352a8b5c 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -253,12 +253,10 @@ void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys, void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys, const BlobCipherEncryptHeaderRef& headerRef, const StringRef& ivRef) { - headerRef.validateEncryptionHeaderDetails(BlobCipherDetails(eKeys.textCipherKey->getDomainId(), - eKeys.textCipherKey->getBaseCipherId(), - eKeys.textCipherKey->getSalt()), - BlobCipherDetails(eKeys.headerCipherKey->getDomainId(), - eKeys.headerCipherKey->getBaseCipherId(), - eKeys.headerCipherKey->getSalt()), + ASSERT(eKeys.textCipherKey.isValid()); + headerRef.validateEncryptionHeaderDetails(eKeys.textCipherKey->details(), + 
eKeys.headerCipherKey.isValid() ? eKeys.headerCipherKey->details() + : BlobCipherDetails(), ivRef); } diff --git a/fdbclient/include/fdbclient/BlobCipher.h b/fdbclient/include/fdbclient/BlobCipher.h index c82bcf5776..defaeb8642 100644 --- a/fdbclient/include/fdbclient/BlobCipher.h +++ b/fdbclient/include/fdbclient/BlobCipher.h @@ -23,6 +23,7 @@ #include "fdbrpc/Stats.h" +#include "fdbclient/Knobs.h" #include "flow/Arena.h" #include "flow/EncryptUtils.h" #include "flow/FastRef.h" @@ -164,6 +165,10 @@ struct BlobCipherDetails { // Random salt EncryptCipherRandomSalt salt{}; + static uint32_t getSize() { + return sizeof(EncryptCipherDomainId) + sizeof(EncryptCipherBaseKeyId) + sizeof(EncryptCipherRandomSalt); + } + BlobCipherDetails() {} BlobCipherDetails(const EncryptCipherDomainId& dId, const EncryptCipherBaseKeyId& bId, @@ -200,6 +205,15 @@ struct hash { }; } // namespace std +struct EncryptHeaderCipherDetails { + BlobCipherDetails textCipherDetails; + Optional headerCipherDetails; + + EncryptHeaderCipherDetails(const BlobCipherDetails& tCipherDetails) : textCipherDetails(tCipherDetails) {} + EncryptHeaderCipherDetails(const BlobCipherDetails& tCipherDetails, const BlobCipherDetails& hCipherDetails) + : textCipherDetails(tCipherDetails), headerCipherDetails(hCipherDetails) {} +}; + #pragma pack(push, 1) // exact fit - no padding // Why BinarySerialization instead of ObjectSerialization? 
@@ -211,9 +225,9 @@ struct hash { // ---------------------------------------------------------------------------------------------------------- // | S.No | ObjFlags | BinaryFlags | ObjectAlgo | BinaryAlgo | TotalObject | TotalBinary | // | ----------------- | ----------- | ------------ | ----------- | ---------- | ------------ | ------------ | -// | AesCtrNoAuth | 40 | 3 | 104 | 40 | 208 | 47 | -// | AesCtrHmacSha | 40 | 3 | 184 | 96 | 288 | 103 | -// | AesCtrAesCmac | 40 | 3 | 168 | 80 | 272 | 87 | +// | AesCtrNoAuth | 40 | 3 | 104 | 40 | 208 | 46 | +// | AesCtrHmacSha | 40 | 3 | 184 | 96 | 288 | 102 | +// | AesCtrAesCmac | 40 | 3 | 168 | 80 | 272 | 86 | // ---------------------------------------------------------------------------------------------------------- struct BlobCipherEncryptHeaderFlagsV1 { @@ -262,8 +276,10 @@ struct BlobCipherEncryptHeaderFlagsV1 { // 'encrypted buffer', compared to reading only encryptionHeader and ensuring its sanity; for instance: // backup-files. -template +template struct AesCtrWithAuthV1 { + using Self = AesCtrWithAuthV1; + // Serializable fields // Text cipher encryption information @@ -273,40 +289,96 @@ struct AesCtrWithAuthV1 { // Initialization vector uint8_t iv[AES_256_IV_LENGTH]; // Authentication token - uint8_t authToken[AuthTokenSize]; + uint8_t authToken[Params::authTokenSize]; - AesCtrWithAuthV1() {} + AesCtrWithAuthV1() = default; AesCtrWithAuthV1(const BlobCipherDetails& textDetails, const BlobCipherDetails& headerDetails, const uint8_t* ivBuf, - const int ivLen, - Arena& arena) + const int ivLen) : cipherTextDetails(textDetails), cipherHeaderDetails(headerDetails) { ASSERT_EQ(ivLen, AES_256_IV_LENGTH); memcpy(&iv[0], ivBuf, ivLen); - memset(&authToken[0], 0, AuthTokenSize); + memset(&authToken[0], 0, Params::authTokenSize); } - bool operator==(const AesCtrWithAuthV1& o) const { + bool operator==(const Self& o) const { return cipherHeaderDetails == o.cipherHeaderDetails && cipherTextDetails == o.cipherTextDetails && 
memcmp(&iv[0], &o.iv[0], AES_256_IV_LENGTH) == 0 && - memcmp(&authToken[0], &o.authToken[0], AuthTokenSize) == 0; + memcmp(&authToken[0], &o.authToken[0], Params::authTokenSize) == 0; } - static Standalone toStringRef(const AesCtrWithAuthV1& algoHeader, Arena& arena) { - BinaryWriter wr(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); - wr.serializeBytes(&algoHeader, sizeof(AesCtrWithAuthV1)); - return wr.toValue(arena); - } + static uint32_t getSize() { return BlobCipherDetails::getSize() * 2 + AES_256_IV_LENGTH + Params::authTokenSize; } template void serialize(Ar& ar) { serializer(ar, cipherTextDetails, cipherHeaderDetails); ar.serializeBytes(iv, AES_256_IV_LENGTH); - ar.serializeBytes(authToken, AuthTokenSize); + ar.serializeBytes(authToken, Params::authTokenSize); } }; +template +struct AesCtrWithAuth { + // Serializable fields + + // Algorithm header version + uint8_t version = 1; + // List of supported versions. + union { + AesCtrWithAuthV1 v1; + }; + + AesCtrWithAuth() { + // Only V1 is supported + ASSERT_EQ(1, Params::getDefaultHeaderVersion()); + } + + AesCtrWithAuth(AesCtrWithAuthV1& v) : v1(v) { + // Only V1 is supported + ASSERT_EQ(1, Params::getDefaultHeaderVersion()); + } + + static uint32_t getSize() { return AesCtrWithAuthV1::getSize() + 1; } + + static Standalone toStringRef(const AesCtrWithAuth& algoHeader) { + BinaryWriter wr(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); + wr << algoHeader; + return wr.toValue(); + } + + template + void serialize(Ar& ar) { + if (ar.isSerializing) { + ASSERT_EQ(1, version); + } + serializer(ar, version); + if (ar.isDeserializing && version != 1) { + TraceEvent(SevWarn, "BlobCipherEncryptHeaderUnsupportedAlgoHeaderVersion") + .detail("HeaderType", "AesCtrWith" + Params::authAlgoName()) + .detail("Version", version); + throw not_implemented(); + } + serializer(ar, v1); + } +}; + +struct AesCtrWithHmacParams { + static constexpr int authTokenSize = AUTH_TOKEN_HMAC_SHA_SIZE; + + static std::string 
authAlgoName() { return "Hmac"; } + static uint8_t getDefaultHeaderVersion() { return CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_HMAC_SHA_AUTH_VERSION; } +}; +using AesCtrWithHmac = AesCtrWithAuth; + +struct AesCtrWithCmacParams { + static constexpr int authTokenSize = AUTH_TOKEN_AES_CMAC_SIZE; + + static std::string authAlgoName() { return "Cmac"; } + static uint8_t getDefaultHeaderVersion() { return CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_AES_CMAC_AUTH_VERSION; } +}; +using AesCtrWithCmac = AesCtrWithAuth; + struct AesCtrNoAuthV1 { // Serializable fields @@ -315,8 +387,8 @@ struct AesCtrNoAuthV1 { // Initialization vector uint8_t iv[AES_256_IV_LENGTH]; - AesCtrNoAuthV1() {} - AesCtrNoAuthV1(const BlobCipherDetails& textDetails, const uint8_t* ivBuf, const int ivLen, Arena& arena) + AesCtrNoAuthV1() = default; + AesCtrNoAuthV1(const BlobCipherDetails& textDetails, const uint8_t* ivBuf, const int ivLen) : cipherTextDetails(textDetails) { ASSERT_EQ(ivLen, AES_256_IV_LENGTH); memcpy(&iv[0], ivBuf, ivLen); @@ -326,11 +398,7 @@ struct AesCtrNoAuthV1 { return cipherTextDetails == o.cipherTextDetails && memcmp(&iv[0], &o.iv[0], AES_256_IV_LENGTH) == 0; } - static Standalone toStringRef(const AesCtrNoAuthV1& algoHeader, Arena& arena) { - BinaryWriter wr(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); - wr.serializeBytes(&algoHeader, sizeof(AesCtrNoAuthV1)); - return wr.toValue(arena); - } + static uint32_t getSize() { return BlobCipherDetails::getSize() + AES_256_IV_LENGTH; } template void serialize(Ar& ar) { @@ -339,36 +407,57 @@ struct AesCtrNoAuthV1 { } }; -struct EncryptHeaderCipherDetails { - BlobCipherDetails textCipherDetails; - Optional headerCipherDetails; +struct AesCtrNoAuth { + // Serializable fields - EncryptHeaderCipherDetails(const BlobCipherDetails& tCipherDetails) : textCipherDetails(tCipherDetails) {} - EncryptHeaderCipherDetails(const BlobCipherDetails& tCipherDetails, const BlobCipherDetails& hCipherDetails) - : textCipherDetails(tCipherDetails), 
headerCipherDetails(hCipherDetails) {} + // Algorithm header version + uint8_t version = 1; + // List of supported versions. + union { + AesCtrNoAuthV1 v1; + }; + + AesCtrNoAuth() { + // Only V1 is supported + ASSERT_EQ(1, CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_NO_AUTH_VERSION); + } + + AesCtrNoAuth(AesCtrNoAuthV1& v) : v1(v) { + // Only V1 is supported + ASSERT_EQ(1, CLIENT_KNOBS->ENCRYPT_HEADER_AES_CTR_NO_AUTH_VERSION); + } + + static uint32_t getSize() { return AesCtrNoAuthV1::getSize() + 1; } + + static Standalone toStringRef(const AesCtrNoAuth& algoHeader) { + BinaryWriter wr(AssumeVersion(ProtocolVersion::withEncryptionAtRest())); + wr << algoHeader; + return wr.toValue(); + } + + template + void serialize(Ar& ar) { + if (ar.isSerializing) { + ASSERT_EQ(1, version); + } + serializer(ar, version); + if (ar.isDeserializing && version != 1) { + TraceEvent(SevWarn, "BlobCipherEncryptHeaderUnsupportedAlgoHeaderVersion") + .detail("HeaderType", "AesCtrNoAuth") + .detail("Version", version); + throw not_implemented(); + } + serializer(ar, v1); + } }; struct BlobCipherEncryptHeaderRef { // Serializable fields - - // HeaderFlags version tracker - uint16_t flagsVersion; - // Encryption algorithm header version tracker - uint16_t algoHeaderVersion; - - // The on-disk format doesn't store std::variant, currently "serializer" doesn't support std::variant, the - // (de)serialization code serializes the relevant BlobCipherEncryptHeader and AlgoHeader structs for a given - // 'flagVersion' and 'algoHeaderVersion'. Refer BlobCipherEncryptHeaderRef::serialize() for more details. 
std::variant flags; - std::variant, AesCtrWithAuthV1> - algoHeader; + std::variant algoHeader; - BlobCipherEncryptHeaderRef() - : flagsVersion(INVALID_ENCRYPT_HEADERS_FLAG_VERSION), - algoHeaderVersion(INVALID_ENCRYPT_HEADER_ALGO_HEADER_VERSION) {} - BlobCipherEncryptHeaderRef(const BlobCipherEncryptHeaderRef& src) - : flagsVersion(src.flagsVersion), algoHeaderVersion(src.algoHeaderVersion), flags(src.flags), - algoHeader(src.algoHeader) {} + BlobCipherEncryptHeaderRef() = default; + BlobCipherEncryptHeaderRef(const BlobCipherEncryptHeaderRef& src) = default; static BlobCipherEncryptHeaderRef fromStringRef(const StringRef& header) { return BinaryReader::fromStringRef( @@ -386,93 +475,15 @@ struct BlobCipherEncryptHeaderRef { const EncryptAuthTokenMode authMode, const EncryptAuthTokenAlgo authAlgo); + int flagsVersion() const { return flags.index() + 1; } + + int algoHeaderVersion() const { + return std::visit([&](auto& h) { return h.version; }, algoHeader); + } + template void serialize(Ar& ar) { - // TODO: once std::variant native (de)serialization support is added, the method would transform to much shorter - // implementaion - uint8_t encryptMode; - EncryptAuthTokenMode authMode; - EncryptAuthTokenAlgo authAlgo; - - serializer(ar, flagsVersion, algoHeaderVersion); - if (ar.isSerializing) { - if (flagsVersion != 1) { - TraceEvent(SevWarn, "BlobCipherEncryptHeaderUnsupportedFlagVersion").detail("Version", flagsVersion); - throw not_implemented(); - } - - BlobCipherEncryptHeaderFlagsV1 f = std::get(flags); - encryptMode = f.encryptMode; - authMode = (EncryptAuthTokenMode)f.authTokenMode; - authAlgo = (EncryptAuthTokenAlgo)f.authTokenAlgo; - serializer(ar, f); - - if (encryptMode != ENCRYPT_CIPHER_MODE_AES_256_CTR) { - TraceEvent(SevWarn, "BlobCipherEncryptHeaderUnsupportedEncryptMode").detail("Mode", encryptMode); - throw not_implemented(); - } - if (algoHeaderVersion != 1) { - TraceEvent(SevWarn, "BlobCipherEncryptHeaderUnsupportedAlgoHeaderVersion") - 
.detail("Version", algoHeaderVersion); - throw not_implemented(); - } - - if (authMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { - AesCtrNoAuthV1 noAuth = std::get(algoHeader); - serializer(ar, noAuth); - } else { - ASSERT_EQ(authMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); - if (authAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA) { - AesCtrWithAuthV1 hmacSha = - std::get>(algoHeader); - serializer(ar, hmacSha); - } else { - ASSERT_EQ(authAlgo, ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); - AesCtrWithAuthV1 aesCmac = - std::get>(algoHeader); - serializer(ar, aesCmac); - } - } - } else if (ar.isDeserializing) { - if (flagsVersion != 1) { - TraceEvent(SevWarn, "BlobCipherEncryptHeaderUnsupportedFlagVersion").detail("Version", flagsVersion); - throw not_implemented(); - } - BlobCipherEncryptHeaderFlagsV1 f; - serializer(ar, f); - this->flags = f; - encryptMode = f.encryptMode; - authMode = (EncryptAuthTokenMode)f.authTokenMode; - authAlgo = (EncryptAuthTokenAlgo)f.authTokenAlgo; - - if (encryptMode != ENCRYPT_CIPHER_MODE_AES_256_CTR) { - TraceEvent(SevWarn, "BlobCipherEncryptHeaderUnsupportedEncryptMode").detail("Mode", encryptMode); - throw not_implemented(); - } - if (algoHeaderVersion != 1) { - TraceEvent(SevWarn, "BlobCipherEncryptHeaderUnsupportedAlgoHeaderVersion") - .detail("Version", algoHeaderVersion); - throw not_implemented(); - } - - if (authMode == ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { - AesCtrNoAuthV1 noAuth; - serializer(ar, noAuth); - this->algoHeader = noAuth; - } else { - ASSERT_EQ(authMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); - if (authAlgo == ENCRYPT_HEADER_AUTH_TOKEN_ALGO_HMAC_SHA) { - AesCtrWithAuthV1 hmacSha; - serializer(ar, hmacSha); - this->algoHeader = hmacSha; - } else { - ASSERT_EQ(authAlgo, ENCRYPT_HEADER_AUTH_TOKEN_ALGO_AES_CMAC); - AesCtrWithAuthV1 aesCmac; - serializer(ar, aesCmac); - this->algoHeader = aesCmac; - } - } - } + serializer(ar, flags, algoHeader); } const uint8_t* getIV() const; @@ -885,22 +896,18 @@ public: private: 
void init(); - void updateEncryptHeader(const uint8_t*, const int, BlobCipherEncryptHeaderRef* headerRef, Arena& arena); - void updateEncryptHeaderFlagsV1(BlobCipherEncryptHeaderRef* headerRef, - BlobCipherEncryptHeaderFlagsV1* flags, - Arena& arena); + void updateEncryptHeader(const uint8_t*, const int, BlobCipherEncryptHeaderRef* headerRef); + void updateEncryptHeaderFlagsV1(BlobCipherEncryptHeaderRef* headerRef, BlobCipherEncryptHeaderFlagsV1* flags); void setCipherAlgoHeaderV1(const uint8_t*, const int, const BlobCipherEncryptHeaderFlagsV1&, - BlobCipherEncryptHeaderRef*, - Arena&); - void setCipherAlgoHeaderNoAuthV1(const BlobCipherEncryptHeaderFlagsV1&, BlobCipherEncryptHeaderRef*, Arena&); - template + BlobCipherEncryptHeaderRef*); + void setCipherAlgoHeaderNoAuthV1(const BlobCipherEncryptHeaderFlagsV1&, BlobCipherEncryptHeaderRef*); + template void setCipherAlgoHeaderWithAuthV1(const uint8_t*, const int, const BlobCipherEncryptHeaderFlagsV1&, - BlobCipherEncryptHeaderRef*, - Arena&); + BlobCipherEncryptHeaderRef*); EVP_CIPHER_CTX* ctx; Reference textCipherKey; @@ -952,26 +959,17 @@ private: const int, const BlobCipherEncryptHeaderFlagsV1&, const BlobCipherEncryptHeaderRef&); - template + template void validateAuthTokenV1(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeaderFlagsV1&, const BlobCipherEncryptHeaderRef& header); - void validateHeaderSingleAuthToken(const uint8_t* ciphertext, - const int ciphertextLen, - const BlobCipherEncryptHeaderRef& header, - Arena& arena); - void verifyEncryptHeaderMetadata(const BlobCipherEncryptHeader& header); - void verifyAuthTokens(const uint8_t* ciphertext, - const int ciphertextLen, - const BlobCipherEncryptHeader& header, - Arena& arena); + void verifyAuthTokens(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header); void verifyHeaderSingleAuthToken(const uint8_t* ciphertext, const int ciphertextLen, - const BlobCipherEncryptHeader& header, - Arena& 
arena); + const BlobCipherEncryptHeader& header); }; class HmacSha256DigestGen final : NonCopyable { diff --git a/fdbserver/workloads/EncryptionOps.actor.cpp b/fdbserver/workloads/EncryptionOps.actor.cpp index 7ccb5c1a72..c08593a1ce 100644 --- a/fdbserver/workloads/EncryptionOps.actor.cpp +++ b/fdbserver/workloads/EncryptionOps.actor.cpp @@ -334,7 +334,7 @@ struct EncryptionOpsWorkload : TestWorkload { } ASSERT_EQ(encrypted.size(), len); - ASSERT_EQ(headerRef->flagsVersion, CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); + ASSERT_EQ(headerRef->flagsVersion(), CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); ASSERT_NE(memcmp(encrypted.begin(), payload, len), 0); metrics->updateEncryptionTime(std::chrono::duration(end - start).count()); @@ -382,30 +382,28 @@ struct EncryptionOpsWorkload : TestWorkload { Reference orgCipherKey) { BlobCipherEncryptHeaderRef headerRef = BlobCipherEncryptHeaderRef::fromStringRef(headerStr); - ASSERT_EQ(headerRef.flagsVersion, CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); + ASSERT_EQ(headerRef.flagsVersion(), CLIENT_KNOBS->ENCRYPT_HEADER_FLAGS_VERSION); // validate flags BlobCipherDetails textCipherDetails; BlobCipherDetails headerCipherDetails; uint8_t iv[AES_256_IV_LENGTH]; - if (std::holds_alternative(headerRef.algoHeader)) { - AesCtrNoAuthV1 noAuth = std::get(headerRef.algoHeader); - memcpy(&iv[0], &noAuth.iv[0], AES_256_IV_LENGTH); - textCipherDetails = noAuth.cipherTextDetails; + if (std::holds_alternative(headerRef.algoHeader)) { + AesCtrNoAuth noAuth = std::get(headerRef.algoHeader); + memcpy(&iv[0], &noAuth.v1.iv[0], AES_256_IV_LENGTH); + textCipherDetails = noAuth.v1.cipherTextDetails; headerCipherDetails = BlobCipherDetails(); - } else if (std::holds_alternative>(headerRef.algoHeader)) { - AesCtrWithAuthV1 hmacSha = - std::get>(headerRef.algoHeader); - memcpy(&iv[0], &hmacSha.iv[0], AES_256_IV_LENGTH); - textCipherDetails = hmacSha.cipherTextDetails; - headerCipherDetails = hmacSha.cipherHeaderDetails; + } else if 
(std::holds_alternative(headerRef.algoHeader)) { + AesCtrWithHmac hmacSha = std::get(headerRef.algoHeader); + memcpy(&iv[0], &hmacSha.v1.iv[0], AES_256_IV_LENGTH); + textCipherDetails = hmacSha.v1.cipherTextDetails; + headerCipherDetails = hmacSha.v1.cipherHeaderDetails; } else { - ASSERT(std::holds_alternative>(headerRef.algoHeader)); - AesCtrWithAuthV1 aesCmac = - std::get>(headerRef.algoHeader); - memcpy(&iv[0], &aesCmac.iv[0], AES_256_IV_LENGTH); - textCipherDetails = aesCmac.cipherTextDetails; - headerCipherDetails = aesCmac.cipherHeaderDetails; + ASSERT(std::holds_alternative(headerRef.algoHeader)); + AesCtrWithCmac aesCmac = std::get(headerRef.algoHeader); + memcpy(&iv[0], &aesCmac.v1.iv[0], AES_256_IV_LENGTH); + textCipherDetails = aesCmac.v1.cipherTextDetails; + headerCipherDetails = aesCmac.v1.cipherHeaderDetails; } Reference cipherKey = getEncryptionKey(textCipherDetails.encryptDomainId, textCipherDetails.baseCipherId, textCipherDetails.salt); diff --git a/flow/include/flow/serialize.h b/flow/include/flow/serialize.h index fa01f08ea9..5ca4a16517 100644 --- a/flow/include/flow/serialize.h +++ b/flow/include/flow/serialize.h @@ -303,6 +303,37 @@ inline void load(Archive& ar, boost::container::flat_map& value) { ASSERT(ar.protocolVersion().isValid()); } +template +inline void save(Archive& ar, const std::variant value) { + ar << (uint8_t)value.index(); + std::visit([&](auto& inner) { ar << inner; }, value); + ASSERT(ar.protocolVersion().isValid()); +} + +namespace { +template +inline void loadVariant(Archive& ar, uint8_t index, Value& value) { + if (index == 0) { + Variant v; + ar >> v; + value = v; + } else if constexpr (sizeof...(Variants) > 0) { + loadVariant(ar, index - 1, value); + } else { + ASSERT(false); + } +} +} // anonymous namespace + +template +inline void load(Archive& ar, std::variant& value) { + uint8_t index; + ar >> index; + ASSERT(index < sizeof...(Variants)); + loadVariant, Variants...>(ar, index, value); + 
ASSERT(ar.protocolVersion().isValid()); +} + #ifdef _MSC_VER #pragma intrinsic(memcpy) #endif From 8c94b340cee93dce121806a1b8935dfee25b209f Mon Sep 17 00:00:00 2001 From: Ata E Husain Bohra Date: Wed, 15 Feb 2023 08:56:11 -0800 Subject: [PATCH 56/57] EaR: Update encryption methods to make 'cipherHeaderKey' optional (#9378) * EaR: Update encryption methods to make 'cipherHeaderKey' optional Description diff-1: Address review comments Major changes includes: 1. Update BlobCipher Encrypt/Decrypt classes to make 'headerCipher' optional 2. Update GetEncryptionCipherKeys actor methods to make 'headerCipherKey' optional 3. Update the usage across all encryption participant methods Testing BlobCipherUnitTest EnryptedBackupCorrecctness BlobGranuleCorrectness* devRunCorrectness - 100K --- fdbclient/BlobCipher.cpp | 61 +++++++++++------ fdbclient/FileBackupAgent.actor.cpp | 28 ++++---- fdbclient/include/fdbclient/BlobCipher.h | 15 +++-- .../fdbclient/GetEncryptCipherKeys.actor.h | 65 ++++++++++++++++--- fdbserver/BlobWorker.actor.cpp | 1 + flow/EncryptUtils.cpp | 4 ++ flow/include/flow/EncryptUtils.h | 1 + 7 files changed, 130 insertions(+), 45 deletions(-) diff --git a/fdbclient/BlobCipher.cpp b/fdbclient/BlobCipher.cpp index f51e0e318a..c73bca82e7 100644 --- a/fdbclient/BlobCipher.cpp +++ b/fdbclient/BlobCipher.cpp @@ -197,6 +197,20 @@ const EncryptHeaderCipherDetails BlobCipherEncryptHeaderRef::getCipherDetails() algoHeader); } +EncryptAuthTokenMode BlobCipherEncryptHeaderRef::getAuthTokenMode() const { + // TODO: Replace with "Overload visitor pattern" someday. 
+ return std::visit( + [](auto&& f) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return (EncryptAuthTokenMode)f.authTokenMode; + } else { + static_assert(always_false_v, "Unknown encryption flag header"); + } + }, + flags); +} + void BlobCipherEncryptHeaderRef::validateEncryptionHeaderDetails(const BlobCipherDetails& textCipherDetails, const BlobCipherDetails& headerCipherDetails, const StringRef& ivRef) const { @@ -759,12 +773,12 @@ int getEncryptAlgoHeaderVersion(const EncryptAuthTokenMode mode, const EncryptAu // EncryptBlobCipherAes265Ctr class methods EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKeyOpt, const uint8_t* cipherIV, const int ivLen, const EncryptAuthTokenMode mode, BlobCipherMetrics::UsageType usageType) - : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKeyOpt(hCipherKeyOpt), authTokenMode(mode), usageType(usageType) { ASSERT_EQ(ivLen, AES_256_IV_LENGTH); authTokenAlgo = getAuthTokenAlgoFromMode(authTokenMode); @@ -773,13 +787,13 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference } EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKeyOpt, const uint8_t* cipherIV, const int ivLen, const EncryptAuthTokenMode mode, const EncryptAuthTokenAlgo algo, BlobCipherMetrics::UsageType usageType) - : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKeyOpt(hCipherKeyOpt), authTokenMode(mode), authTokenAlgo(algo), usageType(usageType) { ASSERT_EQ(ivLen, AES_256_IV_LENGTH); memcpy(&iv[0], cipherIV, ivLen); @@ -787,10 +801,10 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference } 
EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKeyOpt, const EncryptAuthTokenMode mode, BlobCipherMetrics::UsageType usageType) - : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKeyOpt(hCipherKeyOpt), authTokenMode(mode), usageType(usageType) { authTokenAlgo = getAuthTokenAlgoFromMode(authTokenMode); deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); @@ -798,11 +812,11 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference } EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKeyOpt, const EncryptAuthTokenMode mode, const EncryptAuthTokenAlgo algo, BlobCipherMetrics::UsageType usageType) - : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), authTokenMode(mode), + : ctx(EVP_CIPHER_CTX_new()), textCipherKey(tCipherKey), headerCipherKeyOpt(hCipherKeyOpt), authTokenMode(mode), authTokenAlgo(algo), usageType(usageType) { deterministicRandom()->randomBytes(iv, AES_256_IV_LENGTH); init(); @@ -811,7 +825,7 @@ EncryptBlobCipherAes265Ctr::EncryptBlobCipherAes265Ctr(Reference void EncryptBlobCipherAes265Ctr::init() { ASSERT(textCipherKey.isValid()); if (FLOW_KNOBS->ENCRYPT_HEADER_AUTH_TOKEN_ENABLED) { - ASSERT(headerCipherKey.isValid()); + ASSERT(headerCipherKeyOpt.present() && headerCipherKeyOpt.get().isValid()); } if (!isEncryptHeaderAuthTokenDetailsValid(authTokenMode, authTokenAlgo)) { @@ -837,12 +851,15 @@ void EncryptBlobCipherAes265Ctr::setCipherAlgoHeaderWithAuthV1(const uint8_t* ci const int ciphertextLen, const BlobCipherEncryptHeaderFlagsV1& flags, BlobCipherEncryptHeaderRef* headerRef) { + ASSERT(headerCipherKeyOpt.present() && headerCipherKeyOpt.get().isValid()); + // Construct algorithm specific details except 'authToken', serialize the details 
into 'headerRef' to allow // authToken generation AesCtrWithAuthV1 algoHeader( BlobCipherDetails(textCipherKey->getDomainId(), textCipherKey->getBaseCipherId(), textCipherKey->getSalt()), - BlobCipherDetails( - headerCipherKey->getDomainId(), headerCipherKey->getBaseCipherId(), headerCipherKey->getSalt()), + BlobCipherDetails(headerCipherKeyOpt.get()->getDomainId(), + headerCipherKeyOpt.get()->getBaseCipherId(), + headerCipherKeyOpt.get()->getSalt()), iv, AES_256_IV_LENGTH); headerRef->algoHeader = AesCtrWithAuth(algoHeader); @@ -850,7 +867,7 @@ void EncryptBlobCipherAes265Ctr::setCipherAlgoHeaderWithAuthV1(const uint8_t* ci Standalone serialized = BlobCipherEncryptHeaderRef::toStringRef(*headerRef); uint8_t computed[Params::authTokenSize]; computeAuthToken({ { ciphertext, ciphertextLen }, { serialized.begin(), serialized.size() } }, - headerCipherKey->rawCipher(), + headerCipherKeyOpt.get()->rawCipher(), AES_256_KEY_LENGTH, &computed[0], (EncryptAuthTokenAlgo)flags.authTokenAlgo, @@ -1024,7 +1041,7 @@ Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte header->cipherTextDetails = textCipherKey->details(); // Populate header encryption-key details if (authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE) { - header->cipherHeaderDetails = headerCipherKey->details(); + header->cipherHeaderDetails = headerCipherKeyOpt.get()->details(); } else { header->cipherHeaderDetails.encryptDomainId = INVALID_ENCRYPT_DOMAIN_ID; header->cipherHeaderDetails.baseCipherId = INVALID_ENCRYPT_CIPHER_KEY_ID; @@ -1044,7 +1061,7 @@ Reference EncryptBlobCipherAes265Ctr::encrypt(const uint8_t* plainte computeAuthToken({ { ciphertext, bytes + finalBytes }, { reinterpret_cast(header), sizeof(BlobCipherEncryptHeader) } }, - headerCipherKey->rawCipher(), + headerCipherKeyOpt.get()->rawCipher(), AES_256_KEY_LENGTH, &header->singleAuthToken.authToken[0], (EncryptAuthTokenAlgo)header->flags.authTokenAlgo, @@ -1077,10 +1094,10 @@ 
EncryptBlobCipherAes265Ctr::~EncryptBlobCipherAes265Ctr() { // DecryptBlobCipherAes256Ctr class methods DecryptBlobCipherAes256Ctr::DecryptBlobCipherAes256Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKeyOpt, const uint8_t* iv, BlobCipherMetrics::UsageType usageType) - : ctx(EVP_CIPHER_CTX_new()), usageType(usageType), textCipherKey(tCipherKey), headerCipherKey(hCipherKey), + : ctx(EVP_CIPHER_CTX_new()), usageType(usageType), textCipherKey(tCipherKey), headerCipherKeyOpt(hCipherKeyOpt), authTokensValidationDone(false) { if (ctx == nullptr) { throw encrypt_ops_error(); @@ -1100,6 +1117,7 @@ void DecryptBlobCipherAes256Ctr::validateAuthTokenV1(const uint8_t* ciphertext, const BlobCipherEncryptHeaderRef& headerRef) { ASSERT_EQ(flags.encryptMode, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); ASSERT_LE(Params::authTokenSize, AUTH_TOKEN_MAX_SIZE); + ASSERT(headerCipherKeyOpt.present() && headerCipherKeyOpt.get().isValid()); Arena tmpArena; uint8_t persited[Params::authTokenSize]; @@ -1117,7 +1135,7 @@ void DecryptBlobCipherAes256Ctr::validateAuthTokenV1(const uint8_t* ciphertext, headerRefCopy.algoHeader = algoHeaderCopy; Standalone serializedHeader = BlobCipherEncryptHeaderRef::toStringRef(headerRefCopy); computeAuthToken({ { ciphertext, ciphertextLen }, { serializedHeader.begin(), serializedHeader.size() } }, - headerCipherKey->rawCipher(), + headerCipherKeyOpt.get()->rawCipher(), AES_256_KEY_LENGTH, &computed[0], (EncryptAuthTokenAlgo)flags.authTokenAlgo, @@ -1271,6 +1289,7 @@ StringRef DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphertext, void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciphertext, const int ciphertextLen, const BlobCipherEncryptHeader& header) { + ASSERT(headerCipherKeyOpt.present() && headerCipherKeyOpt.get().isValid()); // prepare the payload {cipherText + encryptionHeader} // ensure the 'authToken' is reset before computing the 'authentication token' BlobCipherEncryptHeader headerCopy; @@ 
-1281,7 +1300,7 @@ void DecryptBlobCipherAes256Ctr::verifyHeaderSingleAuthToken(const uint8_t* ciph uint8_t computed[AUTH_TOKEN_MAX_SIZE]; computeAuthToken({ { ciphertext, ciphertextLen }, { reinterpret_cast(&headerCopy), sizeof(BlobCipherEncryptHeader) } }, - headerCipherKey->rawCipher(), + headerCipherKeyOpt.get()->rawCipher(), AES_256_KEY_LENGTH, &computed[0], (EncryptAuthTokenAlgo)header.flags.authTokenAlgo, @@ -1344,7 +1363,7 @@ Reference DecryptBlobCipherAes256Ctr::decrypt(const uint8_t* ciphert verifyEncryptHeaderMetadata(header); if (header.flags.authTokenMode != EncryptAuthTokenMode::ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE && - !headerCipherKey.isValid()) { + (!headerCipherKeyOpt.present() || !headerCipherKeyOpt.get().isValid())) { TraceEvent(SevWarn, "BlobCipherDecryptInvalidHeaderCipherKey") .detail("AuthTokenMode", header.flags.authTokenMode); throw encrypt_ops_error(); @@ -1876,7 +1895,9 @@ void testConfigurableEncryptionHeaderNoAuthMode(const int minDomainId) { BlobCipherEncryptHeaderRef headerRef; encryptor.encrypt(&orgData[0], bufLen, &headerRef, arena); + ASSERT_EQ(headerRef.flagsVersion(), 1); BlobCipherEncryptHeaderFlagsV1 flags = std::get(headerRef.flags); + ASSERT_EQ(flags.authTokenMode, headerRef.getAuthTokenMode()); AesCtrNoAuth noAuth = std::get(headerRef.algoHeader); const uint8_t* headerIV = headerRef.getIV(); @@ -2156,7 +2177,9 @@ void testConfigurableEncryptionHeaderSingleAuthMode(int minDomainId) { BlobCipherEncryptHeaderRef headerRef; encryptor.encrypt(&orgData[0], bufLen, &headerRef, arena); + ASSERT_EQ(headerRef.flagsVersion(), 1); BlobCipherEncryptHeaderFlagsV1 flags = std::get(headerRef.flags); + ASSERT_EQ(flags.authTokenMode, headerRef.getAuthTokenMode()); AesCtrWithAuth algoHeader = std::get>(headerRef.algoHeader); const uint8_t* headerIV = headerRef.getIV(); diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index ecb4480d3b..3f676f769e 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ 
b/fdbclient/FileBackupAgent.actor.cpp @@ -505,7 +505,7 @@ public: struct SnapshotFileBackupEncryptionKeys { Reference textCipherKey; - Reference headerCipherKey; + Optional> headerCipherKey; StringRef ivRef; }; @@ -575,17 +575,18 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { wPtr = mutateString(buffer); } - static void validateEncryptionHeader(Reference headerCipherKey, + static void validateEncryptionHeader(Optional> headerCipherKey, Reference textCipherKey, BlobCipherEncryptHeader& header) { // Validate encryption header 'cipherHeader' details - if (header.cipherHeaderDetails.isValid() && header.cipherHeaderDetails != headerCipherKey->details()) { + if (header.cipherHeaderDetails.isValid() && + (!headerCipherKey.present() || header.cipherHeaderDetails != headerCipherKey.get()->details())) { TraceEvent(SevWarn, "EncryptionHeader_CipherHeaderMismatch") - .detail("HeaderDomainId", headerCipherKey->getDomainId()) + .detail("HeaderDomainId", headerCipherKey.get()->getDomainId()) .detail("ExpectedHeaderDomainId", header.cipherHeaderDetails.encryptDomainId) - .detail("HeaderBaseCipherId", headerCipherKey->getBaseCipherId()) + .detail("HeaderBaseCipherId", headerCipherKey.get()->getBaseCipherId()) .detail("ExpectedHeaderBaseCipherId", header.cipherHeaderDetails.baseCipherId) - .detail("HeaderSalt", headerCipherKey->getSalt()) + .detail("HeaderSalt", headerCipherKey.get()->getSalt()) .detail("ExpectedHeaderSalt", header.cipherHeaderDetails.salt); throw encrypt_header_metadata_mismatch(); } @@ -633,11 +634,14 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { } ACTOR static Future encrypt(EncryptedRangeFileWriter* self) { - ASSERT(self->cipherKeys.headerCipherKey.isValid() && self->cipherKeys.textCipherKey.isValid()); + // TODO: HeaderCipher key not needed for 'no authentication encryption' + ASSERT(self->cipherKeys.headerCipherKey.present() && self->cipherKeys.headerCipherKey.get().isValid() && + self->cipherKeys.textCipherKey.isValid()); // 
Ensure that the keys we got are still valid before flushing the block - if (self->cipherKeys.headerCipherKey->isExpired() || self->cipherKeys.headerCipherKey->needsRefresh()) { + if (self->cipherKeys.headerCipherKey.get()->isExpired() || + self->cipherKeys.headerCipherKey.get()->needsRefresh()) { Reference cipherKey = - wait(refreshKey(self, self->cipherKeys.headerCipherKey->getDomainId())); + wait(refreshKey(self, self->cipherKeys.headerCipherKey.get()->getDomainId())); self->cipherKeys.headerCipherKey = cipherKey; } if (self->cipherKeys.textCipherKey->isExpired() || self->cipherKeys.textCipherKey->needsRefresh()) { @@ -847,7 +851,8 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { // Start a new block if needed, then write the key and value ACTOR static Future writeKV_impl(EncryptedRangeFileWriter* self, Key k, Value v) { - if (!self->cipherKeys.headerCipherKey.isValid() || !self->cipherKeys.textCipherKey.isValid()) { + if (!self->cipherKeys.headerCipherKey.present() || !self->cipherKeys.headerCipherKey.get().isValid() || + !self->cipherKeys.textCipherKey.isValid()) { wait(updateEncryptionKeysCtx(self, k)); } state int toWrite = sizeof(int32_t) + k.size() + sizeof(int32_t) + v.size(); @@ -869,7 +874,8 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter { ACTOR static Future writeKey_impl(EncryptedRangeFileWriter* self, Key k) { // TODO (Nim): Is it possible to write empty begin and end keys? 
if (k.size() > 0 && - (!self->cipherKeys.headerCipherKey.isValid() || !self->cipherKeys.textCipherKey.isValid())) { + (!self->cipherKeys.headerCipherKey.present() || !self->cipherKeys.headerCipherKey.get().isValid() || + !self->cipherKeys.textCipherKey.isValid())) { wait(updateEncryptionKeysCtx(self, k)); } // Need to account for extra "empty" value being written in the case of crossing tenant boundaries diff --git a/fdbclient/include/fdbclient/BlobCipher.h b/fdbclient/include/fdbclient/BlobCipher.h index defaeb8642..139643b680 100644 --- a/fdbclient/include/fdbclient/BlobCipher.h +++ b/fdbclient/include/fdbclient/BlobCipher.h @@ -488,6 +488,7 @@ struct BlobCipherEncryptHeaderRef { const uint8_t* getIV() const; const EncryptHeaderCipherDetails getCipherDetails() const; + EncryptAuthTokenMode getAuthTokenMode() const; void validateEncryptionHeaderDetails(const BlobCipherDetails& textCipherDetails, const BlobCipherDetails& headerCipherDetails, @@ -864,24 +865,24 @@ public: static constexpr uint8_t ENCRYPT_HEADER_VERSION = 1; EncryptBlobCipherAes265Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKey, const uint8_t* iv, const int ivLen, const EncryptAuthTokenMode mode, BlobCipherMetrics::UsageType usageType); EncryptBlobCipherAes265Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKey, const uint8_t* iv, const int ivLen, const EncryptAuthTokenMode mode, const EncryptAuthTokenAlgo algo, BlobCipherMetrics::UsageType usageType); EncryptBlobCipherAes265Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKey, const EncryptAuthTokenMode mode, BlobCipherMetrics::UsageType usageType); EncryptBlobCipherAes265Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKey, const EncryptAuthTokenMode mode, const EncryptAuthTokenAlgo algo, BlobCipherMetrics::UsageType usageType); @@ -911,7 +912,7 @@ private: EVP_CIPHER_CTX* ctx; Reference textCipherKey; - Reference headerCipherKey; + Optional> headerCipherKeyOpt; 
EncryptAuthTokenMode authTokenMode; uint8_t iv[AES_256_IV_LENGTH]; BlobCipherMetrics::UsageType usageType; @@ -924,7 +925,7 @@ private: class DecryptBlobCipherAes256Ctr final : NonCopyable, public ReferenceCounted { public: DecryptBlobCipherAes256Ctr(Reference tCipherKey, - Reference hCipherKey, + Optional> hCipherKey, const uint8_t* iv, BlobCipherMetrics::UsageType usageType); ~DecryptBlobCipherAes256Ctr(); @@ -942,7 +943,7 @@ private: EVP_CIPHER_CTX* ctx; BlobCipherMetrics::UsageType usageType; Reference textCipherKey; - Reference headerCipherKey; + Optional> headerCipherKeyOpt; bool authTokensValidationDone; void validateEncryptHeader(const uint8_t*, diff --git a/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h index 6e51c82394..ebecec3b69 100644 --- a/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h +++ b/fdbclient/include/fdbclient/GetEncryptCipherKeys.actor.h @@ -34,6 +34,7 @@ #include "flow/Knobs.h" #include "flow/IRandom.h" +#include #include #include @@ -318,23 +319,71 @@ ACTOR template Future getEncryptCipherKeys(Reference const> db, BlobCipherEncryptHeader header, BlobCipherMetrics::UsageType usageType) { + state bool authenticatedEncryption = header.flags.authTokenMode != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE; + + ASSERT(header.cipherTextDetails.isValid()); + ASSERT(!authenticatedEncryption || header.cipherHeaderDetails.isValid()); + std::unordered_set cipherDetails{ header.cipherTextDetails }; - if (header.cipherHeaderDetails.isValid()) { + if (authenticatedEncryption) { cipherDetails.insert(header.cipherHeaderDetails); } + + std::unordered_map> cipherKeys = + wait(getEncryptCipherKeys(db, cipherDetails, usageType)); + + TextAndHeaderCipherKeys result; + auto setCipherKey = [&](const BlobCipherDetails& details, TextAndHeaderCipherKeys& result) { + ASSERT(details.isValid()); + auto iter = cipherKeys.find(details); + ASSERT(iter != cipherKeys.end() && iter->second.isValid()); + 
isEncryptHeaderDomain(details.encryptDomainId) ? result.cipherHeaderKey = iter->second + : result.cipherTextKey = iter->second; + }; + setCipherKey(header.cipherTextDetails, result); + if (authenticatedEncryption) { + setCipherKey(header.cipherHeaderDetails, result); + } + ASSERT(result.cipherTextKey.isValid() && (!authenticatedEncryption || result.cipherHeaderKey.isValid())); + + return result; +} + +ACTOR template +Future getEncryptCipherKeys(Reference const> db, + BlobCipherEncryptHeaderRef header, + BlobCipherMetrics::UsageType usageType) { + ASSERT(CLIENT_KNOBS->ENABLE_CONFIGURABLE_ENCRYPTION); + + state bool authenticatedEncryption = header.getAuthTokenMode() != ENCRYPT_HEADER_AUTH_TOKEN_MODE_NONE; + state EncryptHeaderCipherDetails details = header.getCipherDetails(); + + ASSERT(details.textCipherDetails.isValid()); + ASSERT(!authenticatedEncryption || + (details.headerCipherDetails.present() && details.headerCipherDetails.get().isValid())); + + std::unordered_set cipherDetails{ details.textCipherDetails }; + if (authenticatedEncryption) { + cipherDetails.insert(details.headerCipherDetails.get()); + } + std::unordered_map> cipherKeys = wait(getEncryptCipherKeys(db, cipherDetails, usageType)); TextAndHeaderCipherKeys result; - auto setCipherKey = [&](const BlobCipherDetails& details, Reference& cipherKey) { - if (!details.isValid()) { - return; - } + + auto setCipherKey = [&](const BlobCipherDetails& details, TextAndHeaderCipherKeys& result) { + ASSERT(details.isValid()); auto iter = cipherKeys.find(details); ASSERT(iter != cipherKeys.end() && iter->second.isValid()); - cipherKey = iter->second; + isEncryptHeaderDomain(details.encryptDomainId) ? 
result.cipherHeaderKey = iter->second + : result.cipherTextKey = iter->second; }; - setCipherKey(header.cipherTextDetails, result.cipherTextKey); - setCipherKey(header.cipherHeaderDetails, result.cipherHeaderKey); + setCipherKey(details.textCipherDetails, result); + if (authenticatedEncryption) { + setCipherKey(details.headerCipherDetails.get(), result); + } + ASSERT(result.cipherTextKey.isValid() && (!authenticatedEncryption || result.cipherHeaderKey.isValid())); + return result; } diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index c3cacedd69..796120a41c 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -476,6 +476,7 @@ ACTOR Future getLatestGranuleCipherKeys(ReferencedbInfo, BlobCipherMetrics::BLOB_GRANULE)); + ASSERT(systemCipherKeys.cipherHeaderKey.isValid()); cipherKeysCtx.headerCipherKey = BlobGranuleCipherKey::fromBlobCipherKey(systemCipherKeys.cipherHeaderKey, *arena); cipherKeysCtx.ivRef = makeString(AES_256_IV_LENGTH, *arena); diff --git a/flow/EncryptUtils.cpp b/flow/EncryptUtils.cpp index 7a3c7e3719..c721b785bb 100644 --- a/flow/EncryptUtils.cpp +++ b/flow/EncryptUtils.cpp @@ -131,4 +131,8 @@ EncryptAuthTokenAlgo getRandomAuthTokenAlgo() { bool isReservedEncryptDomain(EncryptCipherDomainId domainId) { return domainId == SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID || domainId == ENCRYPT_HEADER_DOMAIN_ID || domainId == FDB_DEFAULT_ENCRYPT_DOMAIN_ID; +} + +bool isEncryptHeaderDomain(EncryptCipherDomainId domainId) { + return domainId == ENCRYPT_HEADER_DOMAIN_ID; } \ No newline at end of file diff --git a/flow/include/flow/EncryptUtils.h b/flow/include/flow/EncryptUtils.h index bcf5133d12..2ee299c120 100644 --- a/flow/include/flow/EncryptUtils.h +++ b/flow/include/flow/EncryptUtils.h @@ -121,5 +121,6 @@ std::string getEncryptDbgTraceKeyWithTS(std::string_view prefix, int getEncryptHeaderAuthTokenSize(int algo); bool isReservedEncryptDomain(EncryptCipherDomainId domainId); +bool 
isEncryptHeaderDomain(EncryptCipherDomainId domainId); #endif From afc25035f8e56de66b79435a32fc943a7dfcac99 Mon Sep 17 00:00:00 2001 From: Vaidas Gasiunas Date: Wed, 15 Feb 2023 18:00:53 +0100 Subject: [PATCH 57/57] Client status report API in Java and python bindings (#9366) * get_client_status: add to Java bindings * Add get_client_status in python binding * Make python unit tests available as ctest * Fix file name in the copyright header * Fix library path name for mac * get_client_status: Minor changes in Java & Python bindings addressing review comments * Rename fdb_python_unit_tests to unit_tests --- bindings/java/fdbJNI.cpp | 14 + .../GetClientStatusIntegrationTest.java | 48 +++ .../main/com/apple/foundationdb/Database.java | 17 + .../com/apple/foundationdb/FDBDatabase.java | 12 +- bindings/java/src/tests.cmake | 1 + bindings/python/CMakeLists.txt | 7 + bindings/python/fdb/impl.py | 9 +- bindings/python/tests/tester.py | 277 ++++------------ bindings/python/tests/unit_tests.py | 296 ++++++++++++++++++ cmake/AddFdbTest.cmake | 7 + 10 files changed, 473 insertions(+), 215 deletions(-) create mode 100644 bindings/java/src/integration/com/apple/foundationdb/GetClientStatusIntegrationTest.java create mode 100644 bindings/python/tests/unit_tests.py diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index 7cea3499fc..8be5d629e0 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -1044,6 +1044,20 @@ JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1verify return (jlong)f; } +JNIEXPORT jlong JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getClientStatus(JNIEnv* jenv, + jobject, + jlong dbPtr) { + if (!dbPtr) { + throwParamNotNull(jenv); + return 0; + } + + FDBDatabase* database = (FDBDatabase*)dbPtr; + + FDBFuture* f = fdb_database_get_client_status(database); + return (jlong)f; +} + JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv, jobject, jint predicate, 
diff --git a/bindings/java/src/integration/com/apple/foundationdb/GetClientStatusIntegrationTest.java b/bindings/java/src/integration/com/apple/foundationdb/GetClientStatusIntegrationTest.java new file mode 100644 index 0000000000..a8722b9ba2 --- /dev/null +++ b/bindings/java/src/integration/com/apple/foundationdb/GetClientStatusIntegrationTest.java @@ -0,0 +1,48 @@ +/* + * GetClientStatusIntegrationTest.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2023 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.apple.foundationdb; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Integration tests around Range Queries. This requires a running FDB instance to work properly; + * all tests will be skipped if it can't connect to a running instance relatively quickly. 
+ */ +class GetClientStatusIntegrationTest { + public static final int API_VERSION = 720; + private static final FDB fdb = FDB.selectAPIVersion(API_VERSION); + + @Test + public void clientStatusIsHealthy() throws Exception { + try (Database db = fdb.open()) { + // Run a simple transaction to make sure the database is fully initialized + db.run(tr -> { + return tr.getReadVersion(); + }); + + // Here we just check if a meaningful client report status is returned + // Different report attributes and error cases are covered by C API tests + String statusStr = new String(db.getClientStatus().join()); + Assertions.assertTrue(statusStr.contains("\"Healthy\":true"), + String.format("Healthy:true not found in client status: %s", statusStr)); + } + } +} diff --git a/bindings/java/src/main/com/apple/foundationdb/Database.java b/bindings/java/src/main/com/apple/foundationdb/Database.java index 725ae0f7d5..cf5527b206 100644 --- a/bindings/java/src/main/com/apple/foundationdb/Database.java +++ b/bindings/java/src/main/com/apple/foundationdb/Database.java @@ -507,4 +507,21 @@ public interface Database extends AutoCloseable, TransactionContext { */ @Override void close(); + + /** + * Returns client-side status information + * + * @return a {@code CompletableFuture} containing a JSON string with client status health information + */ + default CompletableFuture getClientStatus() { + return getClientStatus(getExecutor()); + } + + /** + * Returns client-side status information + * + * @param e the {@link Executor} to use for asynchronous callbacks + * @return a {@code CompletableFuture} containing a JSON string with client status health information + */ + CompletableFuture getClientStatus(Executor e); } diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java index 98c001a1b0..3ba848f330 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java +++ 
b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java @@ -27,7 +27,6 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import com.apple.foundationdb.async.AsyncUtil; -import com.apple.foundationdb.tuple.ByteArrayUtil; import com.apple.foundationdb.tuple.Tuple; class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsumer { @@ -270,6 +269,16 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume Database_dispose(cPtr); } + @Override + public CompletableFuture getClientStatus(Executor e) { + pointerReadLock.lock(); + try { + return new FutureKey(Database_getClientStatus(getPtr()), e, eventKeeper); + } finally { + pointerReadLock.unlock(); + } + } + private native long Database_openTenant(long cPtr, byte[] tenantName); private native long Database_createTransaction(long cPtr); private native void Database_dispose(long cPtr); @@ -281,4 +290,5 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume private native long Database_unblobbifyRange(long cPtr, byte[] beginKey, byte[] endKey); private native long Database_listBlobbifiedRanges(long cPtr, byte[] beginKey, byte[] endKey, int rangeLimit); private native long Database_verifyBlobRange(long cPtr, byte[] beginKey, byte[] endKey, long version); + private native long Database_getClientStatus(long cPtr); } \ No newline at end of file diff --git a/bindings/java/src/tests.cmake b/bindings/java/src/tests.cmake index 375f88053b..4da415ecbf 100644 --- a/bindings/java/src/tests.cmake +++ b/bindings/java/src/tests.cmake @@ -54,6 +54,7 @@ set(JAVA_INTEGRATION_TESTS src/integration/com/apple/foundationdb/RepeatableReadMultiThreadClientTest.java src/integration/com/apple/foundationdb/MappedRangeQueryIntegrationTest.java src/integration/com/apple/foundationdb/BlobGranuleIntegrationTest.java + src/integration/com/apple/foundationdb/GetClientStatusIntegrationTest.java ) # Resources that are used in 
integration testing, but are not explicitly test files (JUnit rules, diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index af281a7405..84dcfd75f1 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -75,3 +75,10 @@ add_custom_command(OUTPUT ${package_file} add_custom_target(python_package DEPENDS ${package_file}) add_dependencies(python_package python_binding) add_dependencies(packages python_package) + +add_fdbclient_test( + NAME python_unit_tests + COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/tests/unit_tests.py + --cluster-file @CLUSTER_FILE@ --verbose + DISABLE_LOG_DUMP +) diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index d51411bac2..2a08a5960f 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -1326,6 +1326,9 @@ class Database(_TransactionCreator): self.capi.fdb_database_create_transaction(self.dpointer, ctypes.byref(pointer)) return Transaction(pointer.value, self) + def get_client_status(self): + return Key(self.capi.fdb_database_get_client_status(self.dpointer)) + class Tenant(_TransactionCreator): def __init__(self, tpointer): @@ -1456,7 +1459,7 @@ def check_error_code(code, func, arguments): return None -if sys.maxsize <= 2 ** 32: +if sys.maxsize <= 2**32: raise Exception("FoundationDB API requires a 64-bit python interpreter!") if platform.system() == "Windows": capi_name = "fdb_c.dll" @@ -1710,6 +1713,9 @@ def init_c_api(): _capi.fdb_database_set_option.restype = ctypes.c_int _capi.fdb_database_set_option.errcheck = check_error_code + _capi.fdb_database_get_client_status.argtypes = [ctypes.c_void_p] + _capi.fdb_database_get_client_status.restype = ctypes.c_void_p + _capi.fdb_tenant_destroy.argtypes = [ctypes.c_void_p] _capi.fdb_tenant_destroy.restype = None @@ -1891,7 +1897,6 @@ if hasattr(ctypes.pythonapi, "Py_IncRef"): def _unpin_callback(cb): ctypes.pythonapi.Py_DecRef(ctypes.py_object(cb)) - else: _active_callbacks = set() _pin_callback = 
_active_callbacks.add diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 92d317fea7..a856727b47 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -19,37 +19,24 @@ # limitations under the License. # - -import ctypes -import math -import sys -import os -import struct -import threading -import random -import time -import traceback import json +import math +import os +import random +import struct +import sys +import threading -sys.path[:0] = [os.path.join(os.path.dirname(__file__), '..')] +sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..")] import fdb + fdb.api_version(int(sys.argv[2])) +import fdb.tuple +from directory_extension import DirectoryExtension from fdb import six from fdb.impl import strinc -import fdb.tuple - -from directory_extension import DirectoryExtension - -from cancellation_timeout_tests import test_timeouts -from cancellation_timeout_tests import test_db_timeouts -from cancellation_timeout_tests import test_cancellation -from cancellation_timeout_tests import test_retry_limits -from cancellation_timeout_tests import test_db_retry_limits -from cancellation_timeout_tests import test_combinations - -from size_limit_tests import test_size_limit_option, test_get_approximate_size -from tenant_tests import test_tenants +from unit_tests import run_unit_tests random.seed(0) @@ -92,13 +79,16 @@ class Stack: if isinstance(raw[i][1], fdb.Future): try: val = raw[i][1].wait() - if val is None or (hasattr(val, 'present') and not val.present()): - raw[i] = (raw[i][0], b'RESULT_NOT_PRESENT') + if val is None or (hasattr(val, "present") and not val.present()): + raw[i] = (raw[i][0], b"RESULT_NOT_PRESENT") else: raw[i] = (raw[i][0], val) except fdb.FDBError as e: # print('ERROR: %r' % e) - raw[i] = (raw[i][0], fdb.tuple.pack((b'ERROR', str(e.code).encode('ascii')))) + raw[i] = ( + raw[i][0], + fdb.tuple.pack((b"ERROR", str(e.code).encode("ascii"))), + ) if count is None: if with_idx: @@ 
-113,7 +103,9 @@ class Stack: class Instruction: - def __init__(self, tr, stack, op, index, isDatabase=False, isTenant=False, isSnapshot=False): + def __init__( + self, tr, stack, op, index, isDatabase=False, isTenant=False, isSnapshot=False + ): self.tr = tr self.stack = stack self.op = op @@ -129,151 +121,6 @@ class Instruction: self.stack.push(self.index, val) -def test_fdb_transactional_generator(db): - try: - @fdb.transactional - def function_that_yields(tr): - yield 0 - assert fdb.get_api_version() < 630, "Pre-6.3, a decorator may wrap a function that yields" - except ValueError: - assert fdb.get_api_version() >= 630, "Post-6.3, a decorator should throw if wrapped function yields" - - -def test_fdb_transactional_returns_generator(db): - try: - def function_that_yields(tr): - yield 0 - - @fdb.transactional - def function_that_returns(tr): - return function_that_yields(tr) - function_that_returns() - assert fdb.get_api_version() < 630, "Pre-6.3, returning a generator is allowed" - except ValueError: - assert fdb.get_api_version() >= 630, "Post-6.3, returning a generator should throw" - - -def test_db_options(db): - db.options.set_location_cache_size(100001) - db.options.set_max_watches(100001) - db.options.set_datacenter_id("dc_id") - db.options.set_machine_id("machine_id") - db.options.set_snapshot_ryw_enable() - db.options.set_snapshot_ryw_disable() - db.options.set_transaction_logging_max_field_length(1000) - db.options.set_transaction_timeout(100000) - db.options.set_transaction_timeout(0) - db.options.set_transaction_timeout(0) - db.options.set_transaction_max_retry_delay(100) - db.options.set_transaction_size_limit(100000) - db.options.set_transaction_retry_limit(10) - db.options.set_transaction_retry_limit(-1) - db.options.set_transaction_causal_read_risky() - db.options.set_transaction_include_port_in_address() - - -@fdb.transactional -def test_options(tr): - tr.options.set_priority_system_immediate() - tr.options.set_priority_batch() - 
tr.options.set_causal_read_risky() - tr.options.set_causal_write_risky() - tr.options.set_read_your_writes_disable() - tr.options.set_read_system_keys() - tr.options.set_access_system_keys() - tr.options.set_transaction_logging_max_field_length(1000) - tr.options.set_timeout(60 * 1000) - tr.options.set_retry_limit(50) - tr.options.set_max_retry_delay(100) - tr.options.set_used_during_commit_protection_disable() - tr.options.set_debug_transaction_identifier('my_transaction') - tr.options.set_log_transaction() - tr.options.set_read_lock_aware() - tr.options.set_lock_aware() - tr.options.set_include_port_in_address() - - tr.get(b'\xff').wait() - - -def check_watches(db, watches, expected): - for i, watch in enumerate(watches): - if watch.is_ready() or expected: - try: - watch.wait() - if not expected: - assert False, "Watch %d is ready" % i - except fdb.FDBError as e: - tr = db.create_transaction() - tr.on_error(e).wait() - return False - - return True - - -def test_watches(db): - while True: - db[b'w0'] = b'0' - db[b'w3'] = b'3' - - watches = [None] - - @fdb.transactional - def txn1(tr): - watches[0] = tr.watch(b'w0') - tr.set(b'w0', b'0') - assert not watches[0].is_ready() - - txn1(db) - - watches.append(db.clear_and_watch(b'w1')) - watches.append(db.set_and_watch(b'w2', b'2')) - watches.append(db.get_and_watch(b'w3')) - - assert watches[3][0] == b'3' - watches[3] = watches[3][1] - - time.sleep(1) - - if not check_watches(db, watches, False): - continue - - del db[b'w1'] - - time.sleep(5) - - if not check_watches(db, watches, False): - continue - - db[b'w0'] = b'a' - db[b'w1'] = b'b' - del db[b'w2'] - db.bit_xor(b'w3', b'\xff\xff') - - if check_watches(db, watches, True): - return - - -@fdb.transactional -def test_locality(tr): - tr.options.set_timeout(60 * 1000) - tr.options.set_read_system_keys() # We do this because the last shard (for now, someday the last N shards) is in the /FF/ keyspace - - # This isn't strictly transactional, thought we expect it to be given 
the size of our database - boundary_keys = list(fdb.locality.get_boundary_keys(tr, b'', b'\xff\xff')) + [b'\xff\xff'] - end_keys = [tr.get_key(fdb.KeySelector.last_less_than(k)) for k in boundary_keys[1:]] - - start_addresses = [fdb.locality.get_addresses_for_key(tr, k) for k in boundary_keys[:-1]] - end_addresses = [fdb.locality.get_addresses_for_key(tr, k) for k in end_keys] - - if [set(s.wait()) for s in start_addresses] != [set(e.wait()) for e in end_addresses]: - raise Exception("Locality not internally consistent.") - - -def test_predicates(): - assert fdb.predicates.is_retryable(fdb.FDBError(1020)) - assert not fdb.predicates.is_retryable(fdb.FDBError(10)) - - class Tester: tr_map = {} tr_map_lock = threading.RLock() @@ -339,9 +186,9 @@ class Tester: # if op != "PUSH" and op != "SWAP": # print("%d. Instruction is %s" % (idx, op)) - isDatabase = op.endswith(six.u('_DATABASE')) - isTenant = op.endswith(six.u('_TENANT')) - isSnapshot = op.endswith(six.u('_SNAPSHOT')) + isDatabase = op.endswith(six.u("_DATABASE")) + isTenant = op.endswith(six.u("_TENANT")) + isSnapshot = op.endswith(six.u("_SNAPSHOT")) if isDatabase: op = op[:-9] @@ -355,7 +202,9 @@ class Tester: else: obj = self.current_transaction() - inst = Instruction(obj, self.stack, op, idx, isDatabase, isTenant, isSnapshot) + inst = Instruction( + obj, self.stack, op, idx, isDatabase, isTenant, isSnapshot + ) try: if inst.op == six.u("PUSH"): @@ -395,7 +244,7 @@ class Tester: f = obj.__getitem__(key) if f == None: - inst.push(b'RESULT_NOT_PRESENT') + inst.push(b"RESULT_NOT_PRESENT") else: inst.push(f) elif inst.op == six.u("GET_ESTIMATED_RANGE_SIZE"): @@ -429,9 +278,22 @@ class Tester: self.push_range(inst, r) elif inst.op == six.u("GET_RANGE_STARTS_WITH"): prefix, limit, reverse, mode = inst.pop(4) - self.push_range(inst, obj.get_range_startswith(prefix, limit, reverse, mode)) + self.push_range( + inst, obj.get_range_startswith(prefix, limit, reverse, mode) + ) elif inst.op == 
six.u("GET_RANGE_SELECTOR"): - begin_key, begin_or_equal, begin_offset, end_key, end_or_equal, end_offset, limit, reverse, mode, prefix = inst.pop(10) + ( + begin_key, + begin_or_equal, + begin_offset, + end_key, + end_or_equal, + end_offset, + limit, + reverse, + mode, + prefix, + ) = inst.pop(10) beginSel = fdb.KeySelector(begin_key, begin_or_equal, begin_offset) endSel = fdb.KeySelector(end_key, end_or_equal, end_offset) if limit == 0 and mode == -1 and random.random() < 0.5: @@ -534,11 +396,16 @@ class Tester: prefix = inst.pop() count = inst.pop() items = inst.pop(count) - if not fdb.tuple.has_incomplete_versionstamp(items) and random.random() < 0.5: + if ( + not fdb.tuple.has_incomplete_versionstamp(items) + and random.random() < 0.5 + ): inst.push(b"ERROR: NONE") else: try: - packed = fdb.tuple.pack_with_versionstamp(tuple(items), prefix=prefix) + packed = fdb.tuple.pack_with_versionstamp( + tuple(items), prefix=prefix + ) inst.push(b"OK") inst.push(packed) except ValueError as e: @@ -568,7 +435,12 @@ class Tester: elif inst.op == six.u("ENCODE_FLOAT"): f_bytes = inst.pop() f = struct.unpack(">f", f_bytes)[0] - if not math.isnan(f) and not math.isinf(f) and not f == -0.0 and f == int(f): + if ( + not math.isnan(f) + and not math.isinf(f) + and not f == -0.0 + and f == int(f) + ): f = int(f) inst.push(fdb.tuple.SingleFloat(f)) elif inst.op == six.u("ENCODE_DOUBLE"): @@ -609,7 +481,9 @@ class Tester: self.tenant = None elif inst.op == six.u("TENANT_LIST"): begin, end, limit = inst.pop(3) - tenant_list = fdb.tenant_management.list_tenants(self.db, begin, end, limit) + tenant_list = fdb.tenant_management.list_tenants( + self.db, begin, end, limit + ) result = [] for tenant in tenant_list: result += [tenant.key] @@ -627,37 +501,16 @@ class Tester: else: inst.push(b"NO_ACTIVE_TENANT") elif inst.op == six.u("UNIT_TESTS"): - try: - test_db_options(db) - test_options(db) - test_watches(db) - test_cancellation(db) - test_retry_limits(db) - test_db_retry_limits(db) - 
test_timeouts(db) - test_db_timeouts(db) - test_combinations(db) - test_locality(db) - test_predicates() - - test_size_limit_option(db) - test_get_approximate_size(db) - - if fdb.get_api_version() >= 710: - test_tenants(db) - - except fdb.FDBError as e: - print("Unit tests failed: %s" % e.description) - traceback.print_exc() - - raise Exception("Unit tests failed: %s" % e.description) - elif inst.op.startswith(six.u('DIRECTORY_')): + run_unit_tests(db) + elif inst.op.startswith(six.u("DIRECTORY_")): self.directory_extension.process_instruction(inst) else: raise Exception("Unknown op %s" % inst.op) except fdb.FDBError as e: # print('ERROR: %r' % e) - inst.stack.push(idx, fdb.tuple.pack((b"ERROR", str(e.code).encode('ascii')))) + inst.stack.push( + idx, fdb.tuple.pack((b"ERROR", str(e.code).encode("ascii"))) + ) # print(" to %s" % self.stack) # print() @@ -665,6 +518,6 @@ class Tester: [thr.join() for thr in self.threads] -if __name__ == '__main__': - t = Tester(db, sys.argv[1].encode('ascii')) +if __name__ == "__main__": + t = Tester(db, sys.argv[1].encode("ascii")) t.run() diff --git a/bindings/python/tests/unit_tests.py b/bindings/python/tests/unit_tests.py new file mode 100644 index 0000000000..617aa03144 --- /dev/null +++ b/bindings/python/tests/unit_tests.py @@ -0,0 +1,296 @@ +#!/usr/bin/python +# +# unit_tests.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2023 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import os +import sys +import time +import traceback +import json + +import fdb + +if __name__ == "__main__": + fdb.api_version(720) + +from cancellation_timeout_tests import test_timeouts +from cancellation_timeout_tests import test_db_timeouts +from cancellation_timeout_tests import test_cancellation +from cancellation_timeout_tests import test_retry_limits +from cancellation_timeout_tests import test_db_retry_limits +from cancellation_timeout_tests import test_combinations + +from size_limit_tests import test_size_limit_option, test_get_approximate_size +from tenant_tests import test_tenants + +VERBOSE = False + + +def log(msg): + if VERBOSE: + print(msg, file=sys.stderr, flush=True) + + +def test_fdb_transactional_generator(db): + try: + + @fdb.transactional + def function_that_yields(tr): + yield 0 + + assert ( + fdb.get_api_version() < 630 + ), "Pre-6.3, a decorator may wrap a function that yields" + except ValueError: + assert ( + fdb.get_api_version() >= 630 + ), "Post-6.3, a decorator should throw if wrapped function yields" + + +def test_fdb_transactional_returns_generator(db): + try: + + def function_that_yields(tr): + yield 0 + + @fdb.transactional + def function_that_returns(tr): + return function_that_yields(tr) + + function_that_returns() + assert fdb.get_api_version() < 630, "Pre-6.3, returning a generator is allowed" + except ValueError: + assert ( + fdb.get_api_version() >= 630 + ), "Post-6.3, returning a generator should throw" + + +def test_db_options(db): + db.options.set_location_cache_size(100001) + db.options.set_max_watches(100001) + db.options.set_datacenter_id("dc_id") + db.options.set_machine_id("machine_id") + db.options.set_snapshot_ryw_enable() + db.options.set_snapshot_ryw_disable() + db.options.set_transaction_logging_max_field_length(1000) + db.options.set_transaction_timeout(100000) + 
db.options.set_transaction_timeout(0) + db.options.set_transaction_timeout(0) + db.options.set_transaction_max_retry_delay(100) + db.options.set_transaction_size_limit(100000) + db.options.set_transaction_retry_limit(10) + db.options.set_transaction_retry_limit(-1) + db.options.set_transaction_causal_read_risky() + db.options.set_transaction_include_port_in_address() + + +@fdb.transactional +def test_options(tr): + tr.options.set_priority_system_immediate() + tr.options.set_priority_batch() + tr.options.set_causal_read_risky() + tr.options.set_causal_write_risky() + tr.options.set_read_your_writes_disable() + tr.options.set_read_system_keys() + tr.options.set_access_system_keys() + tr.options.set_transaction_logging_max_field_length(1000) + tr.options.set_timeout(60 * 1000) + tr.options.set_retry_limit(50) + tr.options.set_max_retry_delay(100) + tr.options.set_used_during_commit_protection_disable() + tr.options.set_debug_transaction_identifier("my_transaction") + tr.options.set_log_transaction() + tr.options.set_read_lock_aware() + tr.options.set_lock_aware() + tr.options.set_include_port_in_address() + tr.get(b"\xff").wait() + + +def check_watches(db, watches, expected): + for i, watch in enumerate(watches): + if watch.is_ready() or expected: + try: + watch.wait() + if not expected: + assert False, "Watch %d is ready" % i + except fdb.FDBError as e: + tr = db.create_transaction() + tr.on_error(e).wait() + return False + + return True + + +def test_watches(db): + while True: + db[b"w0"] = b"0" + db[b"w3"] = b"3" + + watches = [None] + + @fdb.transactional + def txn1(tr): + watches[0] = tr.watch(b"w0") + tr.set(b"w0", b"0") + assert not watches[0].is_ready() + + txn1(db) + + watches.append(db.clear_and_watch(b"w1")) + watches.append(db.set_and_watch(b"w2", b"2")) + watches.append(db.get_and_watch(b"w3")) + + assert watches[3][0] == b"3" + watches[3] = watches[3][1] + + time.sleep(1) + + if not check_watches(db, watches, False): + continue + + del db[b"w1"] + + 
time.sleep(5) + + if not check_watches(db, watches, False): + continue + + db[b"w0"] = b"a" + db[b"w1"] = b"b" + del db[b"w2"] + db.bit_xor(b"w3", b"\xff\xff") + + if check_watches(db, watches, True): + return + + +@fdb.transactional +def test_locality(tr): + tr.options.set_timeout(60 * 1000) + tr.options.set_read_system_keys()  # We do this because the last shard (for now, someday the last N shards) is in the /FF/ keyspace + + # This isn't strictly transactional, though we expect it to be given the size of our database + boundary_keys = list(fdb.locality.get_boundary_keys(tr, b"", b"\xff\xff")) + [ + b"\xff\xff" + ] + end_keys = [ + tr.get_key(fdb.KeySelector.last_less_than(k)) for k in boundary_keys[1:] + ] + + start_addresses = [ + fdb.locality.get_addresses_for_key(tr, k) for k in boundary_keys[:-1] + ] + end_addresses = [fdb.locality.get_addresses_for_key(tr, k) for k in end_keys] + + if [set(s.wait()) for s in start_addresses] != [ + set(e.wait()) for e in end_addresses + ]: + raise Exception("Locality not internally consistent.") + + +def test_predicates(): + assert fdb.predicates.is_retryable(fdb.FDBError(1020)) + assert not fdb.predicates.is_retryable(fdb.FDBError(10)) + + +def test_get_client_status(db): + @fdb.transactional + def simple_txn(tr): + tr.get_read_version().wait() + + # Execute a simple transaction + # to make sure the database is initialized + simple_txn(db) + # Here we just check if a meaningful client report status is returned + # Different report attributes and error cases are covered by C API tests + status_str = db.get_client_status().wait() + status = json.loads(status_str) + assert "Healthy" in status + assert status["Healthy"] + + +def run_unit_tests(db): + try: + log("test_db_options") + test_db_options(db) + log("test_options") + test_options(db) + log("test_watches") + test_watches(db) + log("test_cancellation") + test_cancellation(db) + log("test_retry_limits") + test_retry_limits(db) + log("test_db_retry_limits") + 
test_db_retry_limits(db) + log("test_timeouts") + test_timeouts(db) + log("test_db_timeouts") + test_db_timeouts(db) + log("test_combinations") + test_combinations(db) + log("test_locality") + test_locality(db) + log("test_predicates") + test_predicates() + log("test_size_limit_option") + test_size_limit_option(db) + log("test_get_approximate_size") + test_get_approximate_size(db) + log("test_get_client_status") + test_get_client_status(db) + + if fdb.get_api_version() >= 710: + log("test_tenants") + test_tenants(db) + + except fdb.FDBError as e: + print("Unit tests failed: %s" % e.description) + traceback.print_exc() + + raise Exception("Unit tests failed: %s" % e.description) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" + Unit tests for python FDB API. + """, + ) + parser.add_argument( + "--cluster-file", + "-C", + help="FDB cluster file", + required=True, + ) + parser.add_argument( + "--verbose", + "-V", + help="Print diagnostic info", + action="store_true", + ) + args = parser.parse_args() + if args.verbose: + VERBOSE = True + log("Opening database {}".format(args.cluster_file)) + db = fdb.open(args.cluster_file) + run_unit_tests(db) diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index 1be2b1dfa5..8465e76fe5 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -563,6 +563,7 @@ string(APPEND test_venv_cmd "${Python3_EXECUTABLE} -m venv ${test_venv_dir} ") string(APPEND test_venv_cmd "&& ${test_venv_activate} ") string(APPEND test_venv_cmd "&& pip install --upgrade pip ") string(APPEND test_venv_cmd "&& pip install -r ${CMAKE_SOURCE_DIR}/tests/TestRunner/requirements.txt") +string(APPEND test_venv_cmd "&& (cd ${CMAKE_BINARY_DIR}/bindings/python && python3 setup.py install) ") add_test( NAME test_venv_setup COMMAND bash -c ${test_venv_cmd} @@ -602,6 +603,12 @@ function(add_python_venv_test) COMMAND ${shell_cmd} ${shell_opt} 
"${test_venv_activate} && ${T_COMMAND}") set_tests_properties(${T_NAME} PROPERTIES FIXTURES_REQUIRED test_virtual_env_setup TIMEOUT ${T_TEST_TIMEOUT}) set(test_env_vars "PYTHONPATH=${CMAKE_SOURCE_DIR}/tests/TestRunner:${CMAKE_BINARY_DIR}/tests/TestRunner") + if(APPLE) + set(ld_env_name "DYLD_LIBRARY_PATH") + else() + set(ld_env_name "LD_LIBRARY_PATH") + endif() + set(test_env_vars PROPERTIES ENVIRONMENT "${test_env_vars};${ld_env_name}=${CMAKE_BINARY_DIR}/lib:$ENV{${ld_env_name}}") if(USE_SANITIZER) set(test_env_vars "${test_env_vars};${SANITIZER_OPTIONS}") endif()