From ec79ecce7347d3c2f174360f0eeee86386ef3ef1 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 16:23:25 -0800 Subject: [PATCH 01/11] Add a boolean parameter for ForceRemove; rename ForceJoinNewMetacluster to ForceJoin --- fdbcli/MetaclusterCommands.actor.cpp | 24 ++++++++--------- fdbclient/Metacluster.cpp | 3 ++- .../fdbclient/MetaclusterManagement.actor.h | 23 ++++++++-------- ...terManagementConcurrencyWorkload.actor.cpp | 2 +- .../MetaclusterRestoreWorkload.actor.cpp | 26 +++++++++++-------- 5 files changed, 42 insertions(+), 36 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index c7ef3e9aa6..a600727291 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -171,14 +171,14 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector } state ClusterNameRef clusterName = tokens[tokens.size() - 1]; - state bool force = tokens.size() == 4; + state ForceRemove forceRemove = ForceRemove(tokens.size() == 4); state ClusterType clusterType = wait(runTransaction(db, [](Reference tr) { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); return TenantAPI::getClusterType(tr); })); - if (clusterType == ClusterType::METACLUSTER_DATA && !force) { + if (clusterType == ClusterType::METACLUSTER_DATA && !forceRemove) { if (tokens[2] == "FORCE"_sr) { fmt::print("ERROR: a cluster name must be specified.\n"); } else { @@ -190,8 +190,7 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector return false; } - bool updatedDataCluster = - wait(MetaclusterAPI::removeCluster(db, clusterName, clusterType, tokens.size() == 4, 15.0)); + bool updatedDataCluster = wait(MetaclusterAPI::removeCluster(db, clusterName, clusterType, forceRemove, 15.0)); if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) { fmt::print("The cluster `{}' has been removed\n", printable(clusterName).c_str()); @@ -211,7 +210,7 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector void printRestoreUsage() { fmt::print("Usage: metacluster restore [dryrun] connection_string=\n" - " [force_join_new_metacluster]\n\n"); + " [force_join]\n\n"); fmt::print("Add a restored data cluster back to a metacluster.\n\n"); @@ -223,8 +222,9 @@ void printRestoreUsage() { fmt::print("that the metacluster is already tracking. This mode should be used if only data\n"); fmt::print("clusters are being restored, and any discrepancies between the management and\n"); fmt::print("data clusters will be resolved using the management cluster metadata.\n"); - fmt::print("If `force_join_new_metacluster' is specified, the cluster will try to restore\n"); - fmt::print("to a different metacluster than it was originally registered to.\n\n"); + fmt::print("If `force_join' is specified, the cluster will try to restore to a different\n"); + fmt::print("metacluster than it was originally registered to or with a different ID than\n"); + fmt::print("is associated with the given cluster name.\n\n"); fmt::print("Use `repopulate_from_data_cluster' to rebuild a lost management cluster from the\n"); fmt::print("data clusters in a metacluster. This mode should be used if the management\n"); @@ -244,7 +244,7 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto } state bool dryRun = tokens[3] == "dryrun"_sr; - state bool forceJoin = tokens[tokens.size() - 1] == "force_join_new_metacluster"_sr; + state bool forceJoin = tokens[tokens.size() - 1] == "force_join"_sr; if (tokens.size() < 5 + (int)dryRun + (int)forceJoin) { printRestoreUsage(); @@ -274,7 +274,7 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto config.get().first.get(), ApplyManagementClusterUpdates::True, RestoreDryRun(dryRun), - ForceJoinNewMetacluster(forceJoin), + ForceJoin(forceJoin), &messages)); } else if (restoreType == "repopulate_from_data_cluster"_sr) { wait(MetaclusterAPI::restoreCluster(db, @@ -282,7 +282,7 @@ ACTOR Future metaclusterRestoreCommand(Reference db, std::vecto config.get().first.get(), ApplyManagementClusterUpdates::False, RestoreDryRun(dryRun), - ForceJoinNewMetacluster(forceJoin), + ForceJoin(forceJoin), &messages)); } else { fmt::print(stderr, "ERROR: unrecognized restore mode `{}'\n", printable(restoreType)); @@ -589,7 +589,7 @@ void metaclusterGenerator(const char* text, const char* opts[] = { "restore_known_data_cluster", "repopulate_from_data_cluster", nullptr }; arrayGenerator(text, line, opts, lc); } else if (tokens.size() == 5 + (int)dryrun) { - const char* opts[] = { "force_join_new_metacluster", nullptr }; + const char* opts[] = { "force_join", nullptr }; arrayGenerator(text, line, opts, lc); } } @@ -624,7 +624,7 @@ std::vector metaclusterHintGenerator(std::vector const& "[dryrun]", "connection_string=", "", - "[force_join_new_metacluster]" }; + "[force_join]" }; if (tokens.size() < 4 || (tokens[3].size() <= 6 && "dryrun"_sr.startsWith(tokens[3]))) { return std::vector(opts.begin() + tokens.size() - 2, opts.end()); } else if (tokens.size() < 6) { diff --git a/fdbclient/Metacluster.cpp b/fdbclient/Metacluster.cpp index 2357315c45..117ba18ff1 100644 --- a/fdbclient/Metacluster.cpp +++ b/fdbclient/Metacluster.cpp @@ -31,7 +31,8 @@ FDB_DEFINE_BOOLEAN_PARAM(IsRestoring); FDB_DEFINE_BOOLEAN_PARAM(RunOnDisconnectedCluster); FDB_DEFINE_BOOLEAN_PARAM(RunOnMismatchedCluster); FDB_DEFINE_BOOLEAN_PARAM(RestoreDryRun); -FDB_DEFINE_BOOLEAN_PARAM(ForceJoinNewMetacluster); +FDB_DEFINE_BOOLEAN_PARAM(ForceJoin); +FDB_DEFINE_BOOLEAN_PARAM(ForceRemove); namespace MetaclusterAPI { diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 9813856343..8dfc6d01c8 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -100,7 +100,8 @@ FDB_DECLARE_BOOLEAN_PARAM(IsRestoring); FDB_DECLARE_BOOLEAN_PARAM(RunOnDisconnectedCluster); FDB_DECLARE_BOOLEAN_PARAM(RunOnMismatchedCluster); FDB_DECLARE_BOOLEAN_PARAM(RestoreDryRun); -FDB_DECLARE_BOOLEAN_PARAM(ForceJoinNewMetacluster); +FDB_DECLARE_BOOLEAN_PARAM(ForceJoin); +FDB_DECLARE_BOOLEAN_PARAM(ForceRemove); namespace MetaclusterAPI { @@ -870,7 +871,7 @@ struct RemoveClusterImpl { Reference db; ClusterType clusterType; ClusterName clusterName; - bool forceRemove; + ForceRemove forceRemove; double dataClusterTimeout; // Parameters set in markClusterRemoving @@ -882,7 +883,7 @@ struct RemoveClusterImpl { RemoveClusterImpl(Reference db, ClusterName clusterName, ClusterType clusterType, - bool forceRemove, + ForceRemove forceRemove, double dataClusterTimeout) : ctx(db, Optional(), @@ -1191,7 +1192,7 @@ ACTOR template Future removeCluster(Reference db, ClusterName name, ClusterType clusterType, - bool forceRemove, + ForceRemove forceRemove, double dataClusterTimeout = 0) { state RemoveClusterImpl impl(db, name, clusterType, forceRemove, dataClusterTimeout); wait(impl.run()); @@ -1332,7 +1333,7 @@ struct RestoreClusterImpl { ClusterConnectionString connectionString; ApplyManagementClusterUpdates applyManagementClusterUpdates; RestoreDryRun restoreDryRun; - ForceJoinNewMetacluster forceJoinNewMetacluster; + ForceJoin forceJoin; std::vector& messages; // Unique ID generated for this restore. Used to avoid concurrent restores @@ -1352,11 +1353,11 @@ struct RestoreClusterImpl { ClusterConnectionString connectionString, ApplyManagementClusterUpdates applyManagementClusterUpdates, RestoreDryRun restoreDryRun, - ForceJoinNewMetacluster forceJoinNewMetacluster, + ForceJoin forceJoin, std::vector& messages) : ctx(managementDb, {}, { DataClusterState::RESTORING }), clusterName(clusterName), connectionString(connectionString), applyManagementClusterUpdates(applyManagementClusterUpdates), - restoreDryRun(restoreDryRun), forceJoinNewMetacluster(forceJoinNewMetacluster), messages(messages) {} + restoreDryRun(restoreDryRun), forceJoin(forceJoin), messages(messages) {} ACTOR template static Future checkRestoreId(RestoreClusterImpl* self, Transaction tr) { @@ -1422,7 +1423,7 @@ struct RestoreClusterImpl { if (!metaclusterRegistration.present()) { throw invalid_data_cluster(); } else if (!metaclusterRegistration.get().matches(self->ctx.metaclusterRegistration.get())) { - if (!self->forceJoinNewMetacluster) { + if (!self->forceJoin) { TraceEvent(SevWarn, "MetaclusterRestoreClusterMismatch") .detail("ExistingRegistration", metaclusterRegistration.get()) .detail("ManagementClusterRegistration", self->ctx.metaclusterRegistration.get()); @@ -2073,7 +2074,7 @@ struct RestoreClusterImpl { wait(self->runRestoreDataClusterTransaction( [self = self](Reference tr) { return getTenantsFromDataCluster(self, tr); }, RunOnDisconnectedCluster::False, - RunOnMismatchedCluster(self->restoreDryRun && self->forceJoinNewMetacluster))); + RunOnMismatchedCluster(self->restoreDryRun && self->forceJoin))); // Fix any differences between the data cluster and the management cluster wait(reconcileTenants(self)); @@ -2164,10 +2165,10 @@ Future restoreCluster(Reference db, ClusterConnectionString connectionString, ApplyManagementClusterUpdates applyManagementClusterUpdates, RestoreDryRun restoreDryRun, - ForceJoinNewMetacluster forceJoinNewMetacluster, + ForceJoin forceJoin, std::vector* messages) { state RestoreClusterImpl impl( - db, name, connectionString, applyManagementClusterUpdates, restoreDryRun, forceJoinNewMetacluster, *messages); + db, name, connectionString, applyManagementClusterUpdates, restoreDryRun, forceJoin, *messages); wait(impl.run()); return Void(); } diff --git a/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp index e3b923a012..4cf12323e9 100644 --- a/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementConcurrencyWorkload.actor.cpp @@ -143,7 +143,7 @@ struct MetaclusterManagementConcurrencyWorkload : TestWorkload { TraceEvent(SevDebug, "MetaclusterManagementConcurrencyRemovingCluster", debugId) .detail("ClusterName", clusterName); Future removeFuture = MetaclusterAPI::removeCluster( - self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, false); + self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, ForceRemove::False); Optional result = wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { ASSERT(result.get()); diff --git a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp index 80765a442e..c8bcced2a8 100644 --- a/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterRestoreWorkload.actor.cpp @@ -239,7 +239,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { Database dataDb, std::string backupUrl, bool addToMetacluster, - ForceJoinNewMetacluster forceJoinNewMetacluster, + ForceJoin forceJoin, int simultaneousRestoreCount, MetaclusterRestoreWorkload* self) { state FileBackupAgent backupAgent; @@ -274,7 +274,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { dataDb->getConnectionRecord()->getConnectionString(), ApplyManagementClusterUpdates::True, RestoreDryRun::True, - forceJoinNewMetacluster, + forceJoin, &messages)); state MetaclusterData postDryRunMetaclusterData(self->managementDb); @@ -298,7 +298,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { dataDb->getConnectionRecord()->getConnectionString(), ApplyManagementClusterUpdates::True, RestoreDryRun::False, - forceJoinNewMetacluster, + forceJoin, &messages)); TraceEvent("MetaclusterRestoreWorkloadRestoreComplete").detail("ClusterName", clusterName); } @@ -516,8 +516,10 @@ struct MetaclusterRestoreWorkload : TestWorkload { TraceEvent("MetaclusterRestoreWorkloadProcessDataCluster").detail("FromCluster", clusterItr->first); // Remove the data cluster from its old metacluster - wait(success(MetaclusterAPI::removeCluster( - clusterItr->second.db.getReference(), clusterItr->first, ClusterType::METACLUSTER_DATA, true))); + wait(success(MetaclusterAPI::removeCluster(clusterItr->second.db.getReference(), + clusterItr->first, + ClusterType::METACLUSTER_DATA, + ForceRemove::True))); TraceEvent("MetaclusterRestoreWorkloadForgotMetacluster").detail("ClusterName", clusterItr->first); state std::pair collisions = @@ -554,7 +556,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { clusterItr->second.db->getConnectionRecord()->getConnectionString(), ApplyManagementClusterUpdates::False, RestoreDryRun::True, - ForceJoinNewMetacluster(deterministicRandom()->coinflip()), + ForceJoin(deterministicRandom()->coinflip()), &messages)); state MetaclusterData postDryRunMetaclusterData(self->managementDb); @@ -582,7 +584,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { clusterItr->second.db->getConnectionRecord()->getConnectionString(), ApplyManagementClusterUpdates::False, RestoreDryRun::False, - ForceJoinNewMetacluster(deterministicRandom()->coinflip()), + ForceJoin(deterministicRandom()->coinflip()), &messages)); ASSERT(collisions.first.empty() && collisions.second.empty()); @@ -597,8 +599,10 @@ struct MetaclusterRestoreWorkload : TestWorkload { // If the restore did not succeed, remove the partially restored cluster try { - wait(success(MetaclusterAPI::removeCluster( - self->managementDb, clusterItr->first, ClusterType::METACLUSTER_MANAGEMENT, true))); + wait(success(MetaclusterAPI::removeCluster(self->managementDb, + clusterItr->first, + ClusterType::METACLUSTER_MANAGEMENT, + ForceRemove::True))); TraceEvent("MetaclusterRestoreWorkloadRemoveFailedCluster") .detail("ClusterName", clusterItr->first); } catch (Error& e) { @@ -928,7 +932,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { self->dataDbs[cluster].db, backupUrl.get(), !self->recoverManagementCluster, - ForceJoinNewMetacluster(deterministicRandom()->coinflip()), + ForceJoin(deterministicRandom()->coinflip()), backups.size(), self)); } @@ -945,7 +949,7 @@ struct MetaclusterRestoreWorkload : TestWorkload { self->dataDbs[cluster].db, backupUrl.get(), true, - ForceJoinNewMetacluster::True, + ForceJoin::True, backups.size(), self)); } From f06ed85044707c39775786e0be58a7684d26f187 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 16:24:13 -0800 Subject: [PATCH 02/11] Update the tenant and metacluster consistency checks to account for the possibility of renamed tenants or partially registered clusters --- .../workloads/MetaclusterConsistency.actor.h | 2 +- .../fdbserver/workloads/MetaclusterData.actor.h | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h index eba282a02b..53fdc7b21e 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h @@ -98,7 +98,7 @@ private: auto allocatedItr = data.clusterAllocatedMap.find(clusterName); if (!clusterMetadata.entry.hasCapacity()) { ASSERT(allocatedItr == data.clusterAllocatedMap.end()); - } else { + } else if (clusterMetadata.entry.clusterState == DataClusterState::READY) { ASSERT(allocatedItr != data.clusterAllocatedMap.end()); ASSERT_EQ(allocatedItr->second, clusterMetadata.entry.allocated.numTenantGroups); ++numFoundInAllocatedMap; diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterData.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterData.actor.h index 8366a4e4bd..7bf8ee2e46 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterData.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterData.actor.h @@ -164,14 +164,19 @@ private: ASSERT_EQ(t.size(), 3); TenantName tenantName = t.getString(1); int64_t tenantId = t.getInt(2); - ASSERT(tenantName == self->managementMetadata.tenantData.tenantMap[tenantId].tenantName); - self->managementMetadata.clusterTenantMap[t.getString(0)].insert(tenantId); + MetaclusterTenantMapEntry const& entry = self->managementMetadata.tenantMap[tenantId]; + bool renaming = + entry.tenantState == MetaclusterAPI::TenantState::RENAMING && entry.renameDestination == tenantName; + ASSERT(tenantName == entry.tenantName || renaming); + if (!renaming) { + ASSERT(self->managementMetadata.clusterTenantMap[t.getString(0)].insert(tenantId).second); + } } for (auto t : clusterTenantGroupTuples.results) { ASSERT_EQ(t.size(), 2); TenantGroupName tenantGroupName = t.getString(1); - self->managementMetadata.clusterTenantGroupMap[t.getString(0)].insert(tenantGroupName); + ASSERT(self->managementMetadata.clusterTenantGroupMap[t.getString(0)].insert(tenantGroupName).second); } return Void(); From 7da247cde25639e0c97ca6e0208472ed8f7a6abc Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 16:31:08 -0800 Subject: [PATCH 03/11] Prevent operations on clusters that are restoring --- .../fdbclient/MetaclusterManagement.actor.h | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 8dfc6d01c8..99c123de1b 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -366,6 +366,12 @@ struct MetaclusterOperationContext { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state bool checkRestoring = !self->extraSupportedDataClusterStates.count(DataClusterState::RESTORING); + state Future>> activeRestoreIdFuture; + if (checkRestoring && self->clusterName.present()) { + activeRestoreIdFuture = MetaclusterMetadata::activeRestoreIds().getRange(tr, {}, {}, 1); + } + state Optional currentMetaclusterRegistration = wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); @@ -382,6 +388,13 @@ struct MetaclusterOperationContext { } } + if (checkRestoring) { + KeyBackedRangeResult> activeRestoreId = wait(activeRestoreIdFuture); + if (!activeRestoreId.results.empty()) { + throw cluster_restoring(); + } + } + self->dataClusterIsRegistered = currentMetaclusterRegistration.present(); state decltype(std::declval()(Reference()).getValue()) result = wait(func(tr)); @@ -791,6 +804,10 @@ struct RegisterClusterImpl { self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, self->clusterEntry.id)); + // If we happen to have any orphaned restore IDs from a previous time this cluster was in a metacluster, + // erase them now. + MetaclusterMetadata::activeRestoreIds().clear(tr); + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); TraceEvent("ConfiguredDataCluster") @@ -817,6 +834,8 @@ struct RegisterClusterImpl { throw cluster_already_exists(); } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::READY) { return Void(); + } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { + throw cluster_restoring(); } else { ASSERT(dataClusterMetadata.get().entry.clusterState == DataClusterState::REGISTERING); dataClusterMetadata.get().entry.clusterState = DataClusterState::READY; @@ -933,6 +952,7 @@ struct RemoveClusterImpl { if (self->ctx.dataClusterIsRegistered) { // Delete metacluster related metadata MetaclusterMetadata::metaclusterRegistration().clear(tr); + MetaclusterMetadata::activeRestoreIds().clear(tr); TenantMetadata::tenantTombstones().clear(tr); TenantMetadata::tombstoneCleanupData().clear(tr); @@ -1033,6 +1053,7 @@ struct RemoveClusterImpl { ManagementClusterMetadata::dataClusters().erase(tr, ctx.clusterName.get()); ManagementClusterMetadata::dataClusterConnectionRecords.erase(tr, ctx.clusterName.get()); ManagementClusterMetadata::clusterTenantCount.erase(tr, ctx.clusterName.get()); + MetaclusterMetadata::activeRestoreIds().erase(tr, ctx.clusterName.get()); } // Removes the next set of metadata from the management cluster; returns true when all specified From 1b9d4a8d5a347a72f1121aa4da9cc7dec7e3af33 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 19:19:56 -0800 Subject: [PATCH 04/11] Registering a restoring data cluster didn't detect cases where it should fail, such as if the cluster already existed --- .../fdbclient/MetaclusterManagement.actor.h | 74 ++++++++----------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 99c123de1b..ed09f75d45 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -672,39 +672,6 @@ void updateClusterMetadata(Transaction tr, } } -// Store the cluster entry for the new cluster -ACTOR template -static Future registerInManagementCluster(Transaction tr, - ClusterName clusterName, - DataClusterEntry clusterEntry, - ClusterConnectionString connectionString, - RestoreDryRun restoreDryRun) { - state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, clusterName)); - if (dataClusterMetadata.present() && - !dataClusterMetadata.get().matchesConfiguration(DataClusterMetadata(clusterEntry, connectionString))) { - TraceEvent("RegisterClusterAlreadyExists").detail("ClusterName", clusterName); - throw cluster_already_exists(); - } else if (!restoreDryRun && !dataClusterMetadata.present()) { - clusterEntry.allocated = ClusterUsage(); - - if (clusterEntry.hasCapacity()) { - ManagementClusterMetadata::clusterCapacityIndex.insert( - tr, Tuple::makeTuple(clusterEntry.allocated.numTenantGroups, clusterName)); - } - ManagementClusterMetadata::dataClusters().set(tr, clusterName, clusterEntry); - ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, clusterName, connectionString); - - TraceEvent("RegisteredDataCluster") - .detail("ClusterName", clusterName) - .detail("ClusterID", clusterEntry.id) - .detail("Capacity", clusterEntry.capacity) - .detail("Version", tr->getCommittedVersion()) - .detail("ConnectionString", connectionString.toString()); - } - - return Void(); -} - template struct RegisterClusterImpl { MetaclusterOperationContext ctx; @@ -1480,6 +1447,37 @@ struct RestoreClusterImpl { } } + // Store the cluster entry for the restored cluster + ACTOR static Future registerRestoringClusterInManagementCluster(RestoreClusterImpl* self, + Reference tr) { + state DataClusterEntry clusterEntry; + clusterEntry.id = self->dataClusterId; + clusterEntry.clusterState = DataClusterState::RESTORING; + + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (dataClusterMetadata.present() && + (dataClusterMetadata.get().entry.clusterState != DataClusterState::RESTORING || + !dataClusterMetadata.get().matchesConfiguration( + DataClusterMetadata(clusterEntry, self->connectionString)))) { + TraceEvent("RestoredClusterAlreadyExists").detail("ClusterName", self->clusterName); + throw cluster_already_exists(); + } else if (!self->restoreDryRun) { + MetaclusterMetadata::activeRestoreIds().set(tr, self->clusterName, self->restoreId); + + ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, clusterEntry); + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); + + TraceEvent("RegisteredRestoringDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", clusterEntry.id) + .detail("Capacity", clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", self->connectionString.toString()); + } + + return Void(); + } + // If adding a data cluster to a restored management cluster, write a metacluster registration entry // to attach it ACTOR static Future writeDataClusterRegistration(RestoreClusterImpl* self) { @@ -2122,15 +2120,7 @@ struct RestoreClusterImpl { // Record the data cluster in the management cluster wait(self->ctx.runManagementTransaction([self = self](Reference tr) { - if (!self->restoreDryRun) { - MetaclusterMetadata::activeRestoreIds().set(tr, self->clusterName, self->restoreId); - } - - DataClusterEntry entry; - entry.id = self->dataClusterId; - entry.clusterState = DataClusterState::RESTORING; - return registerInManagementCluster( - tr, self->clusterName, entry, self->connectionString, self->restoreDryRun); + return registerRestoringClusterInManagementCluster(self, tr); })); // Write a metacluster registration entry in the data cluster From 92011b9339082f404ee768c056115eb496f8be58 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 19:21:11 -0800 Subject: [PATCH 05/11] When erasing tenants during a force removal, tenants being renamed were double counted against the tenant count --- .../include/fdbclient/MetaclusterManagement.actor.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index ed09f75d45..ecadac905e 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -959,9 +959,12 @@ struct RemoveClusterImpl { state KeyBackedRangeResult tenantEntries = wait(tenantEntriesFuture); // Erase each tenant from the tenant map on the management cluster + std::set erasedTenants; for (Tuple entry : tenantEntries.results) { + int64_t tenantId = entry.getInt(2); ASSERT(entry.getString(0) == self->ctx.clusterName.get()); - ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, entry.getInt(2)); + erasedTenants.insert(tenantId); + ManagementClusterMetadata::tenantMetadata().tenantMap.erase(tr, tenantId); ManagementClusterMetadata::tenantMetadata().tenantNameIndex.erase(tr, entry.getString(1)); ManagementClusterMetadata::tenantMetadata().lastTenantModification.setVersionstamp(tr, Versionstamp(), 0); } @@ -975,9 +978,9 @@ struct RemoveClusterImpl { } ManagementClusterMetadata::tenantMetadata().tenantCount.atomicOp( - tr, -tenantEntries.results.size(), MutationRef::AddValue); + tr, -erasedTenants.size(), MutationRef::AddValue); ManagementClusterMetadata::clusterTenantCount.atomicOp( - tr, self->ctx.clusterName.get(), -tenantEntries.results.size(), MutationRef::AddValue); + tr, self->ctx.clusterName.get(), -erasedTenants.size(), MutationRef::AddValue); return !tenantEntries.more; } From c4ef32c77a3e31e8c132cfe7966e6ffab0885434 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 19:21:49 -0800 Subject: [PATCH 06/11] Check for registraton tombstones when registering a cluster during restores --- .../include/fdbclient/MetaclusterManagement.actor.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index ecadac905e..cdf3078054 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1490,9 +1490,18 @@ struct RestoreClusterImpl { loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + state Future tombstoneFuture = + MetaclusterMetadata::registrationTombstones().exists(tr, self->dataClusterId); + state Optional metaclusterRegistration = wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + // Check if the cluster was removed concurrently + bool tombstone = wait(tombstoneFuture); + if (tombstone) { + throw cluster_removed(); + } + MetaclusterRegistrationEntry dataClusterEntry = self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, self->dataClusterId); From 6dad4c5c603d216cfd542688875529abf04930eb Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 19:23:12 -0800 Subject: [PATCH 07/11] Improve trace event --- fdbclient/include/fdbclient/MetaclusterManagement.actor.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index cdf3078054..b3d7bf60dd 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -1512,7 +1512,8 @@ struct RestoreClusterImpl { } TraceEvent(SevWarn, "MetaclusterRestoreClusterAlreadyRegistered") - .detail("ExistingRegistration", metaclusterRegistration.get()); + .detail("ExistingRegistration", metaclusterRegistration.get()) + .detail("NewRegistration", dataClusterEntry); throw cluster_already_registered(); } From 587c47832c0c44144d0eda99640df73610dd9836 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 19:29:12 -0800 Subject: [PATCH 08/11] Swap the register and remove cluster opertations (no change to the code) --- .../fdbclient/MetaclusterManagement.actor.h | 354 +++++++++--------- 1 file changed, 177 insertions(+), 177 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index b3d7bf60dd..9ed7c23f3a 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -672,183 +672,6 @@ void updateClusterMetadata(Transaction tr, } } -template -struct RegisterClusterImpl { - MetaclusterOperationContext ctx; - - // Initialization parameters - ClusterName clusterName; - ClusterConnectionString connectionString; - DataClusterEntry clusterEntry; - - RegisterClusterImpl(Reference managementDb, - ClusterName clusterName, - ClusterConnectionString connectionString, - DataClusterEntry clusterEntry) - : ctx(managementDb), clusterName(clusterName), connectionString(connectionString), clusterEntry(clusterEntry) {} - - // Store the cluster entry for the new cluster in a registering state - ACTOR static Future registerInManagementCluster(RegisterClusterImpl* self, - Reference tr) { - state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); - if (!dataClusterMetadata.present()) { - self->clusterEntry.clusterState = DataClusterState::REGISTERING; - self->clusterEntry.allocated = ClusterUsage(); - self->clusterEntry.id = deterministicRandom()->randomUniqueID(); - - ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, self->clusterEntry); - ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); - } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { - throw cluster_removed(); - } else if (!dataClusterMetadata.get().matchesConfiguration( - DataClusterMetadata(self->clusterEntry, self->connectionString)) || - dataClusterMetadata.get().entry.clusterState != DataClusterState::REGISTERING) { - throw cluster_already_exists(); - } else { - self->clusterEntry = dataClusterMetadata.get().entry; - } - - TraceEvent("RegisteringDataCluster") - .detail("ClusterName", self->clusterName) - .detail("ClusterID", self->clusterEntry.id) - .detail("Capacity", self->clusterEntry.capacity) - .detail("ConnectionString", self->connectionString.toString()); - - return Void(); - } - - ACTOR static Future configureDataCluster(RegisterClusterImpl* self) { - state Reference dataClusterDb = wait(openDatabase(self->connectionString)); - state Reference tr = dataClusterDb->createTransaction(); - loop { - try { - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - - state Future>> existingTenantsFuture = - TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, 1); - state ThreadFuture existingDataFuture = tr->getRange(normalKeys, 1); - state Future tombstoneFuture = - MetaclusterMetadata::registrationTombstones().exists(tr, self->clusterEntry.id); - - // Check whether this cluster has already been registered - state Optional existingRegistration = - wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); - if (existingRegistration.present()) { - if (existingRegistration.get().clusterType != ClusterType::METACLUSTER_DATA || - existingRegistration.get().name != self->clusterName || - !existingRegistration.get().matches(self->ctx.metaclusterRegistration.get()) || - existingRegistration.get().id != self->clusterEntry.id) { - throw cluster_already_registered(); - } else { - // We already successfully registered the cluster with these details, so there's nothing to - // do - return Void(); - } - } - - // Check if the cluster was removed concurrently - bool tombstone = wait(tombstoneFuture); - if (tombstone) { - throw cluster_removed(); - } - - // Check for any existing data - std::vector> existingTenants = - wait(safeThreadFutureToFuture(existingTenantsFuture)); - if (!existingTenants.empty()) { - TraceEvent(SevWarn, "CannotRegisterClusterWithTenants").detail("ClusterName", self->clusterName); - throw cluster_not_empty(); - } - - RangeResult existingData = wait(safeThreadFutureToFuture(existingDataFuture)); - if (!existingData.empty()) { - TraceEvent(SevWarn, "CannotRegisterClusterWithData").detail("ClusterName", self->clusterName); - throw cluster_not_empty(); - } - - MetaclusterMetadata::metaclusterRegistration().set( - tr, - self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, - self->clusterEntry.id)); - - // If we happen to have any orphaned restore IDs from a previous time this cluster was in a metacluster, - // erase them now. - MetaclusterMetadata::activeRestoreIds().clear(tr); - - wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); - - TraceEvent("ConfiguredDataCluster") - .detail("ClusterName", self->clusterName) - .detail("ClusterID", self->clusterEntry.id) - .detail("Capacity", self->clusterEntry.capacity) - .detail("Version", tr->getCommittedVersion()) - .detail("ConnectionString", self->connectionString.toString()); - - return Void(); - } catch (Error& e) { - wait(safeThreadFutureToFuture(tr->onError(e))); - } - } - } - - // Store the cluster entry for the new cluster - ACTOR static Future markClusterReady(RegisterClusterImpl* self, Reference tr) { - state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); - if (!dataClusterMetadata.present() || - dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { - throw cluster_removed(); - } else if (dataClusterMetadata.get().entry.id != self->clusterEntry.id) { - throw cluster_already_exists(); - } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::READY) { - return Void(); - } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { - throw cluster_restoring(); - } else { - ASSERT(dataClusterMetadata.get().entry.clusterState == DataClusterState::REGISTERING); - dataClusterMetadata.get().entry.clusterState = DataClusterState::READY; - - if (dataClusterMetadata.get().entry.hasCapacity()) { - ManagementClusterMetadata::clusterCapacityIndex.insert( - tr, Tuple::makeTuple(dataClusterMetadata.get().entry.allocated.numTenantGroups, self->clusterName)); - } - ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, dataClusterMetadata.get().entry); - ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); - } - - TraceEvent("RegisteredDataCluster") - .detail("ClusterName", self->clusterName) - .detail("ClusterID", self->clusterEntry.id) - .detail("Capacity", dataClusterMetadata.get().entry.capacity) - .detail("ConnectionString", self->connectionString.toString()); - - return Void(); - } - - ACTOR static Future run(RegisterClusterImpl* self) { - wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return registerInManagementCluster(self, tr); })); - - // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we - // don't have a metacluster registration on the data cluster - wait(configureDataCluster(self)); - wait(self->ctx.runManagementTransaction( - [self = self](Reference tr) { return markClusterReady(self, tr); })); - - return Void(); - } - Future run() { return run(this); } -}; - -ACTOR template -Future registerCluster(Reference db, - ClusterName name, - ClusterConnectionString connectionString, - DataClusterEntry entry) { - state RegisterClusterImpl impl(db, name, connectionString, entry); - wait(impl.run()); - return Void(); -} - template struct RemoveClusterImpl { MetaclusterOperationContext ctx; @@ -1190,6 +1013,183 @@ Future removeCluster(Reference db, return impl.dataClusterUpdated; } +template +struct RegisterClusterImpl { + MetaclusterOperationContext ctx; + + // Initialization parameters + ClusterName clusterName; + ClusterConnectionString connectionString; + DataClusterEntry clusterEntry; + + RegisterClusterImpl(Reference managementDb, + ClusterName clusterName, + ClusterConnectionString connectionString, + DataClusterEntry clusterEntry) + : ctx(managementDb), clusterName(clusterName), connectionString(connectionString), clusterEntry(clusterEntry) {} + + // Store the cluster entry for the new cluster in a registering state + ACTOR static Future registerInManagementCluster(RegisterClusterImpl* self, + Reference tr) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (!dataClusterMetadata.present()) { + self->clusterEntry.clusterState = DataClusterState::REGISTERING; + self->clusterEntry.allocated = ClusterUsage(); + self->clusterEntry.id = deterministicRandom()->randomUniqueID(); + + ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, self->clusterEntry); + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); + } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } else if (!dataClusterMetadata.get().matchesConfiguration( + DataClusterMetadata(self->clusterEntry, self->connectionString)) || + dataClusterMetadata.get().entry.clusterState != DataClusterState::REGISTERING) { + throw cluster_already_exists(); + } else { + self->clusterEntry = dataClusterMetadata.get().entry; + } + + TraceEvent("RegisteringDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", self->clusterEntry.id) + .detail("Capacity", self->clusterEntry.capacity) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } + + ACTOR static Future configureDataCluster(RegisterClusterImpl* self) { + state Reference dataClusterDb = wait(openDatabase(self->connectionString)); + state Reference tr = dataClusterDb->createTransaction(); + loop { + try { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + state Future>> existingTenantsFuture = + TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, 1); + state ThreadFuture existingDataFuture = tr->getRange(normalKeys, 1); + state Future tombstoneFuture = + MetaclusterMetadata::registrationTombstones().exists(tr, self->clusterEntry.id); + + // Check whether this cluster has already been registered + state Optional existingRegistration = + wait(MetaclusterMetadata::metaclusterRegistration().get(tr)); + if (existingRegistration.present()) { + if (existingRegistration.get().clusterType != ClusterType::METACLUSTER_DATA || + existingRegistration.get().name != self->clusterName || + !existingRegistration.get().matches(self->ctx.metaclusterRegistration.get()) || + existingRegistration.get().id != self->clusterEntry.id) { + throw cluster_already_registered(); + } else { + // We already successfully registered the cluster with these details, so there's nothing to + // do + return Void(); + } + } + + // Check if the cluster was removed concurrently + bool tombstone = wait(tombstoneFuture); + if (tombstone) { + throw cluster_removed(); + } + + // Check for any existing data + std::vector> existingTenants = + wait(safeThreadFutureToFuture(existingTenantsFuture)); + if (!existingTenants.empty()) { + TraceEvent(SevWarn, "CannotRegisterClusterWithTenants").detail("ClusterName", self->clusterName); + throw cluster_not_empty(); + } + + RangeResult existingData = wait(safeThreadFutureToFuture(existingDataFuture)); + if (!existingData.empty()) { + TraceEvent(SevWarn, "CannotRegisterClusterWithData").detail("ClusterName", self->clusterName); + throw cluster_not_empty(); + } + + MetaclusterMetadata::metaclusterRegistration().set( + tr, + self->ctx.metaclusterRegistration.get().toDataClusterRegistration(self->clusterName, + self->clusterEntry.id)); + + // If we happen to have any orphaned restore IDs from a previous time this cluster was in a metacluster, + // erase them now. + MetaclusterMetadata::activeRestoreIds().clear(tr); + + wait(buggifiedCommit(tr, BUGGIFY_WITH_PROB(0.1))); + + TraceEvent("ConfiguredDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", self->clusterEntry.id) + .detail("Capacity", self->clusterEntry.capacity) + .detail("Version", tr->getCommittedVersion()) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } catch (Error& e) { + wait(safeThreadFutureToFuture(tr->onError(e))); + } + } + } + + // Store the cluster entry for the new cluster + ACTOR static Future markClusterReady(RegisterClusterImpl* self, Reference tr) { + state Optional dataClusterMetadata = wait(tryGetClusterTransaction(tr, self->clusterName)); + if (!dataClusterMetadata.present() || + dataClusterMetadata.get().entry.clusterState == DataClusterState::REMOVING) { + throw cluster_removed(); + } else if (dataClusterMetadata.get().entry.id != self->clusterEntry.id) { + throw cluster_already_exists(); + } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::READY) { + return Void(); + } else if (dataClusterMetadata.get().entry.clusterState == DataClusterState::RESTORING) { + throw cluster_restoring(); + } else { + ASSERT(dataClusterMetadata.get().entry.clusterState == DataClusterState::REGISTERING); + dataClusterMetadata.get().entry.clusterState = DataClusterState::READY; + + if (dataClusterMetadata.get().entry.hasCapacity()) { + ManagementClusterMetadata::clusterCapacityIndex.insert( + tr, Tuple::makeTuple(dataClusterMetadata.get().entry.allocated.numTenantGroups, self->clusterName)); + } + ManagementClusterMetadata::dataClusters().set(tr, self->clusterName, dataClusterMetadata.get().entry); + ManagementClusterMetadata::dataClusterConnectionRecords.set(tr, self->clusterName, self->connectionString); + } + + TraceEvent("RegisteredDataCluster") + .detail("ClusterName", self->clusterName) + .detail("ClusterID", self->clusterEntry.id) + .detail("Capacity", dataClusterMetadata.get().entry.capacity) + .detail("ConnectionString", self->connectionString.toString()); + + return Void(); + } + + ACTOR static Future run(RegisterClusterImpl* self) { + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return registerInManagementCluster(self, tr); })); + + // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we + // don't have a metacluster registration on the data cluster + wait(configureDataCluster(self)); + wait(self->ctx.runManagementTransaction( + [self = self](Reference tr) { return markClusterReady(self, tr); })); + + return Void(); + } + Future run() { return run(this); } +}; + +ACTOR template +Future registerCluster(Reference db, + ClusterName name, + ClusterConnectionString connectionString, + DataClusterEntry entry) { + state RegisterClusterImpl impl(db, name, connectionString, entry); + wait(impl.run()); + return Void(); +} + ACTOR template Future> listClustersTransaction(Transaction tr, ClusterNameRef begin, From 91df95e8165f592db417e4d44409b730ccdc90fb Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 21 Feb 2023 19:35:00 -0800 Subject: [PATCH 09/11] If registering a cluster fails on the data cluster, try to rollback the registration on the management cluster --- .../fdbclient/MetaclusterManagement.actor.h | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 9ed7c23f3a..3258f1916b 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -683,6 +683,10 @@ struct RemoveClusterImpl { ForceRemove forceRemove; double dataClusterTimeout; + // Optional parameters that are set by internal users + Optional clusterId; + std::set legalClusterStates; + // Parameters set in markClusterRemoving Optional lastTenantId; @@ -705,6 +709,12 @@ struct RemoveClusterImpl { state DataClusterMetadata clusterMetadata = wait(getClusterTransaction(tr, self->clusterName)); wait(self->ctx.setCluster(tr, self->clusterName)); + if ((self->clusterId.present() && clusterMetadata.entry.id != self->clusterId.get()) || + (!self->legalClusterStates.empty() && + !self->legalClusterStates.count(clusterMetadata.entry.clusterState))) { + throw cluster_already_exists(); + } + if (!self->forceRemove && self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups > 0) { throw cluster_not_empty(); } else if (self->ctx.dataClusterMetadata.get().entry.clusterState != DataClusterState::REMOVING) { @@ -1166,12 +1176,33 @@ struct RegisterClusterImpl { } ACTOR static Future run(RegisterClusterImpl* self) { + // Used if we need to rollback + state RemoveClusterImpl removeCluster( + self->ctx.managementDb, self->clusterName, ClusterType::METACLUSTER_MANAGEMENT, ForceRemove::True, 5.0); + wait(self->ctx.runManagementTransaction( [self = self](Reference tr) { return registerInManagementCluster(self, tr); })); // Don't use ctx to run this transaction because we have not set up the data cluster metadata on it and we // don't have a metacluster registration on the data cluster - wait(configureDataCluster(self)); + try { + wait(configureDataCluster(self)); + } catch (Error& e) { + state Error error = e; + try { + // Attempt to unregister the cluster if we could not configure the data cluster. We should only do this + // if the data cluster state matches our ID and is in the REGISTERING in case somebody else has + // attempted to complete the registration or start a new one. + removeCluster.clusterId = self->clusterEntry.id; + removeCluster.legalClusterStates.insert(DataClusterState::REGISTERING); + wait(removeCluster.run()); + } catch (Error& e) { + // Removing the cluster after failing to register the data cluster is a best effort attempt. If it + // fails, the operator will need to remove it (or re-register it) themselves. + } + throw error; + } + wait(self->ctx.runManagementTransaction( [self = self](Reference tr) { return markClusterReady(self, tr); })); From a5a8c57a38a17fc9b2885920f837a808df031ab4 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 22 Feb 2023 12:52:58 -0800 Subject: [PATCH 10/11] Fix some merge issues and missing updates to use new boolean parameters --- fdbcli/MetaclusterCommands.actor.cpp | 2 +- .../include/fdbserver/workloads/MetaclusterData.actor.h | 2 +- fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbcli/MetaclusterCommands.actor.cpp b/fdbcli/MetaclusterCommands.actor.cpp index a600727291..4eb6c11175 100644 --- a/fdbcli/MetaclusterCommands.actor.cpp +++ b/fdbcli/MetaclusterCommands.actor.cpp @@ -171,13 +171,13 @@ ACTOR Future metaclusterRemoveCommand(Reference db, std::vector } state ClusterNameRef clusterName = tokens[tokens.size() - 1]; - state ForceRemove forceRemove = ForceRemove(tokens.size() == 4); state ClusterType clusterType = wait(runTransaction(db, [](Reference tr) { tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); return TenantAPI::getClusterType(tr); })); + ForceRemove forceRemove(tokens.size() == 4); if (clusterType == ClusterType::METACLUSTER_DATA && !forceRemove) { if (tokens[2] == "FORCE"_sr) { fmt::print("ERROR: a cluster name must be specified.\n"); diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterData.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterData.actor.h index 7bf8ee2e46..27fec348fb 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterData.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterData.actor.h @@ -164,7 +164,7 @@ private: ASSERT_EQ(t.size(), 3); TenantName tenantName = t.getString(1); int64_t tenantId = t.getInt(2); - MetaclusterTenantMapEntry const& entry = self->managementMetadata.tenantMap[tenantId]; + MetaclusterTenantMapEntry const& entry = self->managementMetadata.tenantData.tenantMap[tenantId]; bool renaming = entry.tenantState == MetaclusterAPI::TenantState::RENAMING && entry.renameDestination == tenantName; ASSERT(tenantName == entry.tenantName || renaming); diff --git a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp index 058d8498a7..54df69b326 100644 --- a/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp +++ b/fdbserver/workloads/MetaclusterManagementWorkload.actor.cpp @@ -219,7 +219,7 @@ struct MetaclusterManagementWorkload : TestWorkload { try { loop { Future removeFuture = MetaclusterAPI::removeCluster( - self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, detachCluster); + self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, ForceRemove(detachCluster)); try { Optional result = wait(timeout(removeFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { @@ -288,7 +288,7 @@ struct MetaclusterManagementWorkload : TestWorkload { dataDb->db->getConnectionRecord()->getConnectionString(), ApplyManagementClusterUpdates::True, RestoreDryRun(dryRun), - ForceJoinNewMetacluster(forceJoin), + ForceJoin(forceJoin), &messages); Optional result = wait(timeout(restoreFuture, deterministicRandom()->randomInt(1, 30))); if (result.present()) { @@ -1001,7 +1001,7 @@ struct MetaclusterManagementWorkload : TestWorkload { std::vector> removeClusterFutures; for (auto [clusterName, clusterMetadata] : dataClusters) { removeClusterFutures.push_back(success(MetaclusterAPI::removeCluster( - self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, !deleteTenants))); + self->managementDb, clusterName, ClusterType::METACLUSTER_MANAGEMENT, ForceRemove(!deleteTenants)))); } wait(waitForAll(removeClusterFutures)); From 33431f062d684c2cacfdf59270d2ed45a940e990 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 22 Feb 2023 14:39:51 -0800 Subject: [PATCH 11/11] Add some trace events, use a more appropriate error, and improve a check of allocated tenant groups --- .../include/fdbclient/MetaclusterManagement.actor.h | 10 +++++++++- .../fdbserver/workloads/MetaclusterConsistency.actor.h | 5 +++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h index 3258f1916b..0439d97299 100644 --- a/fdbclient/include/fdbclient/MetaclusterManagement.actor.h +++ b/fdbclient/include/fdbclient/MetaclusterManagement.actor.h @@ -712,7 +712,9 @@ struct RemoveClusterImpl { if ((self->clusterId.present() && clusterMetadata.entry.id != self->clusterId.get()) || (!self->legalClusterStates.empty() && !self->legalClusterStates.count(clusterMetadata.entry.clusterState))) { - throw cluster_already_exists(); + // The type of error is currently ignored, and this is only used to terminate the remove operation. + // If that changes in the future, we may want to introduce a more suitable error type. + throw operation_failed(); } if (!self->forceRemove && self->ctx.dataClusterMetadata.get().entry.allocated.numTenantGroups > 0) { @@ -1196,9 +1198,15 @@ struct RegisterClusterImpl { removeCluster.clusterId = self->clusterEntry.id; removeCluster.legalClusterStates.insert(DataClusterState::REGISTERING); wait(removeCluster.run()); + TraceEvent("RegisterClusterRolledBack") + .detail("ClusterName", self->clusterName) + .detail("ConnectionString", self->connectionString.toString()); } catch (Error& e) { // Removing the cluster after failing to register the data cluster is a best effort attempt. If it // fails, the operator will need to remove it (or re-register it) themselves. + TraceEvent(SevWarn, "RegisterClusterRollbackFailed") + .detail("ClusterName", self->clusterName) + .detail("ConnectionString", self->connectionString.toString()); } throw error; } diff --git a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h index 53fdc7b21e..5db4a3d876 100644 --- a/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h +++ b/fdbserver/include/fdbserver/workloads/MetaclusterConsistency.actor.h @@ -98,10 +98,11 @@ private: auto allocatedItr = data.clusterAllocatedMap.find(clusterName); if (!clusterMetadata.entry.hasCapacity()) { ASSERT(allocatedItr == data.clusterAllocatedMap.end()); - } else if (clusterMetadata.entry.clusterState == DataClusterState::READY) { - ASSERT(allocatedItr != data.clusterAllocatedMap.end()); + } else if (allocatedItr != data.clusterAllocatedMap.end()) { ASSERT_EQ(allocatedItr->second, clusterMetadata.entry.allocated.numTenantGroups); ++numFoundInAllocatedMap; + } else { + ASSERT_NE(clusterMetadata.entry.clusterState, DataClusterState::READY); } // Check that the number of tenant groups in the cluster is smaller than the allocated number of tenant