From 1c5c8d26d8fd59a3e94eb072d4633d2275f045a9 Mon Sep 17 00:00:00 2001 From: Dmitry Wagin Date: Sat, 3 Sep 2022 23:21:01 +0300 Subject: [PATCH 01/34] Fixed FreeBSD build --- fdbserver/fdbserver.actor.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index cbcfd04e45..c15b0b0896 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -88,7 +88,11 @@ #if defined(__linux__) || defined(__FreeBSD__) #include #include +#if defined(__linux__) #include +#elif defined(__FreeBSD__) +#include +#endif #ifdef ALLOC_INSTRUMENTATION #include #endif @@ -2394,10 +2398,15 @@ int main(int argc, char* argv[]) { g_network->getLocalAddress(), opts.rollsize, opts.maxLogsSize, opts.logFolder, "trace", opts.logGroup); auto m = startSystemMonitor(opts.dataFolder, opts.dcId, opts.zoneId, opts.zoneId); TraceEvent(SevDebug, "StartingFlowProcess").detail("FlowProcessName", opts.flowProcessName); -#if defined(__linux__) || defined(__FreeBSD__) +#if defined(__linux__) prctl(PR_SET_PDEATHSIG, SIGTERM); if (getppid() == 1) /* parent already died before prctl */ flushAndExit(FDB_EXIT_SUCCESS); +#elif defined(__FreeBSD__) + const int sig = SIGTERM; + procctl(P_PID, 0, PROC_PDEATHSIG_CTL, (void *)&sig); + if (getppid() == 1) /* parent already died before procctl */ + flushAndExit(FDB_EXIT_SUCCESS); #endif if (opts.flowProcessName == "KeyValueStoreProcess") { From 0e86d32a8796a90adcab5a13eb25e55db9360053 Mon Sep 17 00:00:00 2001 From: Dmitry Wagin Date: Sat, 3 Sep 2022 23:23:18 +0300 Subject: [PATCH 02/34] Fixed FreeBSD build N2 --- fdbserver/FDBExecHelper.actor.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index a8110f438e..5c88ff19ac 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -18,6 +18,14 @@ * limitations under the License. */ +#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__INTEL_COMPILER) +#define BOOST_SYSTEM_NO_LIB +#define BOOST_DATE_TIME_NO_LIB +#define BOOST_REGEX_NO_LIB +#include +#endif +#include + #include "flow/TLSConfig.actor.h" #include "flow/Trace.h" #include "flow/Platform.h" @@ -34,14 +42,6 @@ #include "fdbserver/Knobs.h" #include "fdbserver/RemoteIKeyValueStore.actor.h" -#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__INTEL_COMPILER) -#define BOOST_SYSTEM_NO_LIB -#define BOOST_DATE_TIME_NO_LIB -#define BOOST_REGEX_NO_LIB -#include -#endif -#include - #include "flow/actorcompiler.h" // This must be the last #include. ExecCmdValueString::ExecCmdValueString(StringRef pCmdValueString) { From db26b3a1f795a5e0dfde7bc1d670e8c8bd247a79 Mon Sep 17 00:00:00 2001 From: Dmitry Wagin Date: Sun, 4 Sep 2022 12:58:35 +0300 Subject: [PATCH 03/34] Fixed code style --- fdbserver/fdbserver.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index c15b0b0896..f18c239be9 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -2404,7 +2404,7 @@ int main(int argc, char* argv[]) { flushAndExit(FDB_EXIT_SUCCESS); #elif defined(__FreeBSD__) const int sig = SIGTERM; - procctl(P_PID, 0, PROC_PDEATHSIG_CTL, (void *)&sig); + procctl(P_PID, 0, PROC_PDEATHSIG_CTL, (void*)&sig); if (getppid() == 1) /* parent already died before procctl */ flushAndExit(FDB_EXIT_SUCCESS); #endif From 97fd5878d9ed014a4d4a28ac52c34d48c40cf1b8 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 20 Sep 2022 10:55:23 -0700 Subject: [PATCH 04/34] change DDTeamCollection constructor --- fdbserver/DDTeamCollection.actor.cpp | 23 +++++++++++-------- fdbserver/DataDistribution.actor.cpp | 4 ++-- .../include/fdbserver/DDTeamCollection.h | 3 ++- fdbserver/include/fdbserver/DDTxnProcessor.h | 5 ++++ 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 5507adeed8..1a08c9f097 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -3581,7 +3581,7 @@ bool DDTeamCollection::satisfiesPolicy(const std::vector return result && resultEntries.size() == 0; } -DDTeamCollection::DDTeamCollection(Database const& cx, +DDTeamCollection::DDTeamCollection(const std::shared_ptr& dbProcessor, UID distributorId, MoveKeysLock const& lock, PromiseStream const& output, @@ -3597,9 +3597,9 @@ DDTeamCollection::DDTeamCollection(Database const& cx, PromiseStream getShardMetrics, Promise removeFailedServer, PromiseStream> getUnhealthyRelocationCount) - : doBuildTeams(true), lastBuildTeamsFailed(false), teamBuilder(Void()), lock(lock), output(output), - unhealthyServers(0), storageWiggler(makeReference(this)), processingWiggle(processingWiggle), - shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), + : dbProcessor(dbProcessor), doBuildTeams(true), lastBuildTeamsFailed(false), teamBuilder(Void()), + lock(lock), output(output), unhealthyServers(0), storageWiggler(makeReference(this)), + processingWiggle(processingWiggle), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), initialFailureReactionDelay( delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskPriority::DataDistribution)), initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay)), recruitingStream(0), @@ -3615,8 +3615,13 @@ DDTeamCollection::DDTeamCollection(Database const& cx, teamCollectionInfoEventHolder(makeReference("TeamCollectionInfo")), storageServerRecruitmentEventHolder( makeReference("StorageServerRecruitment_" + distributorId.toString())), - primary(primary), distributorId(distributorId), cx(cx), configuration(configuration), + primary(primary), distributorId(distributorId), configuration(configuration), storageServerSet(new LocalityMap()) { + + if (!dbProcessor->isMocked()) { + cx = this->dbProcessor->getDb(); + } + if (!primary || configuration.usableRegions == 1) { TraceEvent("DDTrackerStarting", distributorId) .detail("State", "Inactive") @@ -5147,13 +5152,13 @@ public: int processCount) { Database database = DatabaseContext::create( makeReference>(), Never(), LocalityData(), EnableLocalityLoadBalance::False); - + auto txnProcessor = std::shared_ptr(new DDTxnProcessor(database)); DatabaseConfiguration conf; conf.storageTeamSize = teamSize; conf.storagePolicy = policy; auto collection = - std::unique_ptr(new DDTeamCollection(database, + std::unique_ptr(new DDTeamCollection(txnProcessor, UID(0, 0), MoveKeysLock(), PromiseStream(), @@ -5191,13 +5196,13 @@ public: int processCount) { Database database = DatabaseContext::create( makeReference>(), Never(), LocalityData(), EnableLocalityLoadBalance::False); - + auto txnProcessor = std::shared_ptr(new DDTxnProcessor(database)); DatabaseConfiguration conf; conf.storageTeamSize = teamSize; conf.storagePolicy = policy; auto collection = - std::unique_ptr(new DDTeamCollection(database, + std::unique_ptr(new DDTeamCollection(txnProcessor, UID(0, 0), MoveKeysLock(), PromiseStream(), diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 4301584306..8e70b62ea4 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -679,7 +679,7 @@ ACTOR Future dataDistribution(Reference self, std::vector teamCollectionsPtrs; primaryTeamCollection = makeReference( - cx, + self->txnProcessor, self->ddId, self->lock, self->relocationProducer, @@ -700,7 +700,7 @@ ACTOR Future dataDistribution(Reference self, self->dbInfo, [](auto const& info) { return info.clusterInterface.recruitStorage; }); if (self->configuration.usableRegions > 1) { remoteTeamCollection = - makeReference(cx, + makeReference(self->txnProcessor, self->ddId, self->lock, self->relocationProducer, diff --git a/fdbserver/include/fdbserver/DDTeamCollection.h b/fdbserver/include/fdbserver/DDTeamCollection.h index c10ab86af5..9dc9faf326 100644 --- a/fdbserver/include/fdbserver/DDTeamCollection.h +++ b/fdbserver/include/fdbserver/DDTeamCollection.h @@ -603,6 +603,7 @@ class DDTeamCollection : public ReferenceCounted { int addTeamsBestOf(int teamsToBuild, int desiredTeams, int maxTeams); public: + std::shared_ptr dbProcessor; Database cx; DatabaseConfiguration configuration; @@ -620,7 +621,7 @@ public: AsyncTrigger printDetailedTeamsInfo; Reference storageServerSet; - DDTeamCollection(Database const& cx, + DDTeamCollection(const std::shared_ptr& dbProcessor, UID distributorId, MoveKeysLock const& lock, PromiseStream const& output, diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index 00410a54f8..75bee7b568 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -36,6 +36,8 @@ public: struct SourceServers { std::vector srcServers, completeSources; // the same as RelocateData.src, RelocateData.completeSources; }; + virtual Database getDb() const = 0; + virtual bool isMocked() const = 0; // get the source server list and complete source server list for range virtual Future getSourceServersForRange(const KeyRangeRef range) = 0; @@ -89,6 +91,9 @@ public: DDTxnProcessor() = default; explicit DDTxnProcessor(Database cx) : cx(cx) {} + Database getDb() const override { return cx; }; + bool isMocked() const override { return false; }; + Future getSourceServersForRange(const KeyRangeRef range) override; // Call NativeAPI implementation directly From f9e0230b86c9a83891584e7fec2efd38d94cb6c5 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 21 Sep 2022 10:56:22 -0700 Subject: [PATCH 05/34] DDQueue constructor with ITxnProcessor --- .../include/fdbclient/RunTransaction.actor.h | 17 +++++++++++++++++ fdbserver/DDRelocationQueue.actor.cpp | 11 ++++++----- fdbserver/DDTxnProcessor.actor.cpp | 16 +++++++++++++++- fdbserver/DataDistribution.actor.cpp | 2 +- fdbserver/include/fdbserver/DDTxnProcessor.h | 8 ++++++++ .../include/fdbserver/DataDistribution.actor.h | 4 ++-- 6 files changed, 49 insertions(+), 9 deletions(-) diff --git a/fdbclient/include/fdbclient/RunTransaction.actor.h b/fdbclient/include/fdbclient/RunTransaction.actor.h index 83c8ab2223..15966f3d61 100644 --- a/fdbclient/include/fdbclient/RunTransaction.actor.h +++ b/fdbclient/include/fdbclient/RunTransaction.actor.h @@ -98,5 +98,22 @@ Future()(Reference()) return result; } +ACTOR template +Future()(Reference()).getValue())> runReadOnlyTransactionNoRetry( + Reference db, + Function func) { + state Reference tr = db->createTransaction(); + try { + // func should be idempotent; otherwise, retry will get undefined result + state decltype(std::declval()(Reference()).getValue()) result = + wait(func(tr)); + return result; + } catch (Error& e_) { + state Error e = e_; + wait(safeThreadFutureToFuture(tr->onError(e))); + throw e; + } +} + #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index c5243af77c..27e4b0c7e4 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -728,7 +728,7 @@ struct DDQueue : public IDDRelocationQueue { DDQueue(UID mid, MoveKeysLock lock, - Database cx, + const std::shared_ptr& dbProcessor, std::vector teamCollections, Reference sABTF, Reference physicalShardCollection, @@ -739,7 +739,7 @@ struct DDQueue : public IDDRelocationQueue { FutureStream input, PromiseStream getShardMetrics, PromiseStream getTopKMetrics) - : IDDRelocationQueue(), distributorId(mid), lock(lock), cx(cx), txnProcessor(new DDTxnProcessor(cx)), + : IDDRelocationQueue(), distributorId(mid), lock(lock), cx(dbProcessor->getDb()), txnProcessor(dbProcessor), teamCollections(teamCollections), shardsAffectedByTeamFailure(sABTF), physicalShardCollection(physicalShardCollection), getAverageShardBytes(getAverageShardBytes), startMoveKeysParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM), @@ -1293,6 +1293,7 @@ struct DDQueue : public IDDRelocationQueue { // Schedules cancellation of a data move. void enqueueCancelledDataMove(UID dataMoveId, KeyRange range, const DDEnabledState* ddEnabledState) { + ASSERT(!txnProcessor->isMocked()); // the mock implementation currently doesn't support data move std::vector> cleanup; auto f = this->dataMoves.intersectingRanges(range); for (auto it = f.begin(); it != f.end(); ++it) { @@ -2016,7 +2017,7 @@ ACTOR Future rebalanceReadLoad(DDQueue* self, } // randomly choose topK shards int topK = std::max(1, std::min(int(0.1 * shards.size()), SERVER_KNOBS->READ_REBALANCE_SHARD_TOPK)); - state Future healthMetrics = self->cx->getHealthMetrics(true); + state Future healthMetrics = self->txnProcessor->getHealthMetrics(true); state GetTopKMetricsRequest req( shards, topK, (srcLoad - destLoad) * SERVER_KNOBS->READ_REBALANCE_MAX_SHARD_FRAC, srcLoad / shards.size()); state GetTopKMetricsReply reply = wait(brokenPromiseToNever(self->getTopKMetrics.getReply(req))); @@ -2462,7 +2463,7 @@ ACTOR Future BgDDValleyFiller(DDQueue* self, int teamCollectionIndex) { } } -ACTOR Future dataDistributionQueue(Database cx, +ACTOR Future dataDistributionQueue(std::shared_ptr dbProcessor, PromiseStream output, FutureStream input, PromiseStream getShardMetrics, @@ -2481,7 +2482,7 @@ ACTOR Future dataDistributionQueue(Database cx, const DDEnabledState* ddEnabledState) { state DDQueue self(distributorId, lock, - cx, + dbProcessor, teamCollections, shardsAffectedByTeamFailure, physicalShardCollection, diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index 962f69e73a..f4d52f964c 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -22,6 +22,8 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbserver/DataDistribution.actor.h" +#include "fdbclient/DatabaseContext.h" +#include "fdbclient/RunTransaction.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. class DDTxnProcessorImpl { @@ -493,4 +495,16 @@ Future DDTxnProcessor::isDataDistributionEnabled(const DDEnabledState* ddE Future DDTxnProcessor::pollMoveKeysLock(const MoveKeysLock& lock, const DDEnabledState* ddEnabledState) const { return DDTxnProcessorImpl::pollMoveKeysLock(cx, lock, ddEnabledState); -} \ No newline at end of file +} + +Future DDTxnProcessor::getHealthMetrics(bool detailed) const { + return cx->getHealthMetrics(detailed); +} + +Future> DDTxnProcessor::readRebalanceDDIgnoreKey() const { + return runReadOnlyTransactionNoRetry(cx.getReference(), [](Reference tr) { + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + return tr->get(rebalanceDDIgnoreKey); + }); +} diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 8e70b62ea4..3b57eaea50 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -644,7 +644,7 @@ ACTOR Future dataDistribution(Reference self, "DDTracker", self->ddId, &normalDDQueueErrors())); - actors.push_back(reportErrorsExcept(dataDistributionQueue(cx, + actors.push_back(reportErrorsExcept(dataDistributionQueue(self->txnProcessor, self->relocationProducer, self->relocationConsumer.getFuture(), getShardMetrics, diff --git a/fdbserver/include/fdbserver/DDTxnProcessor.h b/fdbserver/include/fdbserver/DDTxnProcessor.h index 75bee7b568..4f8c5da41d 100644 --- a/fdbserver/include/fdbserver/DDTxnProcessor.h +++ b/fdbserver/include/fdbserver/DDTxnProcessor.h @@ -78,6 +78,10 @@ public: const DDEnabledState* ddEnabledState) const = 0; virtual Future moveKeys(const MoveKeysParams& params) const = 0; + + virtual Future getHealthMetrics(bool detailed = false) const = 0; + + virtual Future> readRebalanceDDIgnoreKey() const { return {}; } }; class DDTxnProcessorImpl; @@ -134,6 +138,10 @@ public: } Future moveKeys(const MoveKeysParams& params) const override { return ::moveKeys(cx, params); } + + Future getHealthMetrics(bool detailed) const override; + + Future> readRebalanceDDIgnoreKey() const override; }; // A mock transaction implementation for test usage. diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 6340bb4daa..465a474665 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -514,14 +514,14 @@ ACTOR Future dataDistributionTracker(Reference in bool* trackerCancelled, Optional> ddTenantCache); -ACTOR Future dataDistributionQueue(Database cx, +ACTOR Future dataDistributionQueue(std::shared_ptr dbProcessor, PromiseStream output, FutureStream input, PromiseStream getShardMetrics, PromiseStream getTopKMetrics, Reference> processingUnhealthy, Reference> processingWiggle, - std::vector teamCollection, + std::vector teamCollections, Reference shardsAffectedByTeamFailure, Reference physicalShardCollection, MoveKeysLock lock, From c5366e1116b4eba43c527942ab8120344f450120 Mon Sep 17 00:00:00 2001 From: Dan Adkins Date: Wed, 21 Sep 2022 13:01:47 -0700 Subject: [PATCH 06/34] sim: don't allow calls to killProcess or killInterface to kill protected processes. killProcess_internal assumes that we've already checked and won't attempt to kill a protected process. It's up to killProcess and killInterface, which are invoked externally from simulated workloads, to enforce that constraint. --- fdbrpc/sim2.actor.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 9124a92880..7b8cdf5644 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1688,15 +1688,20 @@ public: } void killProcess(ProcessInfo* machine, KillType kt) override { TraceEvent("AttemptingKillProcess").detail("ProcessInfo", machine->toString()); - if (kt < RebootAndDelete) { + // Refuse to kill a protected process. + if (kt < RebootAndDelete && protectedAddresses.count(machine->address) == 0) { killProcess_internal(machine, kt); } } void killInterface(NetworkAddress address, KillType kt) override { if (kt < RebootAndDelete) { std::vector& processes = machines[addressMap[address]->locality.machineId()].processes; - for (int i = 0; i < processes.size(); i++) - killProcess_internal(processes[i], kt); + for (auto& process : processes) { + // Refuse to kill a protected process. + if (protectedAddresses.count(process->address) == 0) { + killProcess_internal(process, kt); + } + } } } bool killZone(Optional> zoneId, KillType kt, bool forceKill, KillType* ktFinal) override { From 9358aea0972d7ef1f6808cd3f7dc4e00362ce60e Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 21 Sep 2022 14:58:34 -0700 Subject: [PATCH 07/34] fix busy loop with correct error handling in valley filler --- .../include/fdbclient/RunTransaction.actor.h | 17 -- fdbserver/DDRelocationQueue.actor.cpp | 148 +++++++++--------- fdbserver/DDTxnProcessor.actor.cpp | 21 ++- 3 files changed, 89 insertions(+), 97 deletions(-) diff --git a/fdbclient/include/fdbclient/RunTransaction.actor.h b/fdbclient/include/fdbclient/RunTransaction.actor.h index 15966f3d61..83c8ab2223 100644 --- a/fdbclient/include/fdbclient/RunTransaction.actor.h +++ b/fdbclient/include/fdbclient/RunTransaction.actor.h @@ -98,22 +98,5 @@ Future()(Reference()) return result; } -ACTOR template -Future()(Reference()).getValue())> runReadOnlyTransactionNoRetry( - Reference db, - Function func) { - state Reference tr = db->createTransaction(); - try { - // func should be idempotent; otherwise, retry will get undefined result - state decltype(std::declval()(Reference()).getValue()) result = - wait(func(tr)); - return result; - } catch (Error& e_) { - state Error e = e_; - wait(safeThreadFutureToFuture(tr->onError(e))); - throw e; - } -} - #include "flow/unactorcompiler.h" #endif diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index 27e4b0c7e4..bfb3f31101 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -2165,16 +2165,35 @@ ACTOR Future getSrcDestTeams(DDQueue* self, return {}; } +ACTOR Future getSkipRebalanceValue(std::shared_ptr txnProcessor, bool readRebalance) { + Optional val = wait(txnProcessor->readRebalanceDDIgnoreKey()); + + bool skipCurrentLoop = false; + if (val.present()) { + // NOTE: check special value "" and "on" might written in old version < 7.2 + if (val.get().size() > 0 && val.get() != "on"_sr) { + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + if (readRebalance) { + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; + } else { + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; + } + } else { + skipCurrentLoop = true; + } + } + return skipCurrentLoop; +} + ACTOR Future BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, DataMovementReason reason) { - state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; - state Transaction tr(self->cx); + state int resetCount = 0; state double lastRead = 0; state bool skipCurrentLoop = false; - state Future delayF = Never(); state const bool readRebalance = isDataMovementForReadBalancing(reason); - state const char* eventName = - isDataMovementForMountainChopper(reason) ? "BgDDMountainChopper_New" : "BgDDValleyFiller_New"; + state const char* eventName = isDataMovementForMountainChopper(reason) ? "BgDDMountainChopper" : "BgDDValleyFiller"; state int ddPriority = dataMovementPriority(reason); + state double rebalancePollingInterval = 0; + loop { state bool moved = false; state Reference sourceTeam; @@ -2183,42 +2202,27 @@ ACTOR Future BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, Dat state GetTeamRequest destReq; state TraceEvent traceEvent(eventName, self->distributorId); traceEvent.suppressFor(5.0) - .detail("PollingInterval", SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL) + .detail("PollingInterval", rebalancePollingInterval) .detail("Rebalance", readRebalance ? "Read" : "Disk"); + // NOTE: the DD throttling relies on DDQueue, so here just trigger the balancer periodically + wait(delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch)); + + if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { + wait(store(skipCurrentLoop, getSkipRebalanceValue(self->txnProcessor, readRebalance))); + lastRead = now(); + } + traceEvent.detail("Enabled", !skipCurrentLoop); + + if (skipCurrentLoop) { + rebalancePollingInterval = + std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); + continue; + } else { + rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + } + try { - // NOTE: the DD throttling relies on DDQueue - delayF = delay(SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL, TaskPriority::DataDistributionLaunch); - if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - Optional val = wait(tr.get(rebalanceDDIgnoreKey)); - lastRead = now(); - if (!val.present()) { - skipCurrentLoop = false; - } else { - // NOTE: check special value "" and "on" might written in old version < 7.2 - if (val.get().size() > 0 && val.get() != "on"_sr) { - int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); - if (readRebalance) { - skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; - } else { - skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; - } - } else { - skipCurrentLoop = true; - } - } - } - - traceEvent.detail("Enabled", !skipCurrentLoop); - - wait(delayF); - if (skipCurrentLoop) { - tr.reset(); - continue; - } - traceEvent.detail("QueuedRelocations", self->priority_relocations[ddPriority]); if (self->priority_relocations[ddPriority] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { @@ -2254,11 +2258,11 @@ ACTOR Future BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, Dat } traceEvent.detail("ResetCount", resetCount); - tr.reset(); } catch (Error& e) { // Log actor_cancelled because it's not legal to suppress an event that's initialized traceEvent.errorUnsuppressed(e); - wait(tr.onError(e)); + traceEvent.log(); + throw; } traceEvent.detail("Moved", moved); @@ -2366,7 +2370,6 @@ ACTOR Future BgDDMountainChopper(DDQueue* self, int teamCollectionIndex) { ACTOR Future BgDDValleyFiller(DDQueue* self, int teamCollectionIndex) { state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; - state Transaction tr(self->cx); state double lastRead = 0; state bool skipCurrentLoop = false; @@ -2374,43 +2377,40 @@ ACTOR Future BgDDValleyFiller(DDQueue* self, int teamCollectionIndex) { state std::pair>, bool> randomTeam; state bool moved = false; state TraceEvent traceEvent("BgDDValleyFiller_Old", self->distributorId); + + wait(delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch)); + traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); - try { - state Future delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); - if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - Optional val = wait(tr.get(rebalanceDDIgnoreKey)); - lastRead = now(); - if (!val.present()) { - // reset loop interval - if (skipCurrentLoop) { - rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; - } - skipCurrentLoop = false; + if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { + Optional val = wait(self->txnProcessor->readRebalanceDDIgnoreKey()); + lastRead = now(); + if (!val.present()) { + // reset loop interval + if (skipCurrentLoop) { + rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + } + skipCurrentLoop = false; + } else { + // NOTE: check special value "" and "on" might written in old version < 7.2 + if (val.get().size() > 0 && val.get() != "on"_sr) { + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; } else { - // NOTE: check special value "" and "on" might written in old version < 7.2 - if (val.get().size() > 0 && val.get() != "on"_sr) { - int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); - skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; - } else { - skipCurrentLoop = true; - } + skipCurrentLoop = true; } } + } - traceEvent.detail("Enabled", !skipCurrentLoop); - - wait(delayF); - if (skipCurrentLoop) { - // set loop interval to avoid busy wait here. - rebalancePollingInterval = - std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); - tr.reset(); - continue; - } + traceEvent.detail("Enabled", !skipCurrentLoop); + if (skipCurrentLoop) { + // set loop interval to avoid busy wait here. + rebalancePollingInterval = + std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); + continue; + } + try { traceEvent.detail("QueuedRelocations", self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM]); if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] < @@ -2450,12 +2450,12 @@ ACTOR Future BgDDValleyFiller(DDQueue* self, int teamCollectionIndex) { } } } - - tr.reset(); } catch (Error& e) { // Log actor_cancelled because it's not legal to suppress an event that's initialized traceEvent.errorUnsuppressed(e); - wait(tr.onError(e)); + traceEvent.detail("Moved", moved); + traceEvent.log(); + throw; } traceEvent.detail("Moved", moved); diff --git a/fdbserver/DDTxnProcessor.actor.cpp b/fdbserver/DDTxnProcessor.actor.cpp index f4d52f964c..6a0d988ea7 100644 --- a/fdbserver/DDTxnProcessor.actor.cpp +++ b/fdbserver/DDTxnProcessor.actor.cpp @@ -23,7 +23,6 @@ #include "fdbclient/ManagementAPI.actor.h" #include "fdbserver/DataDistribution.actor.h" #include "fdbclient/DatabaseContext.h" -#include "fdbclient/RunTransaction.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. class DDTxnProcessorImpl { @@ -452,6 +451,20 @@ class DDTxnProcessorImpl { } } } + + ACTOR static Future> readRebalanceDDIgnoreKey(Database cx) { + state Transaction tr(cx); + loop { + try { + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + Optional res = wait(tr.get(rebalanceDDIgnoreKey)); + return res; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } }; Future DDTxnProcessor::getSourceServersForRange(const KeyRangeRef range) { @@ -502,9 +515,5 @@ Future DDTxnProcessor::getHealthMetrics(bool detailed) const { } Future> DDTxnProcessor::readRebalanceDDIgnoreKey() const { - return runReadOnlyTransactionNoRetry(cx.getReference(), [](Reference tr) { - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - return tr->get(rebalanceDDIgnoreKey); - }); + return DDTxnProcessorImpl::readRebalanceDDIgnoreKey(cx); } From 5500ec81262acba63d259d0875c7d2fd9628c520 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 21 Sep 2022 15:11:04 -0700 Subject: [PATCH 08/34] replace BgDDMountainChopper and BgDDValleyFiller with BgDDLoadRebalance --- fdbserver/DDRelocationQueue.actor.cpp | 201 +------------------------- 1 file changed, 2 insertions(+), 199 deletions(-) diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index bfb3f31101..9cac7ebd2e 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -2261,200 +2261,6 @@ ACTOR Future BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, Dat } catch (Error& e) { // Log actor_cancelled because it's not legal to suppress an event that's initialized traceEvent.errorUnsuppressed(e); - traceEvent.log(); - throw; - } - - traceEvent.detail("Moved", moved); - traceEvent.log(); - } -} - -ACTOR Future BgDDMountainChopper(DDQueue* self, int teamCollectionIndex) { - state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; - state Transaction tr(self->cx); - state double lastRead = 0; - state bool skipCurrentLoop = false; - loop { - state std::pair>, bool> randomTeam; - state bool moved = false; - state TraceEvent traceEvent("BgDDMountainChopper_Old", self->distributorId); - traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); - - try { - state Future delayF = delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch); - if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { - tr.setOption(FDBTransactionOptions::LOCK_AWARE); - tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - Optional val = wait(tr.get(rebalanceDDIgnoreKey)); - lastRead = now(); - if (!val.present()) { - // reset loop interval - if (skipCurrentLoop) { - rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; - } - skipCurrentLoop = false; - } else { - // NOTE: check special value "" and "on" might written in old version < 7.2 - if (val.get().size() > 0 && val.get() != "on"_sr) { - int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); - skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; - } else { - skipCurrentLoop = true; - } - } - } - - traceEvent.detail("Enabled", !skipCurrentLoop); - - wait(delayF); - if (skipCurrentLoop) { - // set loop interval to avoid busy wait here. - rebalancePollingInterval = - std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); - tr.reset(); - continue; - } - - traceEvent.detail("QueuedRelocations", - self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM]); - if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_OVERUTILIZED_TEAM] < - SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { - std::pair>, bool> _randomTeam = - wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply( - GetTeamRequest(WantNewServers::True, - WantTrueBest::False, - PreferLowerDiskUtil::True, - TeamMustHaveShards::False)))); - randomTeam = _randomTeam; - traceEvent.detail("DestTeam", - printable(randomTeam.first.map( - [](const Reference& team) { return team->getDesc(); }))); - - if (randomTeam.first.present()) { - std::pair>, bool> loadedTeam = - wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply( - GetTeamRequest(WantNewServers::True, - WantTrueBest::True, - PreferLowerDiskUtil::False, - TeamMustHaveShards::True)))); - - traceEvent.detail( - "SourceTeam", - printable(loadedTeam.first.map( - [](const Reference& team) { return team->getDesc(); }))); - - if (loadedTeam.first.present()) { - bool _moved = wait(rebalanceTeams(self, - DataMovementReason::REBALANCE_OVERUTILIZED_TEAM, - loadedTeam.first.get(), - randomTeam.first.get(), - teamCollectionIndex == 0, - &traceEvent)); - moved = _moved; - } - } - } - - tr.reset(); - } catch (Error& e) { - // Log actor_cancelled because it's not legal to suppress an event that's initialized - traceEvent.errorUnsuppressed(e); - wait(tr.onError(e)); - } - - traceEvent.detail("Moved", moved); - traceEvent.log(); - } -} - -ACTOR Future BgDDValleyFiller(DDQueue* self, int teamCollectionIndex) { - state double rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; - state double lastRead = 0; - state bool skipCurrentLoop = false; - - loop { - state std::pair>, bool> randomTeam; - state bool moved = false; - state TraceEvent traceEvent("BgDDValleyFiller_Old", self->distributorId); - - wait(delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch)); - - traceEvent.suppressFor(5.0).detail("PollingInterval", rebalancePollingInterval).detail("Rebalance", "Disk"); - - if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { - Optional val = wait(self->txnProcessor->readRebalanceDDIgnoreKey()); - lastRead = now(); - if (!val.present()) { - // reset loop interval - if (skipCurrentLoop) { - rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; - } - skipCurrentLoop = false; - } else { - // NOTE: check special value "" and "on" might written in old version < 7.2 - if (val.get().size() > 0 && val.get() != "on"_sr) { - int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); - skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; - } else { - skipCurrentLoop = true; - } - } - } - - traceEvent.detail("Enabled", !skipCurrentLoop); - if (skipCurrentLoop) { - // set loop interval to avoid busy wait here. - rebalancePollingInterval = - std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); - continue; - } - - try { - traceEvent.detail("QueuedRelocations", - self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM]); - if (self->priority_relocations[SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] < - SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { - std::pair>, bool> _randomTeam = - wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply( - GetTeamRequest(WantNewServers::True, - WantTrueBest::False, - PreferLowerDiskUtil::False, - TeamMustHaveShards::True)))); - randomTeam = _randomTeam; - traceEvent.detail("SourceTeam", - printable(randomTeam.first.map( - [](const Reference& team) { return team->getDesc(); }))); - - if (randomTeam.first.present()) { - std::pair>, bool> unloadedTeam = - wait(brokenPromiseToNever(self->teamCollections[teamCollectionIndex].getTeam.getReply( - GetTeamRequest(WantNewServers::True, - WantTrueBest::True, - PreferLowerDiskUtil::True, - TeamMustHaveShards::False)))); - - traceEvent.detail( - "DestTeam", - printable(unloadedTeam.first.map( - [](const Reference& team) { return team->getDesc(); }))); - - if (unloadedTeam.first.present()) { - bool _moved = wait(rebalanceTeams(self, - DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM, - randomTeam.first.get(), - unloadedTeam.first.get(), - teamCollectionIndex == 0, - &traceEvent)); - moved = _moved; - } - } - } - } catch (Error& e) { - // Log actor_cancelled because it's not legal to suppress an event that's initialized - traceEvent.errorUnsuppressed(e); - traceEvent.detail("Moved", moved); - traceEvent.log(); throw; } @@ -2504,15 +2310,12 @@ ACTOR Future dataDistributionQueue(std::shared_ptr dbProc state Future launchQueuedWorkTimeout = Never(); for (int i = 0; i < teamCollections.size(); i++) { - // FIXME: Use BgDDLoadBalance for disk rebalance too after DD simulation test proof. - // ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_OVERUTILIZED_TEAM)); - // ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM)); + ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_OVERUTILIZED_TEAM)); + ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_UNDERUTILIZED_TEAM)); if (SERVER_KNOBS->READ_SAMPLING_ENABLED) { ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_OVERUTIL_TEAM)); ddQueueFutures.push_back(BgDDLoadRebalance(&self, i, DataMovementReason::REBALANCE_READ_UNDERUTIL_TEAM)); } - ddQueueFutures.push_back(BgDDMountainChopper(&self, i)); - ddQueueFutures.push_back(BgDDValleyFiller(&self, i)); } ddQueueFutures.push_back(delayedAsyncVar(self.rawProcessingUnhealthy, processingUnhealthy, 0)); ddQueueFutures.push_back(delayedAsyncVar(self.rawProcessingWiggle, processingWiggle, 0)); From 5aa9a36559958aa4346d2c429e3f6c9410520805 Mon Sep 17 00:00:00 2001 From: Dan Adkins Date: Wed, 21 Sep 2022 15:18:36 -0700 Subject: [PATCH 09/34] Downgrade kill to reboot in Rollback workload. Indiscriminately killing machines can lead to unrecoverable clusters, even if we avoid killing the protected coordinators. --- fdbserver/workloads/Rollback.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/workloads/Rollback.actor.cpp b/fdbserver/workloads/Rollback.actor.cpp index cadff1a1f4..d9e0314925 100644 --- a/fdbserver/workloads/Rollback.actor.cpp +++ b/fdbserver/workloads/Rollback.actor.cpp @@ -104,7 +104,7 @@ struct RollbackWorkload : FailureInjectionWorkload { // Kill the proxy and clog the unclogged tlog if (self->enableFailures) { - g_simulator->killProcess(g_simulator->getProcessByAddress(proxy.address()), ISimulator::KillInstantly); + g_simulator->rebootProcess(g_simulator->getProcessByAddress(proxy.address()), ISimulator::Reboot); g_simulator->clogInterface(uncloggedTLog.ip, self->clogDuration, ClogAll); } else { g_simulator->clogInterface(proxy.address().ip, self->clogDuration, ClogAll); From aaed0c8b7ed8f32a36d71e6b96e7ae14d3d889ff Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 21 Sep 2022 17:57:40 -0700 Subject: [PATCH 10/34] Actor to DDQueue methods --- fdbserver/DDRelocationQueue.actor.cpp | 76 ++++++++++++++++++++------- 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index 9cac7ebd2e..48ad5f5ab0 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -1325,6 +1325,24 @@ struct DDQueue : public IDDRelocationQueue { } int getUnhealthyRelocationCount() override { return unhealthyRelocations; } + + Future getSrcDestTeams(const int& teamCollectionIndex, + const GetTeamRequest& srcReq, + const GetTeamRequest& destReq, + const int& priority, + TraceEvent* traceEvent); + + Future rebalanceReadLoad(DataMovementReason moveReason, + Reference sourceTeam, + Reference destTeam, + bool primary, + TraceEvent* traceEvent); + + Future rebalanceTeams(DataMovementReason moveReason, + Reference sourceTeam, + Reference destTeam, + bool primary, + TraceEvent* traceEvent); }; ACTOR Future cancelDataMove(struct DDQueue* self, KeyRange range, const DDEnabledState* ddEnabledState) { @@ -2165,6 +2183,29 @@ ACTOR Future getSrcDestTeams(DDQueue* self, return {}; } +Future DDQueue::getSrcDestTeams(const int& teamCollectionIndex, + const GetTeamRequest& srcReq, + const GetTeamRequest& destReq, + const int& priority, + TraceEvent* traceEvent) { + return ::getSrcDestTeams(this, teamCollectionIndex, srcReq, destReq, priority, traceEvent); +} +Future DDQueue::rebalanceReadLoad(DataMovementReason moveReason, + Reference sourceTeam, + Reference destTeam, + bool primary, + TraceEvent* traceEvent) { + return ::rebalanceReadLoad(this, moveReason, sourceTeam, destTeam, primary, traceEvent); +} + +Future DDQueue::rebalanceTeams(DataMovementReason moveReason, + Reference sourceTeam, + Reference destTeam, + bool primary, + TraceEvent* traceEvent) { + return ::rebalanceTeams(this, moveReason, sourceTeam, destTeam, primary, traceEvent); +} + ACTOR Future getSkipRebalanceValue(std::shared_ptr txnProcessor, bool readRebalance) { Optional val = wait(txnProcessor->readRebalanceDDIgnoreKey()); @@ -2207,22 +2248,21 @@ ACTOR Future BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, Dat // NOTE: the DD throttling relies on DDQueue, so here just trigger the balancer periodically wait(delay(rebalancePollingInterval, TaskPriority::DataDistributionLaunch)); - - if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { - wait(store(skipCurrentLoop, getSkipRebalanceValue(self->txnProcessor, readRebalance))); - lastRead = now(); - } - traceEvent.detail("Enabled", !skipCurrentLoop); - - if (skipCurrentLoop) { - rebalancePollingInterval = - std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); - continue; - } else { - rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; - } - try { + if ((now() - lastRead) > SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL) { + wait(store(skipCurrentLoop, getSkipRebalanceValue(self->txnProcessor, readRebalance))); + lastRead = now(); + } + traceEvent.detail("Enabled", !skipCurrentLoop); + + if (skipCurrentLoop) { + rebalancePollingInterval = + std::max(rebalancePollingInterval, SERVER_KNOBS->BG_REBALANCE_SWITCH_CHECK_INTERVAL); + continue; + } else { + rebalancePollingInterval = SERVER_KNOBS->BG_REBALANCE_POLLING_INTERVAL; + } + traceEvent.detail("QueuedRelocations", self->priority_relocations[ddPriority]); if (self->priority_relocations[ddPriority] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { @@ -2240,7 +2280,7 @@ ACTOR Future BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, Dat ForReadBalance(readRebalance), PreferLowerReadUtil::True); state Future getTeamFuture = - getSrcDestTeams(self, teamCollectionIndex, srcReq, destReq, ddPriority, &traceEvent); + self->getSrcDestTeams(teamCollectionIndex, srcReq, destReq, ddPriority, &traceEvent); wait(ready(getTeamFuture)); sourceTeam = getTeamFuture.get().first; destTeam = getTeamFuture.get().second; @@ -2248,9 +2288,9 @@ ACTOR Future BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, Dat // clang-format off if (sourceTeam.isValid() && destTeam.isValid()) { if (readRebalance) { - wait(store(moved,rebalanceReadLoad(self, reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); + wait(store(moved,self->rebalanceReadLoad( reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); } else { - wait(store(moved,rebalanceTeams(self, reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); + wait(store(moved,self->rebalanceTeams( reason, sourceTeam, destTeam, teamCollectionIndex == 0, &traceEvent))); } } // clang-format on From e7a280ec03bbeb3afe78b7c26ab4dbeb9cca2e4f Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 21 Sep 2022 20:49:39 -0700 Subject: [PATCH 11/34] format code --- fdbserver/DDTeamCollection.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 1a08c9f097..0df42fbeca 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -3597,8 +3597,8 @@ DDTeamCollection::DDTeamCollection(const std::shared_ptr& dbPro PromiseStream getShardMetrics, Promise removeFailedServer, PromiseStream> getUnhealthyRelocationCount) - : dbProcessor(dbProcessor), doBuildTeams(true), lastBuildTeamsFailed(false), teamBuilder(Void()), - lock(lock), output(output), unhealthyServers(0), storageWiggler(makeReference(this)), + : dbProcessor(dbProcessor), doBuildTeams(true), lastBuildTeamsFailed(false), teamBuilder(Void()), lock(lock), + output(output), unhealthyServers(0), storageWiggler(makeReference(this)), processingWiggle(processingWiggle), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), initialFailureReactionDelay( delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskPriority::DataDistribution)), From 129edebb912be9cad9452ceb74dc928f515659c2 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 22 Sep 2022 09:35:38 -0700 Subject: [PATCH 12/34] Limit BUILD_AWS_BACKUP to x86_64 for now With aarch64, in the current docker image, linking curl statically doesn't work yet. This is the diagnostic: ``` fdbclient/awssdk-build/curl/lib/strerror.c:32:6: error: #error "strerror_r MUST be either POSIX, glibc or vxworks-style" 2022-09-22 16:05:30 # error "strerror_r MUST be either POSIX, glibc or vxworks-style" 2022-09-22 16:05:30 ^~~~~ ``` The root cause is that curl's cmake feature detection logic gets confused since it can't build binaries that execute successfully with the link flags it wants: ``` $ cc test.c -lssl $ ./a.out ./a.out: error while loading shared libraries: libssl.so.1.1: cannot open shared object file: No such file or directory ``` --- cmake/FDBComponents.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/FDBComponents.cmake b/cmake/FDBComponents.cmake index 208ac2c3e3..292d58c454 100644 --- a/cmake/FDBComponents.cmake +++ b/cmake/FDBComponents.cmake @@ -232,7 +232,12 @@ set(COROUTINE_IMPL ${DEFAULT_COROUTINE_IMPL} CACHE STRING "Which coroutine imple set(BUILD_AWS_BACKUP OFF CACHE BOOL "Build AWS S3 SDK backup client") if (BUILD_AWS_BACKUP) - set(WITH_AWS_BACKUP ON) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(WITH_AWS_BACKUP ON) + else() + message(WARNING "BUILD_AWS_BACKUP set but ignored ${CMAKE_SYSTEM_PROCESSOR} is not supported yet") + set(WITH_AWS_BACKUP OFF) + endif() else() set(WITH_AWS_BACKUP OFF) endif() From 21c627ba973f5fe1bce1a17bf90d50e634f86e09 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Thu, 22 Sep 2022 10:08:54 -0500 Subject: [PATCH 13/34] Adding blobkey history fdbcli debug command --- fdbcli/BlobKeyCommand.actor.cpp | 188 ++++++++++++++++++ fdbcli/fdbcli.actor.cpp | 7 + fdbcli/include/fdbcli/fdbcli.actor.h | 5 + .../include/fdbclient/BlobGranuleCommon.h | 11 + .../fdbserver/BlobGranuleServerCommon.actor.h | 11 - 5 files changed, 211 insertions(+), 11 deletions(-) create mode 100644 fdbcli/BlobKeyCommand.actor.cpp diff --git a/fdbcli/BlobKeyCommand.actor.cpp b/fdbcli/BlobKeyCommand.actor.cpp new file mode 100644 index 0000000000..2f4af52017 --- /dev/null +++ b/fdbcli/BlobKeyCommand.actor.cpp @@ -0,0 +1,188 @@ +/* + * BlobKeyCommand.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbcli/fdbcli.actor.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. + +namespace { + +ACTOR Future printBlobHistory(Database db, Key key, Optional version) { + fmt::print("Printing blob history for {0}", key.printable()); + if (version.present()) { + fmt::print(" @ {0}", version.get()); + } + fmt::print("\n"); + + state Transaction tr(db); + state KeyRange activeGranule; + state KeyRange queryRange(KeyRangeRef(key, keyAfter(key))); + loop { + try { + Standalone> granules = wait(tr.getBlobGranuleRanges(queryRange, 2)); + if (granules.empty()) { + fmt::print("No active granule for {0}\n", key.printable()); + return false; + } + ASSERT(granules.size() == 1); + activeGranule = granules[0]; + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + fmt::print("Active granule: [{0} - {1})\n", activeGranule.begin.printable(), activeGranule.end.printable()); + + // get latest history entry for range + state GranuleHistory history; + loop { + try { + RangeResult result = + wait(tr.getRange(blobGranuleHistoryKeyRangeFor(activeGranule), 1, Snapshot::False, Reverse::True)); + ASSERT(result.size() <= 1); + + if (result.empty()) { + fmt::print("No history entry found\n"); + return true; + } + + std::pair decodedKey = decodeBlobGranuleHistoryKey(result[0].key); + ASSERT(activeGranule == decodedKey.first); + history = GranuleHistory(activeGranule, decodedKey.second, decodeBlobGranuleHistoryValue(result[0].value)); + + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + fmt::print("History:\n\n"); + loop { + // print history + std::string boundaryChangeAction; + if (history.value.parentVersions.empty()) { + boundaryChangeAction = "root"; + } else if (history.value.parentVersions.size() == 1) { + boundaryChangeAction = "split"; + } else { + boundaryChangeAction = "merge"; + } + fmt::print("{0}) {1}\n\t{2}\n\t{3}\n({4})\n\n", + history.version, + history.value.granuleID.toString(), + history.range.begin.printable(), + history.range.end.printable(), + boundaryChangeAction); + // traverse back + + if (history.value.parentVersions.empty() || (version.present() && history.version <= version.get())) { + break; + } + + int i; + for (i = 0; i < history.value.parentBoundaries.size(); i++) { + if (history.value.parentBoundaries[i] <= key) { + break; + } + } + // key should fall between boundaries + ASSERT(i < history.value.parentBoundaries.size()); + KeyRangeRef parentRange(history.value.parentBoundaries[i], history.value.parentBoundaries[i + 1]); + Version parentVersion = history.value.parentVersions[i]; + state Key parentHistoryKey = blobGranuleHistoryKeyFor(parentRange, parentVersion); + state bool foundParent; + + loop { + try { + Optional parentHistoryValue = wait(tr.get(parentHistoryKey)); + foundParent = parentHistoryValue.present(); + if (foundParent) { + std::pair decodedKey = decodeBlobGranuleHistoryKey(parentHistoryKey); + history = GranuleHistory( + decodedKey.first, decodedKey.second, decodeBlobGranuleHistoryValue(parentHistoryValue.get())); + } + break; + } catch (Error& e) { + wait(tr.onError(e)); + } + } + if (!foundParent) { + break; + } + } + + fmt::print("Done\n"); + return true; +} + +} // namespace + +namespace fdb_cli { + +ACTOR Future blobKeyCommandActor(Database localDb, + Optional tenantEntry, + std::vector tokens) { + // enables blob writing for the given range + if (tokens.size() != 3 && tokens.size() != 4) { + printUsage(tokens[0]); + return false; + } + + ASSERT(tokens[1] == "history"_sr); + + Key key; + Optional version; + + if (tenantEntry.present()) { + key = tokens[2].withPrefix(tenantEntry.get().prefix); + } else { + key = tokens[2]; + } + + if (tokens.size() > 3) { + Version v; + int n = 0; + if (sscanf(tokens[3].toString().c_str(), "%" PRId64 "%n", &v, &n) != 1 || n != tokens[3].size()) { + printUsage(tokens[0]); + return false; + } + version = v; + } + + if (key >= LiteralStringRef("\xff")) { + fmt::print("No blob history for system keyspace\n", key.printable()); + return false; + } else { + bool result = wait(printBlobHistory(localDb, key, version)); + return result; + } +} + +// can extend to other blobkey commands later +CommandFactory blobKeyFactory("blobkey", CommandHelp("blobkey history [version]", "", "")); +} // namespace fdb_cli diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index a2258bcdd5..5fef2de554 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -1390,6 +1390,13 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise, Reference changeFeedCommandActor(Database localDb, ACTOR Future blobRangeCommandActor(Database localDb, Optional tenantEntry, std::vector tokens); + +// blobkey command +ACTOR Future blobKeyCommandActor(Database localDb, + Optional tenantEntry, + std::vector tokens); // maintenance command ACTOR Future setHealthyZone(Reference db, StringRef zoneId, double seconds, bool printWarning = false); ACTOR Future clearHealthyZone(Reference db, diff --git a/fdbclient/include/fdbclient/BlobGranuleCommon.h b/fdbclient/include/fdbclient/BlobGranuleCommon.h index 394e5fadd0..a907fc897f 100644 --- a/fdbclient/include/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/include/fdbclient/BlobGranuleCommon.h @@ -276,6 +276,17 @@ struct BlobGranuleHistoryValue { } }; +struct GranuleHistory { + KeyRange range; + Version version; + Standalone value; + + GranuleHistory() {} + + GranuleHistory(KeyRange range, Version version, Standalone value) + : range(range), version(version), value(value) {} +}; + // A manifest to assist full fdb restore from blob granule files struct BlobManifest { constexpr static FileIdentifier file_identifier = 298872; diff --git a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h index 9b761393b7..dbd7195e76 100644 --- a/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h +++ b/fdbserver/include/fdbserver/BlobGranuleServerCommon.actor.h @@ -38,17 +38,6 @@ #include "flow/actorcompiler.h" // has to be last include -struct GranuleHistory { - KeyRange range; - Version version; - Standalone value; - - GranuleHistory() {} - - GranuleHistory(KeyRange range, Version version, Standalone value) - : range(range), version(version), value(value) {} -}; - // Stores info about a file in blob storage struct BlobFileIndex { Version version; From 949a5581ba4c0e8104125a5295ab794dc03f8c27 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Thu, 22 Sep 2022 15:50:17 -0500 Subject: [PATCH 14/34] fix format --- fdbcli/BlobKeyCommand.actor.cpp | 4 ++-- fdbcli/include/fdbcli/fdbcli.actor.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fdbcli/BlobKeyCommand.actor.cpp b/fdbcli/BlobKeyCommand.actor.cpp index 2f4af52017..06269d69d3 100644 --- a/fdbcli/BlobKeyCommand.actor.cpp +++ b/fdbcli/BlobKeyCommand.actor.cpp @@ -145,8 +145,8 @@ ACTOR Future printBlobHistory(Database db, Key key, Optional vers namespace fdb_cli { ACTOR Future blobKeyCommandActor(Database localDb, - Optional tenantEntry, - std::vector tokens) { + Optional tenantEntry, + std::vector tokens) { // enables blob writing for the given range if (tokens.size() != 3 && tokens.size() != 4) { printUsage(tokens[0]); diff --git a/fdbcli/include/fdbcli/fdbcli.actor.h b/fdbcli/include/fdbcli/fdbcli.actor.h index 757dfd4ac1..6a73705534 100644 --- a/fdbcli/include/fdbcli/fdbcli.actor.h +++ b/fdbcli/include/fdbcli/fdbcli.actor.h @@ -211,8 +211,8 @@ ACTOR Future blobRangeCommandActor(Database localDb, // blobkey command ACTOR Future blobKeyCommandActor(Database localDb, - Optional tenantEntry, - std::vector tokens); + Optional tenantEntry, + std::vector tokens); // maintenance command ACTOR Future setHealthyZone(Reference db, StringRef zoneId, double seconds, bool printWarning = false); ACTOR Future clearHealthyZone(Reference db, From 2e41eb8a4a1a66644bc94e43f1cd02d9dcbd3a35 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 22 Sep 2022 15:17:04 -0700 Subject: [PATCH 15/34] Move fdbcli tests into the fdbcli directory --- bindings/python/CMakeLists.txt | 35 ------------------ fdbcli/CMakeLists.txt | 36 +++++++++++++++++++ .../python => fdbcli}/tests/fdbcli_tests.py | 0 3 files changed, 36 insertions(+), 35 deletions(-) rename {bindings/python => fdbcli}/tests/fdbcli_tests.py (100%) diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index 4f5022aa4e..af281a7405 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -75,38 +75,3 @@ add_custom_command(OUTPUT ${package_file} add_custom_target(python_package DEPENDS ${package_file}) add_dependencies(python_package python_binding) add_dependencies(packages python_package) - -if (NOT WIN32 AND NOT OPEN_FOR_IDE) - add_fdbclient_test( - NAME single_process_fdbcli_tests - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - ) - add_fdbclient_test( - NAME multi_process_fdbcli_tests - PROCESS_NUMBER 5 - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - 5 - ) - if (TARGET external_client) # external_client copies fdb_c to bindings/c/libfdb_c_external.so - add_fdbclient_test( - NAME single_process_external_client_fdbcli_tests - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so - ) - add_fdbclient_test( - NAME multi_process_external_client_fdbcli_tests - PROCESS_NUMBER 5 - COMMAND ${CMAKE_SOURCE_DIR}/bindings/python/tests/fdbcli_tests.py - ${CMAKE_BINARY_DIR} - @CLUSTER_FILE@ - 5 - --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so - ) - endif() -endif() diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt index d2966df807..c25c335a15 100644 --- a/fdbcli/CMakeLists.txt +++ b/fdbcli/CMakeLists.txt @@ -1,3 +1,4 @@ +include(AddFdbTest) fdb_find_sources(FDBCLI_SRCS) add_flow_target(EXECUTABLE NAME fdbcli SRCS ${FDBCLI_SRCS}) @@ -23,3 +24,38 @@ if(NOT OPEN_FOR_IDE) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbcli DESTINATION bin COMPONENT clients) endif() endif() + +if (NOT WIN32 AND NOT OPEN_FOR_IDE) + add_dependencies(fdbcli external_client) + + add_fdbclient_test( + NAME single_process_fdbcli_tests + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + ) + add_fdbclient_test( + NAME multi_process_fdbcli_tests + PROCESS_NUMBER 5 + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + 5 + ) + add_fdbclient_test( + NAME single_process_external_client_fdbcli_tests + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so + ) + add_fdbclient_test( + NAME multi_process_external_client_fdbcli_tests + PROCESS_NUMBER 5 + COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/fdbcli_tests.py + ${CMAKE_BINARY_DIR} + @CLUSTER_FILE@ + 5 + --external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so + ) +endif() diff --git a/bindings/python/tests/fdbcli_tests.py b/fdbcli/tests/fdbcli_tests.py similarity index 100% rename from bindings/python/tests/fdbcli_tests.py rename to fdbcli/tests/fdbcli_tests.py From 339183228d58a359facd3a3adb5687c62463d453 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Thu, 22 Sep 2022 18:31:06 -0500 Subject: [PATCH 16/34] allowing ss feed fetches to go over hard limit (#8243) --- fdbserver/storageserver.actor.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index d9bc998cae..3e9baf493b 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -2978,8 +2978,10 @@ ACTOR Future changeFeedStreamQ(StorageServer* data, ChangeFeedStreamReques ++data->counters.feedStreamQueries; // FIXME: do something more sophisticated here besides hard limit - if (data->activeFeedQueries >= SERVER_KNOBS->STORAGE_FEED_QUERY_HARD_LIMIT || - (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.005))) { + // Allow other storage servers fetching feeds to go above this limit. currently, req.canReadPopped == read is a + // fetch from another ss + if (!req.canReadPopped && (data->activeFeedQueries >= SERVER_KNOBS->STORAGE_FEED_QUERY_HARD_LIMIT || + (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.005)))) { req.reply.sendError(storage_too_many_feed_streams()); ++data->counters.rejectedFeedStreamQueries; return Void(); From f78eb8c778f032d33487a7cb4dc9889a20e3eef5 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Thu, 22 Sep 2022 18:31:27 -0500 Subject: [PATCH 17/34] Adding bg read amp metrics (#8275) --- fdbclient/BlobGranuleFiles.cpp | 13 +++++++- fdbclient/NativeAPI.actor.cpp | 33 +++++++++++-------- fdbclient/ReadYourWrites.actor.cpp | 7 ++++ fdbclient/ThreadSafeTransaction.cpp | 8 ++++- .../include/fdbclient/BlobGranuleCommon.h | 7 ++++ .../include/fdbclient/BlobGranuleFiles.h | 3 +- fdbclient/include/fdbclient/DatabaseContext.h | 2 ++ .../include/fdbclient/IConfigTransaction.h | 1 + .../fdbclient/ISingleThreadTransaction.h | 1 + fdbclient/include/fdbclient/NativeAPI.actor.h | 2 ++ fdbclient/include/fdbclient/ReadYourWrites.h | 1 + 11 files changed, 61 insertions(+), 17 deletions(-) diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index 517174dc17..04edc8e90c 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -1545,7 +1545,8 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone loadAndMaterializeBlobGranules(const Standalone 1 @@ -1579,6 +1582,7 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone(blob_granule_file_load_error()); } + inputBytes += snapshotData.get().size(); } // +1 to avoid UBSAN variable length array of size zero @@ -1591,18 +1595,25 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone(blob_granule_file_load_error()); } + inputBytes += deltaData[i].size(); } + inputBytes += files[chunkIdx].newDeltas.expectedSize(); + // materialize rows from chunk chunkRows = materializeBlobGranule(files[chunkIdx], keyRange, beginVersion, readVersion, snapshotData, deltaData); + outputBytes += chunkRows.expectedSize(); + results.arena().dependsOn(chunkRows.arena()); results.append(results.arena(), chunkRows.begin(), chunkRows.size()); // free once done by forcing FreeHandles to trigger loadIds[chunkIdx].freeHandles.clear(); } + stats.inputBytes = inputBytes; + stats.outputBytes = outputBytes; return ErrorOr(results); } catch (Error& e) { return ErrorOr(e); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index be33bc73c2..b3cb5ae65c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1470,13 +1470,13 @@ DatabaseContext::DatabaseContext(ReferenceSHARD_STAT_SMOOTH_AMOUNT), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) {} @@ -8077,6 +8077,11 @@ Transaction::summarizeBlobGranules(const KeyRange& range, Optional summ return summarizeBlobGranulesActor(this, range, summaryVersion, rangeLimit); } +void Transaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) { + trState->cx->bgReadInputBytes += stats.inputBytes; + trState->cx->bgReadOutputBytes += stats.outputBytes; +} + ACTOR Future setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) { state ReadYourWritesTransaction tr(cx); state Version version = invalidVersion; diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 7f7a6dc41d..3597611838 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1842,6 +1842,13 @@ Future>> ReadYourWritesTransaction:: return waitOrError(tr.summarizeBlobGranules(range, summaryVersion, rangeLimit), resetPromise.getFuture()); } +void ReadYourWritesTransaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) { + if (checkUsedDuringCommit()) { + throw used_during_commit(); + } + tr.addGranuleMaterializeStats(stats); +} + void ReadYourWritesTransaction::addReadConflictRange(KeyRangeRef const& keys) { if (checkUsedDuringCommit()) { throw used_during_commit(); diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index 13736155e6..eec04e46e3 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -454,7 +454,13 @@ ThreadResult ThreadSafeTransaction::readBlobGranulesFinish( ReadBlobGranuleContext granuleContext) { // do this work off of fdb network threads for performance! Standalone> files = startFuture.get(); - return loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersion, granuleContext); + GranuleMaterializeStats stats; + auto ret = loadAndMaterializeBlobGranules(files, keyRange, beginVersion, readVersion, granuleContext, stats); + if (!ret.isError()) { + ISingleThreadTransaction* tr = this->tr; + onMainThreadVoid([tr, stats]() { tr->addGranuleMaterializeStats(stats); }); + } + return ret; } ThreadFuture>> ThreadSafeTransaction::summarizeBlobGranules( diff --git a/fdbclient/include/fdbclient/BlobGranuleCommon.h b/fdbclient/include/fdbclient/BlobGranuleCommon.h index a907fc897f..6f530f020d 100644 --- a/fdbclient/include/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/include/fdbclient/BlobGranuleCommon.h @@ -55,6 +55,13 @@ struct GranuleDeltas : VectorRef { } }; +struct GranuleMaterializeStats { + int64_t inputBytes; + int64_t outputBytes; + + GranuleMaterializeStats() : inputBytes(0), outputBytes(0) {} +}; + struct BlobGranuleCipherKeysMeta { EncryptCipherDomainId textDomainId; EncryptCipherBaseKeyId textBaseCipherId; diff --git a/fdbclient/include/fdbclient/BlobGranuleFiles.h b/fdbclient/include/fdbclient/BlobGranuleFiles.h index f6a159a7fa..23faff3d03 100644 --- a/fdbclient/include/fdbclient/BlobGranuleFiles.h +++ b/fdbclient/include/fdbclient/BlobGranuleFiles.h @@ -43,7 +43,8 @@ ErrorOr loadAndMaterializeBlobGranules(const Standalone getEstimatedRangeSizeBytes(KeyRange const& keys) override { throw client_invalid_operation(); } + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) override { throw client_invalid_operation(); } void addReadConflictRange(KeyRangeRef const& keys) override { throw client_invalid_operation(); } void makeSelfConflicting() override { throw client_invalid_operation(); } void atomicOp(KeyRef const& key, ValueRef const& operand, uint32_t operationType) override { diff --git a/fdbclient/include/fdbclient/ISingleThreadTransaction.h b/fdbclient/include/fdbclient/ISingleThreadTransaction.h index 095789f99e..71a0897693 100644 --- a/fdbclient/include/fdbclient/ISingleThreadTransaction.h +++ b/fdbclient/include/fdbclient/ISingleThreadTransaction.h @@ -88,6 +88,7 @@ public: virtual Future>> summarizeBlobGranules(KeyRange const& range, Optional summaryVersion, int rangeLimit) = 0; + virtual void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) = 0; virtual void addReadConflictRange(KeyRangeRef const& keys) = 0; virtual void makeSelfConflicting() = 0; virtual void atomicOp(KeyRef const& key, ValueRef const& operand, uint32_t operationType) = 0; diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 4fb2920667..82b6fd89cd 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -423,6 +423,8 @@ public: Optional summaryVersion, int rangeLimit); + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats); + // If checkWriteConflictRanges is true, existing write conflict ranges will be searched for this key void set(const KeyRef& key, const ValueRef& value, AddConflictRange = AddConflictRange::True); void atomicOp(const KeyRef& key, diff --git a/fdbclient/include/fdbclient/ReadYourWrites.h b/fdbclient/include/fdbclient/ReadYourWrites.h index 0d47457de8..659f7768ae 100644 --- a/fdbclient/include/fdbclient/ReadYourWrites.h +++ b/fdbclient/include/fdbclient/ReadYourWrites.h @@ -130,6 +130,7 @@ public: Future>> summarizeBlobGranules(const KeyRange& range, Optional summaryVersion, int rangeLimit) override; + void addGranuleMaterializeStats(const GranuleMaterializeStats& stats) override; void addReadConflictRange(KeyRangeRef const& keys) override; void makeSelfConflicting() override { tr.makeSelfConflicting(); } From 9122e6e5febf749100a7b616e1e95184b5791cb9 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Thu, 22 Sep 2022 18:31:55 -0500 Subject: [PATCH 18/34] fixing off by one (#8279) --- fdbserver/BlobWorker.actor.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index d9c10c4701..e14aeb0975 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -2437,10 +2437,13 @@ ACTOR Future blobGranuleUpdateFiles(Reference bwData, previousFuture = Future(BlobFileIndex()); } + // The last version included in the old change feed is startState.cfStartVersion - 1. + // So if the previous delta file did not include this version, and the new delta file does, the old + // change feed is considered complete. Optional> oldChangeFeedDataComplete; if (startState.splitParentGranule.present() && - metadata->pendingDeltaVersion < startState.changeFeedStartVersion && - lastDeltaVersion >= startState.changeFeedStartVersion) { + metadata->pendingDeltaVersion + 1 < startState.changeFeedStartVersion && + lastDeltaVersion + 1 >= startState.changeFeedStartVersion) { oldChangeFeedDataComplete = startState.splitParentGranule.get(); } From 430f6e96704d5bee74a1355c2372a3e24b0d9cc7 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Thu, 22 Sep 2022 18:32:33 -0500 Subject: [PATCH 19/34] Fix purge at latest racing with other checking threads at the end of BlobGranuleVerify (#8281) --- fdbserver/workloads/BlobGranuleVerifier.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp index 00b8d5a610..7825fda778 100644 --- a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp +++ b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp @@ -1218,7 +1218,7 @@ struct BlobGranuleVerifierWorkload : TestWorkload { } Future check(Database const& cx) override { - if (clientId == 0 || !doForcePurge) { + if (clientId == 0 || (!doForcePurge && !purgeAtLatest)) { return _check(cx, this); } return true; From c9b4fc57611eb20cbb64debfc58e693b72a48254 Mon Sep 17 00:00:00 2001 From: Hui Liu Date: Thu, 22 Sep 2022 12:48:24 -0700 Subject: [PATCH 20/34] disallow MoveKeysWorkload running in parallel --- .../fdbserver/workloads/workloads.actor.h | 7 +++- fdbserver/tester.actor.cpp | 37 ++++++++++--------- .../workloads/DiskFailureInjection.actor.cpp | 2 +- .../workloads/MachineAttrition.actor.cpp | 31 ++++++---------- fdbserver/workloads/RandomClogging.actor.cpp | 29 ++++++--------- fdbserver/workloads/RandomMoveKeys.actor.cpp | 7 ++++ fdbserver/workloads/Rollback.actor.cpp | 2 +- 7 files changed, 56 insertions(+), 59 deletions(-) diff --git a/fdbserver/include/fdbserver/workloads/workloads.actor.h b/fdbserver/include/fdbserver/workloads/workloads.actor.h index 854963bb93..6896e100f8 100644 --- a/fdbserver/include/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/include/fdbserver/workloads/workloads.actor.h @@ -101,8 +101,8 @@ struct NoOptions {}; struct FailureInjectionWorkload : TestWorkload { FailureInjectionWorkload(WorkloadContext const&); virtual ~FailureInjectionWorkload() {} - virtual bool add(DeterministicRandom& random, WorkloadRequest const& work, CompoundWorkload const& workload); - virtual void initFailureInjectionMode(DeterministicRandom& random, unsigned count); + virtual void initFailureInjectionMode(DeterministicRandom& random); + virtual bool shouldInject(DeterministicRandom& random, const WorkloadRequest& work, const unsigned count) const; Future setupInjectionWorkload(Database const& cx, Future done); Future startInjectionWorkload(Database const& cx, Future done); @@ -137,6 +137,9 @@ struct CompoundWorkload : TestWorkload { CompoundWorkload(WorkloadContext& wcx); CompoundWorkload* add(Reference&& w); void addFailureInjection(WorkloadRequest& work); + bool shouldInjectFailure(DeterministicRandom& random, + const WorkloadRequest& work, + Reference failureInjection) const; std::string description() const override; diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 9d96e14e84..92889bb596 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -399,13 +399,25 @@ void CompoundWorkload::addFailureInjection(WorkloadRequest& work) { if (disabledWorkloads.count(workload->description()) > 0) { continue; } - while (workload->add(random, work, *this)) { + while (shouldInjectFailure(random, work, workload)) { + workload->initFailureInjectionMode(random); failureInjection.push_back(workload); workload = factory->create(*this); } } } +bool CompoundWorkload::shouldInjectFailure(DeterministicRandom& random, + const WorkloadRequest& work, + Reference failure) const { + auto desc = failure->description(); + unsigned alreadyAdded = + std::count_if(workloads.begin(), workloads.end(), [&desc](auto const& w) { return w->description() == desc; }); + alreadyAdded += std::count_if( + failureInjection.begin(), failureInjection.end(), [&desc](auto const& w) { return w->description() == desc; }); + return failure->shouldInject(random, work, alreadyAdded); +} + Future> CompoundWorkload::getMetrics() { return getMetricsCompoundWorkload(this); } @@ -425,24 +437,13 @@ void TestWorkload::disableFailureInjectionWorkloads(std::set& out) FailureInjectionWorkload::FailureInjectionWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {} -bool FailureInjectionWorkload::add(DeterministicRandom& random, - const WorkloadRequest& work, - const CompoundWorkload& workload) { - auto desc = description(); - unsigned alreadyAdded = std::count_if(workload.workloads.begin(), workload.workloads.end(), [&desc](auto const& w) { - return w->description() == desc; - }); - alreadyAdded += std::count_if(workload.failureInjection.begin(), - workload.failureInjection.end(), - [&desc](auto const& w) { return w->description() == desc; }); - bool willAdd = alreadyAdded < 3 && work.useDatabase && 0.1 / (1 + alreadyAdded) > random.random01(); - if (willAdd) { - initFailureInjectionMode(random, alreadyAdded); - } - return willAdd; -} +void FailureInjectionWorkload::initFailureInjectionMode(DeterministicRandom& random) {} -void FailureInjectionWorkload::initFailureInjectionMode(DeterministicRandom& random, unsigned count) {} +bool FailureInjectionWorkload::shouldInject(DeterministicRandom& random, + const WorkloadRequest& work, + const unsigned alreadyAdded) const { + return alreadyAdded < 3 && work.useDatabase && 0.1 / (1 + alreadyAdded) > random.random01(); +} Future FailureInjectionWorkload::setupInjectionWorkload(const Database& cx, Future done) { return holdWhile(this->setup(cx), done); diff --git a/fdbserver/workloads/DiskFailureInjection.actor.cpp b/fdbserver/workloads/DiskFailureInjection.actor.cpp index 2a832674bd..63dd4063fb 100644 --- a/fdbserver/workloads/DiskFailureInjection.actor.cpp +++ b/fdbserver/workloads/DiskFailureInjection.actor.cpp @@ -66,7 +66,7 @@ struct DiskFailureInjectionWorkload : FailureInjectionWorkload { periodicBroadcastInterval = getOption(options, "periodicBroadcastInterval"_sr, periodicBroadcastInterval); } - void initFailureInjectionMode(DeterministicRandom& random, unsigned count) override { enabled = clientId == 0; } + void initFailureInjectionMode(DeterministicRandom& random) override { enabled = clientId == 0; } std::string description() const override { if (g_simulator == g_network) diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index fb6679feb3..6953a9541c 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -112,26 +112,10 @@ struct MachineAttritionWorkload : FailureInjectionWorkload { allowFaultInjection = getOption(options, "allowFaultInjection"_sr, allowFaultInjection); } - bool add(DeterministicRandom& random, WorkloadRequest const& work, CompoundWorkload const& workload) override { - auto desc = this->description(); - unsigned alreadyAdded = std::count_if(workload.workloads.begin(), - workload.workloads.end(), - [&desc](auto const& w) { return w->description() == desc; }); - alreadyAdded += std::count_if(workload.failureInjection.begin(), - workload.failureInjection.end(), - [&desc](auto const& w) { return w->description() == desc; }); - auto res = work.useDatabase && random.random01() < 1.0 / (2.0 + alreadyAdded); - if (res) { - initializeForInjection(random); - } - TraceEvent("AddingFailureInjection") - .detail("Reboot", reboot) - .detail("Replacement", replacement) - .detail("AllowFaultInjection", allowFaultInjection) - .detail("KillDC", killDc) - .detail("KillDataHall", killDatahall) - .detail("KillZone", killZone); - return res; + bool shouldInject(DeterministicRandom& random, + const WorkloadRequest& work, + const unsigned alreadyAdded) const override { + return work.useDatabase && random.random01() < 1.0 / (2.0 + alreadyAdded); } void initializeForInjection(DeterministicRandom& random) { @@ -152,6 +136,13 @@ struct MachineAttritionWorkload : FailureInjectionWorkload { killDatahall = dataHalls.size() > 0 && killDc && random.random01() < 0.5; killZone = zones.size() > 0 && random.random01() < 0.2; } + TraceEvent("AddingFailureInjection") + .detail("Reboot", reboot) + .detail("Replacement", replacement) + .detail("AllowFaultInjection", allowFaultInjection) + .detail("KillDC", killDc) + .detail("KillDataHall", killDatahall) + .detail("KillZone", killZone); } static std::vector getServers() { diff --git a/fdbserver/workloads/RandomClogging.actor.cpp b/fdbserver/workloads/RandomClogging.actor.cpp index f842983234..0cf6796fea 100644 --- a/fdbserver/workloads/RandomClogging.actor.cpp +++ b/fdbserver/workloads/RandomClogging.actor.cpp @@ -43,23 +43,18 @@ struct RandomCloggingWorkload : FailureInjectionWorkload { swizzleClog = getOption(options, "swizzle"_sr, swizzleClog); } - bool add(DeterministicRandom& random, WorkloadRequest const& work, CompoundWorkload const& workload) override { - auto desc = description(); - unsigned alreadyAdded = std::count_if(workload.workloads.begin(), - workload.workloads.end(), - [&desc](auto const& w) { return w->description() == desc; }); - alreadyAdded += std::count_if(workload.failureInjection.begin(), - workload.failureInjection.end(), - [&desc](auto const& w) { return w->description() == desc; }); - bool willAdd = work.useDatabase && 0.25 / (1 + alreadyAdded) > random.random01(); - if (willAdd) { - enabled = this->clientId == 0; - scale = std::max(random.random01(), 0.1); - clogginess = std::max(random.random01(), 0.1); - swizzleClog = random.random01() < 0.3; - iterate = random.random01() < 0.5; - } - return willAdd; + bool shouldInject(DeterministicRandom& random, + const WorkloadRequest& work, + const unsigned alreadyAdded) const override { + return work.useDatabase && 0.25 / (1 + alreadyAdded) > random.random01(); + } + + void initFailureInjectionMode(DeterministicRandom& random) override { + enabled = this->clientId == 0; + scale = std::max(random.random01(), 0.1); + clogginess = std::max(random.random01(), 0.1); + swizzleClog = random.random01() < 0.3; + iterate = random.random01() < 0.5; } std::string description() const override { diff --git a/fdbserver/workloads/RandomMoveKeys.actor.cpp b/fdbserver/workloads/RandomMoveKeys.actor.cpp index fc407efb1a..05b4ffd5c6 100644 --- a/fdbserver/workloads/RandomMoveKeys.actor.cpp +++ b/fdbserver/workloads/RandomMoveKeys.actor.cpp @@ -27,6 +27,7 @@ #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/QuietDatabase.h" +#include "flow/DeterministicRandom.h" #include "flow/actorcompiler.h" // This must be the last #include. struct MoveKeysWorkload : FailureInjectionWorkload { @@ -50,6 +51,12 @@ struct MoveKeysWorkload : FailureInjectionWorkload { Future setup(Database const& cx) override { return Void(); } Future start(Database const& cx) override { return _start(cx, this); } + bool shouldInject(DeterministicRandom& random, + const WorkloadRequest& work, + const unsigned alreadyAdded) const override { + return alreadyAdded < 1 && work.useDatabase && 0.1 / (1 + alreadyAdded) > random.random01(); + } + ACTOR Future _start(Database cx, MoveKeysWorkload* self) { if (self->enabled) { // Get the database configuration so as to use proper team size diff --git a/fdbserver/workloads/Rollback.actor.cpp b/fdbserver/workloads/Rollback.actor.cpp index cadff1a1f4..30f4d6e0dc 100644 --- a/fdbserver/workloads/Rollback.actor.cpp +++ b/fdbserver/workloads/Rollback.actor.cpp @@ -47,7 +47,7 @@ struct RollbackWorkload : FailureInjectionWorkload { multiple = getOption(options, "multiple"_sr, multiple); } - void initFailureInjectionMode(DeterministicRandom& random, unsigned count) override { + void initFailureInjectionMode(DeterministicRandom& random) override { enabled = clientId == 0; multiple = random.coinflip(); enableFailures = random.random01() < 0.2; From 48e1b06d291b3d81d7dd458ba8d42a04b36bee9c Mon Sep 17 00:00:00 2001 From: Dan Adkins Date: Thu, 22 Sep 2022 17:01:01 -0700 Subject: [PATCH 21/34] Add comments about the use of protectedAddresses. --- fdbrpc/include/fdbrpc/simulator.h | 6 ++++++ fdbserver/SimulatedCluster.actor.cpp | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/fdbrpc/include/fdbrpc/simulator.h b/fdbrpc/include/fdbrpc/simulator.h index 5228ff7b92..494de44452 100644 --- a/fdbrpc/include/fdbrpc/simulator.h +++ b/fdbrpc/include/fdbrpc/simulator.h @@ -451,7 +451,13 @@ public: int physicalDatacenters; int processesPerMachine; int listenersPerProcess; + + // We won't kill machines in this set, but we might reboot + // them. This is a conservatie mechanism to prevent the + // simulator from killing off imporant processes and rendering + // the cluster unrecoverable, e.g. a quorum of coordinators. std::set protectedAddresses; + std::map currentlyRebootingProcesses; std::vector extraDatabases; Reference storagePolicy; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 513990c581..55050f412c 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -2172,6 +2172,10 @@ void setupSimulatedSystem(std::vector>* systemActors, } ASSERT(coordinatorAddresses.size() > 0); + + // Mark a random majority of the coordinators as protected, so + // we won't accidently kill off a quorum and render the + // cluster unrecoverable. deterministicRandom()->randomShuffle(coordinatorAddresses); for (int i = 0; i < (coordinatorAddresses.size() / 2) + 1; i++) { TraceEvent("ProtectCoordinator") From 11a6cba2c6a0c89723fabbbe69358e148fa8fcca Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 22 Sep 2022 17:11:07 -0700 Subject: [PATCH 22/34] rename dbProcessor to db; readability improvement --- fdbserver/DDRelocationQueue.actor.cpp | 30 ++++++++++--------- fdbserver/DDTeamCollection.actor.cpp | 12 ++++---- .../include/fdbserver/DDTeamCollection.h | 4 +-- .../fdbserver/DataDistribution.actor.h | 2 +- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/fdbserver/DDRelocationQueue.actor.cpp b/fdbserver/DDRelocationQueue.actor.cpp index 48ad5f5ab0..0f944c5dae 100644 --- a/fdbserver/DDRelocationQueue.actor.cpp +++ b/fdbserver/DDRelocationQueue.actor.cpp @@ -728,7 +728,7 @@ struct DDQueue : public IDDRelocationQueue { DDQueue(UID mid, MoveKeysLock lock, - const std::shared_ptr& dbProcessor, + const std::shared_ptr& db, std::vector teamCollections, Reference sABTF, Reference physicalShardCollection, @@ -739,7 +739,7 @@ struct DDQueue : public IDDRelocationQueue { FutureStream input, PromiseStream getShardMetrics, PromiseStream getTopKMetrics) - : IDDRelocationQueue(), distributorId(mid), lock(lock), cx(dbProcessor->getDb()), txnProcessor(dbProcessor), + : IDDRelocationQueue(), distributorId(mid), lock(lock), cx(db->getDb()), txnProcessor(db), teamCollections(teamCollections), shardsAffectedByTeamFailure(sABTF), physicalShardCollection(physicalShardCollection), getAverageShardBytes(getAverageShardBytes), startMoveKeysParallelismLock(SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM), @@ -2209,20 +2209,22 @@ Future DDQueue::rebalanceTeams(DataMovementReason moveReason, ACTOR Future getSkipRebalanceValue(std::shared_ptr txnProcessor, bool readRebalance) { Optional val = wait(txnProcessor->readRebalanceDDIgnoreKey()); + if (!val.present()) + return false; + bool skipCurrentLoop = false; - if (val.present()) { - // NOTE: check special value "" and "on" might written in old version < 7.2 - if (val.get().size() > 0 && val.get() != "on"_sr) { - int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); - if (readRebalance) { - skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; - } else { - skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; - } + // NOTE: check special value "" and "on" might written in old version < 7.2 + if (val.get().size() > 0 && val.get() != "on"_sr) { + int ddIgnore = BinaryReader::fromStringRef(val.get(), Unversioned()); + if (readRebalance) { + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_READ) > 0; } else { - skipCurrentLoop = true; + skipCurrentLoop = (ddIgnore & DDIgnore::REBALANCE_DISK) > 0; } + } else { + skipCurrentLoop = true; } + return skipCurrentLoop; } @@ -2309,7 +2311,7 @@ ACTOR Future BgDDLoadRebalance(DDQueue* self, int teamCollectionIndex, Dat } } -ACTOR Future dataDistributionQueue(std::shared_ptr dbProcessor, +ACTOR Future dataDistributionQueue(std::shared_ptr db, PromiseStream output, FutureStream input, PromiseStream getShardMetrics, @@ -2328,7 +2330,7 @@ ACTOR Future dataDistributionQueue(std::shared_ptr dbProc const DDEnabledState* ddEnabledState) { state DDQueue self(distributorId, lock, - dbProcessor, + db, teamCollections, shardsAffectedByTeamFailure, physicalShardCollection, diff --git a/fdbserver/DDTeamCollection.actor.cpp b/fdbserver/DDTeamCollection.actor.cpp index 0df42fbeca..774cfaa043 100644 --- a/fdbserver/DDTeamCollection.actor.cpp +++ b/fdbserver/DDTeamCollection.actor.cpp @@ -3581,7 +3581,7 @@ bool DDTeamCollection::satisfiesPolicy(const std::vector return result && resultEntries.size() == 0; } -DDTeamCollection::DDTeamCollection(const std::shared_ptr& dbProcessor, +DDTeamCollection::DDTeamCollection(const std::shared_ptr& db, UID distributorId, MoveKeysLock const& lock, PromiseStream const& output, @@ -3597,9 +3597,9 @@ DDTeamCollection::DDTeamCollection(const std::shared_ptr& dbPro PromiseStream getShardMetrics, Promise removeFailedServer, PromiseStream> getUnhealthyRelocationCount) - : dbProcessor(dbProcessor), doBuildTeams(true), lastBuildTeamsFailed(false), teamBuilder(Void()), lock(lock), - output(output), unhealthyServers(0), storageWiggler(makeReference(this)), - processingWiggle(processingWiggle), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), + : db(db), doBuildTeams(true), lastBuildTeamsFailed(false), teamBuilder(Void()), lock(lock), output(output), + unhealthyServers(0), storageWiggler(makeReference(this)), processingWiggle(processingWiggle), + shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), initialFailureReactionDelay( delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskPriority::DataDistribution)), initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay)), recruitingStream(0), @@ -3618,8 +3618,8 @@ DDTeamCollection::DDTeamCollection(const std::shared_ptr& dbPro primary(primary), distributorId(distributorId), configuration(configuration), storageServerSet(new LocalityMap()) { - if (!dbProcessor->isMocked()) { - cx = this->dbProcessor->getDb(); + if (!db->isMocked()) { + cx = this->db->getDb(); } if (!primary || configuration.usableRegions == 1) { diff --git a/fdbserver/include/fdbserver/DDTeamCollection.h b/fdbserver/include/fdbserver/DDTeamCollection.h index 9dc9faf326..df11684256 100644 --- a/fdbserver/include/fdbserver/DDTeamCollection.h +++ b/fdbserver/include/fdbserver/DDTeamCollection.h @@ -603,7 +603,7 @@ class DDTeamCollection : public ReferenceCounted { int addTeamsBestOf(int teamsToBuild, int desiredTeams, int maxTeams); public: - std::shared_ptr dbProcessor; + std::shared_ptr db; Database cx; DatabaseConfiguration configuration; @@ -621,7 +621,7 @@ public: AsyncTrigger printDetailedTeamsInfo; Reference storageServerSet; - DDTeamCollection(const std::shared_ptr& dbProcessor, + DDTeamCollection(const std::shared_ptr& db, UID distributorId, MoveKeysLock const& lock, PromiseStream const& output, diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 465a474665..67e3b57f67 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -514,7 +514,7 @@ ACTOR Future dataDistributionTracker(Reference in bool* trackerCancelled, Optional> ddTenantCache); -ACTOR Future dataDistributionQueue(std::shared_ptr dbProcessor, +ACTOR Future dataDistributionQueue(std::shared_ptr db, PromiseStream output, FutureStream input, PromiseStream getShardMetrics, From 9fd16bd38e49a9aef4e66d501d2808982694c11a Mon Sep 17 00:00:00 2001 From: Hui Liu Date: Fri, 23 Sep 2022 08:55:18 -0700 Subject: [PATCH 23/34] disable MoveKeysWorkload for a few tests that need to manipulate dd --- fdbserver/workloads/DataLossRecovery.actor.cpp | 2 ++ fdbserver/workloads/PhysicalShardMove.actor.cpp | 2 ++ .../workloads/StorageServerCheckpointRestoreTest.actor.cpp | 2 ++ 3 files changed, 6 insertions(+) diff --git a/fdbserver/workloads/DataLossRecovery.actor.cpp b/fdbserver/workloads/DataLossRecovery.actor.cpp index 86b7f9de3f..996c403efb 100644 --- a/fdbserver/workloads/DataLossRecovery.actor.cpp +++ b/fdbserver/workloads/DataLossRecovery.actor.cpp @@ -63,6 +63,8 @@ struct DataLossRecoveryWorkload : TestWorkload { Future setup(Database const& cx) override { return Void(); } + void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("MoveKeysWorkload"); } + Future start(Database const& cx) override { if (!enabled) { return Void(); diff --git a/fdbserver/workloads/PhysicalShardMove.actor.cpp b/fdbserver/workloads/PhysicalShardMove.actor.cpp index 80bdcddbee..d8b767e322 100644 --- a/fdbserver/workloads/PhysicalShardMove.actor.cpp +++ b/fdbserver/workloads/PhysicalShardMove.actor.cpp @@ -70,6 +70,8 @@ struct PhysicalShardMoveWorkLoad : TestWorkload { return _start(this, cx); } + void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("MoveKeysWorkload"); } + ACTOR Future _start(PhysicalShardMoveWorkLoad* self, Database cx) { int ignore = wait(setDDMode(cx, 0)); state std::map kvs({ { "TestKeyA"_sr, "TestValueA"_sr }, diff --git a/fdbserver/workloads/StorageServerCheckpointRestoreTest.actor.cpp b/fdbserver/workloads/StorageServerCheckpointRestoreTest.actor.cpp index 42b93b7654..e79a3784de 100644 --- a/fdbserver/workloads/StorageServerCheckpointRestoreTest.actor.cpp +++ b/fdbserver/workloads/StorageServerCheckpointRestoreTest.actor.cpp @@ -67,6 +67,8 @@ struct SSCheckpointRestoreWorkload : TestWorkload { return _start(this, cx); } + void disableFailureInjectionWorkloads(std::set& out) const override { out.insert("MoveKeysWorkload"); } + ACTOR Future _start(SSCheckpointRestoreWorkload* self, Database cx) { state Key key = "TestKey"_sr; state Key endKey = "TestKey0"_sr; From fee77c8702bb5e49a99b9770cb5a365b2109892c Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 23 Sep 2022 11:06:53 -0700 Subject: [PATCH 24/34] Change crashAndDie to abort instead of segv (#7827) * Change crashAndDie to abort instead of segv This way we still terminate early when `--crash` is set, but folks inspecting error reports don't get mislead and confuse assertion failures with genuine memory errors. * Add SIGABRT to crash handler --- flow/Platform.actor.cpp | 1 + flow/include/flow/Platform.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 4b88fe3985..0bb0552631 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -3729,6 +3729,7 @@ void registerCrashHandler() { sigaction(SIGBUS, &action, nullptr); sigaction(SIGUSR2, &action, nullptr); sigaction(SIGTERM, &action, nullptr); + sigaction(SIGABRT, &action, nullptr); #else // No crash handler for other platforms! #endif diff --git a/flow/include/flow/Platform.h b/flow/include/flow/Platform.h index 563d59e253..d43949a8ed 100644 --- a/flow/include/flow/Platform.h +++ b/flow/include/flow/Platform.h @@ -678,7 +678,7 @@ inline static void flushOutputStreams() { #error Missing symbol export #endif -#define crashAndDie() (*(volatile int*)0 = 0) +#define crashAndDie() std::abort() #ifdef _WIN32 #define strcasecmp stricmp From 5774249e5be23fb2eebe8b1a8e5d529e5eab5478 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 23 Sep 2022 12:22:47 -0600 Subject: [PATCH 25/34] Revert "[DRAFT] Redwood PriorityMultiLock enable different launch limits to be specified based on different priority level." --- fdbclient/NativeAPI.actor.cpp | 1 - fdbclient/ServerKnobs.cpp | 6 +- fdbclient/include/fdbclient/FDBTypes.h | 8 +- fdbclient/include/fdbclient/NativeAPI.actor.h | 1 - fdbclient/include/fdbclient/ServerKnobs.h | 6 +- fdbserver/VersionedBTree.actor.cpp | 309 ++++++++++----- fdbserver/storageserver.actor.cpp | 85 ++--- fdbserver/workloads/ReadWrite.actor.cpp | 10 +- flow/include/flow/PriorityMultiLock.actor.h | 358 ------------------ flow/include/flow/genericactors.actor.h | 15 - flowbench/BenchPriorityMultiLock.actor.cpp | 87 ----- 11 files changed, 237 insertions(+), 649 deletions(-) delete mode 100644 flow/include/flow/PriorityMultiLock.actor.h delete mode 100644 flowbench/BenchPriorityMultiLock.actor.cpp diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index b3cb5ae65c..881e3e2c87 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3936,7 +3936,6 @@ Future getExactRange(Reference trState, req.version = version; req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); - setMatchIndex(req, matchIndex); req.spanContext = span.context; trState->cx->getLatestCommitVersions( diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 06b2f75999..067e5f624f 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -783,10 +783,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 ); - init( STORAGE_SERVER_READ_CONCURRENCY, 70 ); - // Priorities which each ReadType maps to, in enumeration order - init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" ); - init( STORAGESERVER_READ_PRIORITIES, "48,32,8" ); //Wait Failure init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2; @@ -898,6 +894,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_DEFAULT_EXTENT_SIZE, 32 * 1024 * 1024 ); init( REDWOOD_DEFAULT_EXTENT_READ_SIZE, 1024 * 1024 ); init( REDWOOD_EXTENT_CONCURRENT_READS, 4 ); + init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 ); init( REDWOOD_KVSTORE_RANGE_PREFETCH, true ); init( REDWOOD_PAGE_REBUILD_MAX_SLACK, 0.33 ); init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 ); @@ -910,7 +907,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); } - init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" ); // Server request latency measurement init( LATENCY_SAMPLE_SIZE, 100000 ); diff --git a/fdbclient/include/fdbclient/FDBTypes.h b/fdbclient/include/fdbclient/FDBTypes.h index 7f215b86a0..89ddacad2d 100644 --- a/fdbclient/include/fdbclient/FDBTypes.h +++ b/fdbclient/include/fdbclient/FDBTypes.h @@ -1581,7 +1581,13 @@ struct StorageWiggleValue { } }; -enum class ReadType { EAGER = 0, FETCH = 1, LOW = 2, NORMAL = 3, HIGH = 4, MIN = EAGER, MAX = HIGH }; +enum class ReadType { + EAGER, + FETCH, + LOW, + NORMAL, + HIGH, +}; FDB_DECLARE_BOOLEAN_PARAM(CacheResult); diff --git a/fdbclient/include/fdbclient/NativeAPI.actor.h b/fdbclient/include/fdbclient/NativeAPI.actor.h index 82b6fd89cd..f85154691b 100644 --- a/fdbclient/include/fdbclient/NativeAPI.actor.h +++ b/fdbclient/include/fdbclient/NativeAPI.actor.h @@ -479,7 +479,6 @@ public: Database getDatabase() const { return trState->cx; } static Reference createTrLogInfoProbabilistically(const Database& cx); - Transaction& getTransaction() { return *this; } void setTransactionID(UID id); void setToken(uint64_t token); diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 07cf7941da..abcdc0279c 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -735,9 +735,6 @@ public: int QUICK_GET_KEY_VALUES_LIMIT; int QUICK_GET_KEY_VALUES_LIMIT_BYTES; int STORAGE_FEED_QUERY_HARD_LIMIT; - int STORAGE_SERVER_READ_CONCURRENCY; - std::string STORAGESERVER_READ_RANKS; - std::string STORAGESERVER_READ_PRIORITIES; // Wait Failure int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS; @@ -867,6 +864,7 @@ public: int REDWOOD_DEFAULT_EXTENT_SIZE; // Extent size for new Redwood files int REDWOOD_DEFAULT_EXTENT_READ_SIZE; // Extent read size for Redwood files int REDWOOD_EXTENT_CONCURRENT_READS; // Max number of simultaneous extent disk reads in progress. + int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress. bool REDWOOD_KVSTORE_RANGE_PREFETCH; // Whether to use range read prefetching double REDWOOD_PAGE_REBUILD_MAX_SLACK; // When rebuilding pages, max slack to allow in page int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at @@ -885,8 +883,6 @@ public: bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache. int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches - std::string REDWOOD_PRIORITY_LAUNCHS; - // Server request latency measurement int LATENCY_SAMPLE_SIZE; double LATENCY_METRICS_LOGGING_INTERVAL; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index d92391a437..ccf2329314 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -28,7 +28,6 @@ #include "flow/Trace.h" #include "flow/flow.h" #include "flow/Histogram.h" -#include "flow/PriorityMultiLock.actor.h" #include #include #include "fdbrpc/ContinuousSample.h" @@ -103,6 +102,201 @@ std::string addPrefix(std::string prefix, std::string lines) { return s; } +#define PRIORITYMULTILOCK_DEBUG 0 + +// A multi user lock with a concurrent holder limit where waiters are granted the lock according to +// an integer priority from 0 to maxPriority, inclusive, where higher integers are given priority. +// +// The interface is similar to FlowMutex except that lock holders can drop the lock to release it. +// +// Usage: +// Lock lock = wait(prioritylock.lock(priorityLevel)); +// lock.release(); // Explicit release, or +// // let lock and all copies of lock go out of scope to release +class PriorityMultiLock { + +public: + // Waiting on the lock returns a Lock, which is really just a Promise + // Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release + // the Lock before it goes out of scope. + struct Lock { + void release() { promise.send(Void()); } + + // This is exposed in case the caller wants to use/copy it directly + Promise promise; + }; + +private: + struct Waiter { + Waiter() : queuedTime(now()) {} + Promise lockPromise; + double queuedTime; + }; + + typedef Deque Queue; + +#if PRIORITYMULTILOCK_DEBUG +#define prioritylock_printf(...) printf(__VA_ARGS__) +#else +#define prioritylock_printf(...) +#endif + +public: + PriorityMultiLock(int concurrency, int maxPriority, int launchLimit = std::numeric_limits::max()) + : concurrency(concurrency), available(concurrency), waiting(0), launchLimit(launchLimit) { + waiters.resize(maxPriority + 1); + fRunner = runner(this); + } + + ~PriorityMultiLock() { prioritylock_printf("destruct"); } + + Future lock(int priority = 0) { + prioritylock_printf("lock begin %s\n", toString().c_str()); + + // This shortcut may enable a waiter to jump the line when the releaser loop yields + if (available > 0) { + --available; + Lock p; + addRunner(p); + prioritylock_printf("lock exit immediate %s\n", toString().c_str()); + return p; + } + + Waiter w; + waiters[priority].push_back(w); + ++waiting; + prioritylock_printf("lock exit queued %s\n", toString().c_str()); + return w.lockPromise.getFuture(); + } + + std::string toString() const { + int runnersDone = 0; + for (int i = 0; i < runners.size(); ++i) { + if (runners[i].isReady()) { + ++runnersDone; + } + } + + std::string s = + format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d runnersDone=%d ", + this, + concurrency, + available, + concurrency - available, + waiting, + runners.size(), + runnersDone); + + for (int i = 0; i < waiters.size(); ++i) { + s += format("p%d_waiters=%u ", i, waiters[i].size()); + } + + s += "}"; + return s; + } + +private: + void addRunner(Lock& lock) { + runners.push_back(map(ready(lock.promise.getFuture()), [=](Void) { + prioritylock_printf("Lock released\n"); + ++available; + if (waiting > 0 || runners.size() > 100) { + release.trigger(); + } + return Void(); + })); + } + + ACTOR static Future runner(PriorityMultiLock* self) { + state int sinceYield = 0; + state Future error = self->brokenOnDestruct.getFuture(); + state int maxPriority = self->waiters.size() - 1; + + // Priority to try to run tasks from next + state int priority = maxPriority; + state Queue* pQueue = &self->waiters[maxPriority]; + + // Track the number of waiters unlocked at the same priority in a row + state int lastPriorityCount = 0; + + loop { + // Cleanup finished runner futures at the front of the runner queue. + while (!self->runners.empty() && self->runners.front().isReady()) { + self->runners.pop_front(); + } + + // Wait for a runner to release its lock + wait(self->release.onTrigger()); + prioritylock_printf("runner wakeup %s\n", self->toString().c_str()); + + if (++sinceYield == 1000) { + sinceYield = 0; + wait(delay(0)); + } + + // While there are available slots and there are waiters, launch tasks + while (self->available > 0 && self->waiting > 0) { + prioritylock_printf("Checking priority=%d lastPriorityCount=%d %s\n", + priority, + lastPriorityCount, + self->toString().c_str()); + + while (!pQueue->empty() && ++lastPriorityCount < self->launchLimit) { + Waiter w = pQueue->front(); + pQueue->pop_front(); + --self->waiting; + Lock lock; + prioritylock_printf(" Running waiter priority=%d wait=%f %s\n", + priority, + now() - w.queuedTime, + self->toString().c_str()); + w.lockPromise.send(lock); + + // Self may have been destructed during the lock callback + if (error.isReady()) { + throw error.getError(); + } + + // If the lock was not already released, add it to the runners future queue + if (lock.promise.canBeSet()) { + self->addRunner(lock); + + // A slot has been consumed, so stop reading from this queue if there aren't any more + if (--self->available == 0) { + break; + } + } + } + + // If there are no more slots available, then don't move to the next priority + if (self->available == 0) { + break; + } + + // Decrease priority, wrapping around to max from 0 + if (priority == 0) { + priority = maxPriority; + } else { + --priority; + } + + pQueue = &self->waiters[priority]; + lastPriorityCount = 0; + } + } + } + + int concurrency; + int available; + int waiting; + int launchLimit; + std::vector waiters; + Deque> runners; + Future fRunner; + AsyncTrigger release; + Promise brokenOnDestruct; +}; + // Some convenience functions for debugging to stringify various structures // Classes can add compatibility by either specializing toString or implementing // std::string toString() const; @@ -1471,8 +1665,6 @@ struct RedwoodMetrics { kvSizeReadByGetRange = Reference( new Histogram(Reference(), "kvSize", "ReadByGetRange", Histogram::Unit::bytes)); - ioLock = nullptr; - // These histograms are used for Btree events, hence level > 0 unsigned int levelCounter = 0; for (RedwoodMetrics::Level& level : levels) { @@ -1515,8 +1707,6 @@ struct RedwoodMetrics { // btree levels and one extra level for non btree level. Level levels[btreeLevels + 1]; metrics metric; - // pointer to the priority multi lock used in pager - PriorityMultiLock* ioLock; Reference kvSizeWritten; Reference kvSizeReadByGet; @@ -1571,12 +1761,9 @@ struct RedwoodMetrics { // The string is a reasonably well formatted page of information void getFields(TraceEvent* e, std::string* s = nullptr, bool skipZeroes = false); - void getIOLockFields(TraceEvent* e, std::string* s = nullptr); - std::string toString(bool clearAfter) { std::string s; getFields(nullptr, &s); - getIOLockFields(nullptr, &s); if (clearAfter) { clear(); @@ -1611,7 +1798,6 @@ ACTOR Future redwoodMetricsLogger() { double elapsed = now() - g_redwoodMetrics.startTime; e.detail("Elapsed", elapsed); g_redwoodMetrics.getFields(&e); - g_redwoodMetrics.getIOLockFields(&e); g_redwoodMetrics.clear(); } } @@ -2008,7 +2194,7 @@ public: bool memoryOnly, Reference keyProvider, Promise errorPromise = {}) - : keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS), + : keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, ioMaxPriority, FLOW_KNOBS->MAX_OUTSTANDING / 2), pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { @@ -2020,7 +2206,6 @@ public: // This sets the page cache size for all PageCacheT instances using the same evictor pageCache.evictor().sizeLimit = pageCacheBytes; - g_redwoodMetrics.ioLock = &ioLock; if (!g_redwoodMetricsActor.isValid()) { g_redwoodMetricsActor = redwoodMetricsLogger(); } @@ -7510,7 +7695,8 @@ RedwoodRecordRef VersionedBTree::dbEnd("\xff\xff\xff\xff\xff"_sr); class KeyValueStoreRedwood : public IKeyValueStore { public: KeyValueStoreRedwood(std::string filename, UID logID, Reference encryptionKeyProvider) - : m_filename(filename), prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) { + : m_filename(filename), m_concurrentReads(SERVER_KNOBS->REDWOOD_KVSTORE_CONCURRENT_READS, 0), + prefetch(SERVER_KNOBS->REDWOOD_KVSTORE_RANGE_PREFETCH) { int pageSize = BUGGIFY ? deterministicRandom()->randomInt(1000, 4096 * 4) : SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE; @@ -7570,8 +7756,6 @@ public: ACTOR void shutdown(KeyValueStoreRedwood* self, bool dispose) { TraceEvent(SevInfo, "RedwoodShutdown").detail("Filename", self->m_filename).detail("Dispose", dispose); - g_redwoodMetrics.ioLock = nullptr; - // In simulation, if the instance is being disposed of then sometimes run destructive sanity check. if (g_network->isSimulated() && dispose && BUGGIFY) { // Only proceed if the last commit is a success, but don't throw if it's not because shutdown @@ -7672,6 +7856,7 @@ public: f.get(); } else { CODE_PROBE(true, "Uncached forward range read seek"); + wait(store(lock, self->m_concurrentReads.lock())); wait(f); } @@ -7727,6 +7912,7 @@ public: f.get(); } else { CODE_PROBE(true, "Uncached reverse range read seek"); + wait(store(lock, self->m_concurrentReads.lock())); wait(f); } @@ -7793,6 +7979,9 @@ public: wait(self->m_tree->initBTreeCursor( &cur, self->m_tree->getLastCommittedVersion(), PagerEventReasons::PointRead, options)); + // Not locking for point reads, instead relying on IO priority lock + // state PriorityMultiLock::Lock lock = wait(self->m_concurrentReads.lock()); + ++g_redwoodMetrics.metric.opGet; wait(cur.seekGTE(key)); if (cur.isValid() && cur.get().key == key) { @@ -7828,6 +8017,7 @@ private: Future m_init; Promise m_closed; Promise m_error; + PriorityMultiLock m_concurrentReads; bool prefetch; Version m_nextCommitVersion; Reference m_keyProvider; @@ -8463,43 +8653,6 @@ void RedwoodMetrics::getFields(TraceEvent* e, std::string* s, bool skipZeroes) { } } -void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) { - if (ioLock == nullptr) - return; - - int maxPriority = ioLock->maxPriority(); - - if (e != nullptr) { - e->detail("ActiveReads", ioLock->totalRunners()); - e->detail("AwaitReads", ioLock->totalWaiters()); - - for (int priority = 0; priority <= maxPriority; ++priority) { - e->detail(format("ActiveP%d", priority), ioLock->numRunners(priority)); - e->detail(format("AwaitP%d", priority), ioLock->numWaiters(priority)); - } - } - - if (s != nullptr) { - std::string active = "Active"; - std::string await = "Await"; - - *s += "\n"; - *s += format("%-15s %-8u ", "ActiveReads", ioLock->totalRunners()); - *s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters()); - *s += "\n"; - - for (int priority = 0; priority <= maxPriority; ++priority) { - *s += - format("%-15s %-8u ", (active + 'P' + std::to_string(priority)).c_str(), ioLock->numRunners(priority)); - } - *s += "\n"; - for (int priority = 0; priority <= maxPriority; ++priority) { - *s += - format("%-15s %-8u ", (await + 'P' + std::to_string(priority)).c_str(), ioLock->numWaiters(priority)); - } - } -} - TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[0] == 3); ASSERT(RedwoodRecordRef::Delta::LengthFormatSizes[1] == 4); @@ -10977,57 +11130,3 @@ TEST_CASE(":/redwood/performance/histograms") { return Void(); } - -ACTOR Future waitLockIncrement(PriorityMultiLock* pml, int priority, int* pout) { - state PriorityMultiLock::Lock lock = wait(pml->lock(priority)); - wait(delay(deterministicRandom()->random01() * .1)); - ++*pout; - return Void(); -} - -TEST_CASE("/redwood/PriorityMultiLock") { - state std::vector priorities = { 10, 20, 40 }; - state int concurrency = 25; - state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities); - state std::vector counts; - counts.resize(priorities.size(), 0); - - // Clog the lock buy taking concurrency locks at each level - state std::vector> lockFutures; - for (int i = 0; i < priorities.size(); ++i) { - for (int j = 0; j < concurrency; ++j) { - lockFutures.push_back(pml->lock(i)); - } - } - - // Wait for n = concurrency locks to be acquired - wait(quorum(lockFutures, concurrency)); - - state std::vector> futures; - for (int i = 0; i < 10e3; ++i) { - int p = i % priorities.size(); - futures.push_back(waitLockIncrement(pml, p, &counts[p])); - } - - state Future f = waitForAll(futures); - - // Release the locks - lockFutures.clear(); - - // Print stats and wait for all futures to be ready - loop { - choose { - when(wait(delay(1))) { - printf("counts: "); - for (auto c : counts) { - printf("%d ", c); - } - printf(" pml: %s\n", pml->toString().c_str()); - } - when(wait(f)) { break; } - } - } - - delete pml; - return Void(); -} diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 3e9baf493b..2cd231a9a5 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -37,7 +37,6 @@ #include "flow/Error.h" #include "flow/Hash3.h" #include "flow/Histogram.h" -#include "flow/PriorityMultiLock.actor.h" #include "flow/IRandom.h" #include "flow/IndexedSet.h" #include "flow/SystemMonitor.h" @@ -1023,9 +1022,6 @@ public: FlowLock serveFetchCheckpointParallelismLock; - PriorityMultiLock ssLock; - std::vector readPriorityRanks; - int64_t instanceID; Promise otherError; @@ -1291,15 +1287,13 @@ public: changeFeedDiskReadsLock(SERVER_KNOBS->CHANGE_FEED_DISK_READS_PARALLELISM), fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false), serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM), - ssLock(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES), instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false), versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0), lastDurableVersionEBrake(0), maxQueryQueue(0), transactionTagCounter(ssi.id()), busiestWriteTagContext(ssi.id()), counters(this), storageServerSourceTLogIDEventHolder( makeReference(ssi.id().toString() + "/StorageServerSourceTLogID")) { - readPriorityRanks = parseStringToVector(SERVER_KNOBS->STORAGESERVER_READ_RANKS, ','); - ASSERT(readPriorityRanks.size() > (int)ReadType::MAX); + version.initMetric("StorageServer.Version"_sr, counters.cc.id); oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.id); durableVersion.initMetric("StorageServer.DurableVersion"_sr, counters.cc.id); @@ -1856,7 +1850,6 @@ std::vector StorageServer::getStorageServerShards(KeyRangeRe ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { state int64_t resultSize = 0; - state PriorityMultiLock::Lock lock; Span span("SS:getValue"_loc, req.spanContext); if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -1865,8 +1858,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // Temporarily disabled -- this path is hit a lot // getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.first(); - state ReadType type = req.options.present() ? req.options.get().type : ReadType::NORMAL; - try { ++data->counters.getValueQueries; ++data->counters.allQueries; @@ -1877,8 +1868,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); - wait(store(lock, data->ssLock.lock(data->readPriorityRanks[(int)type]))); - // Track time from requestTime through now as read queueing wait time state double queueWaitEnd = g_network->timer(); data->counters.readQueueWaitSample.addMeasurement(queueWaitEnd - req.requestTime()); @@ -3734,7 +3723,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) state Span span("SS:getKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; state Optional options = req.options; - state ReadType type = options.present() ? options.get().type : ReadType::NORMAL; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -3749,13 +3737,12 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(data->getQueryDelay()); - if (!SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && type == ReadType::FETCH) { - type = ReadType::NORMAL; + if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.options.present() && req.options.get().type == ReadType::FETCH) { + wait(delay(0, TaskPriority::FetchKeys)); + } else { + wait(data->getQueryDelay()); } - state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[(int)type])); - // Track time from requestTime through now as read queueing wait time state double queueWaitEnd = g_network->timer(); data->counters.readQueueWaitSample.addMeasurement(queueWaitEnd - req.requestTime()); @@ -4480,7 +4467,6 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe state Span span("SS:getMappedKeyValues"_loc, req.spanContext); state int64_t resultSize = 0; state Optional options = req.options; - state ReadType type = options.present() ? options.get().type : ReadType::NORMAL; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4495,13 +4481,12 @@ ACTOR Future getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(data->getQueryDelay()); - if (!SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && type == ReadType::FETCH) { - type = ReadType::NORMAL; + if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.options.present() && req.options.get().type == ReadType::FETCH) { + wait(delay(0, TaskPriority::FetchKeys)); + } else { + wait(data->getQueryDelay()); } - state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(data->readPriorityRanks[(int)type])); - // Track time from requestTime through now as read queueing wait time state double queueWaitEnd = g_network->timer(); data->counters.readQueueWaitSample.addMeasurement(queueWaitEnd - req.requestTime()); @@ -4698,7 +4683,6 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe state Span span("SS:getKeyValuesStream"_loc, req.spanContext); state int64_t resultSize = 0; state Optional options = req.options; - state ReadType type = options.present() ? options.get().type : ReadType::NORMAL; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); @@ -4712,14 +4696,12 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait(delay(0, TaskPriority::DefaultEndpoint)); - if (!SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && type == ReadType::FETCH) { - type = ReadType::NORMAL; + if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.options.present() && req.options.get().type == ReadType::FETCH) { + wait(delay(0, TaskPriority::FetchKeys)); + } else { + wait(delay(0, TaskPriority::DefaultEndpoint)); } - state int readPriority = data->readPriorityRanks[(int)type]; - state PriorityMultiLock::Lock lock = wait(data->ssLock.lock(readPriority)); - try { if (req.options.present() && req.options.get().debugID.present()) g_traceBatch.addEvent( @@ -4890,8 +4872,12 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe end = lastKey; } - lock.release(); - wait(store(lock, data->ssLock.lock(readPriority))); + if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.options.present() && + req.options.get().type == ReadType::FETCH) { + wait(delay(0, TaskPriority::FetchKeys)); + } else { + wait(delay(0, TaskPriority::DefaultEndpoint)); + } data->transactionTagCounter.addRequest(req.tags, resultSize); } @@ -4912,19 +4898,14 @@ ACTOR Future getKeyValuesStreamQ(StorageServer* data, GetKeyValuesStreamRe ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { state Span span("SS:getKey"_loc, req.spanContext); - state PriorityMultiLock::Lock lock; if (req.tenantInfo.name.present()) { span.addAttribute("tenant"_sr, req.tenantInfo.name.get()); } state int64_t resultSize = 0; state ReadOptions options; - state ReadType type = ReadType::NORMAL; - if (req.options.present()) { options = req.options.get(); - type = options.type; } - getCurrentLineage()->modify(&TransactionLineage::txID) = req.spanContext.traceID; ++data->counters.getKeyQueries; @@ -4936,8 +4917,6 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); - wait(store(lock, data->ssLock.lock(data->readPriorityRanks[(int)type]))); - // Track time from requestTime through now as read queueing wait time state double queueWaitEnd = g_network->timer(); data->counters.readQueueWaitSample.addMeasurement(queueWaitEnd - req.requestTime()); @@ -6492,8 +6471,11 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { state int debug_nextRetryToLog = 1; state Error lastError; + // TODO: update to FETCH once the priority multi lock is used. + // leaving the readtype off for now to prevent data fetches stall under heavy load // it is used to inform the storage that the rangeRead is for Fetch - state ReadOptions options = ReadOptions(fetchKeysID, ReadType::FETCH); + // state ReadOptions options = ReadOptions(Optional(), ReadType::FETCH); + state ReadOptions options = ReadOptions(Optional(), ReadType::NORMAL); // FIXME: The client cache does not notice when servers are added to a team. To read from a local storage server // we must refresh the cache manually. @@ -9844,21 +9826,6 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) [self = self](TraceEvent& te) { te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString()); te.detail("Tag", self->tag.toString()); - std::vector rpr = self->readPriorityRanks; - te.detail("ReadsActive", self->ssLock.totalRunners()); - te.detail("ReadsWaiting", self->ssLock.totalWaiters()); - int type = (int)ReadType::FETCH; - te.detail("ReadFetchActive", self->ssLock.numRunners(rpr[type])); - te.detail("ReadFetchWaiting", self->ssLock.numWaiters(rpr[type])); - type = (int)ReadType::LOW; - te.detail("ReadLowActive", self->ssLock.numRunners(rpr[type])); - te.detail("ReadLowWaiting", self->ssLock.numWaiters(rpr[type])); - type = (int)ReadType::NORMAL; - te.detail("ReadNormalActive", self->ssLock.numRunners(rpr[type])); - te.detail("ReadNormalWaiting", self->ssLock.numWaiters(rpr[type])); - type = (int)ReadType::HIGH; - te.detail("ReadHighActive", self->ssLock.numRunners(rpr[type])); - te.detail("ReadHighWaiting", self->ssLock.numWaiters(rpr[type])); StorageBytes sb = self->storage.getStorageBytes(); te.detail("KvstoreBytesUsed", sb.used); te.detail("KvstoreBytesFree", sb.free); @@ -10715,9 +10682,6 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, // If the storage server dies while something that uses self is still on the stack, // we want that actor to complete before we terminate and that memory goes out of scope - - self.ssLock.kill(); - state Error err = e; if (storageServerTerminated(self, persistentData, err)) { ssCore.cancel(); @@ -10838,9 +10802,6 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, throw internal_error(); } catch (Error& e) { - - self.ssLock.kill(); - if (self.byteSampleRecovery.isValid()) { self.byteSampleRecovery.cancel(); } diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index e958a8da8f..d7a44cb6e1 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -22,7 +22,6 @@ #include #include -#include "fdbclient/FDBTypes.h" #include "fdbrpc/ContinuousSample.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/TesterInterface.actor.h" @@ -378,8 +377,6 @@ struct ReadWriteWorkload : ReadWriteCommon { bool adjacentReads; // keys are adjacent within a transaction bool adjacentWrites; int extraReadConflictRangesPerTransaction, extraWriteConflictRangesPerTransaction; - int readType; - bool cacheResult; Optional transactionTag; int transactionsTagThrottled{ 0 }; @@ -402,8 +399,6 @@ struct ReadWriteWorkload : ReadWriteCommon { rampUpConcurrency = getOption(options, "rampUpConcurrency"_sr, false); batchPriority = getOption(options, "batchPriority"_sr, false); descriptionString = getOption(options, "description"_sr, "ReadWrite"_sr); - readType = getOption(options, "readType"_sr, 3); - cacheResult = getOption(options, "cacheResult"_sr, true); if (hasOption(options, "transactionTag"_sr)) { transactionTag = getOption(options, "transactionTag"_sr, ""_sr); } @@ -433,10 +428,6 @@ struct ReadWriteWorkload : ReadWriteCommon { if (transactionTag.present() && tr.getTags().size() == 0) { tr.setOption(FDBTransactionOptions::AUTO_THROTTLE_TAG, transactionTag.get()); } - ReadOptions options; - options.type = static_cast(readType); - options.cacheResult = cacheResult; - tr.getTransaction().trState->readOptions = options; } std::string description() const override { return descriptionString.toString(); } @@ -512,6 +503,7 @@ struct ReadWriteWorkload : ReadWriteCommon { state double startTime = now(); loop { state Transaction tr(cx); + try { self->setupTransaction(tr); wait(self->readOp(&tr, keys, self, false)); diff --git a/flow/include/flow/PriorityMultiLock.actor.h b/flow/include/flow/PriorityMultiLock.actor.h deleted file mode 100644 index 363b3c4e27..0000000000 --- a/flow/include/flow/PriorityMultiLock.actor.h +++ /dev/null @@ -1,358 +0,0 @@ -/* - * PriorityMultiLock.actor.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source -// version. -#if defined(NO_INTELLISENSE) && !defined(FLOW_PRIORITYMULTILOCK_ACTOR_G_H) -#define FLOW_PRIORITYMULTILOCK_ACTOR_G_H -#include "flow/PriorityMultiLock.actor.g.h" -#elif !defined(PRIORITYMULTILOCK_ACTOR_H) -#define PRIORITYMULTILOCK_ACTOR_H - -#include "flow/flow.h" -#include "flow/actorcompiler.h" // This must be the last #include. - -#define PRIORITYMULTILOCK_DEBUG 0 - -#if PRIORITYMULTILOCK_DEBUG -#define pml_debug_printf(...) printf(__VA_ARGS__) -#else -#define pml_debug_printf(...) -#endif - -// A multi user lock with a concurrent holder limit where waiters request a lock with a priority -// id and are granted locks based on a total concurrency and relative importants of the priority -// ids defined. -// -// Scheduling logic -// launchLimits[n] = configured amount from the launchLimit vector for priority n -// waiters[n] = the number of waiters for priority n -// runnerCounts[n] = number of runners at priority n -// -// totalActiveLaunchLimits = sum of limits for all priorities with waiters -// When waiters[n] becomes == 0, totalActiveLaunchLimits -= launchLimits[n] -// When waiters[n] becomes > 0, totalActiveLaunchLimits += launchLimits[n] -// -// The total capacity of a priority to be considered when launching tasks is -// ceil(launchLimits[n] / totalLimits * concurrency) -// -// The interface is similar to FlowMutex except that lock holders can just drop the lock to release it. -// -// Usage: -// Lock lock = wait(prioritylock.lock(priorityLevel)); -// lock.release(); // Explicit release, or -// // let lock and all copies of lock go out of scope to release -class PriorityMultiLock { - -public: - // Waiting on the lock returns a Lock, which is really just a Promise - // Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release - // the Lock before it goes out of scope. - struct Lock { - void release() { promise.send(Void()); } - - // This is exposed in case the caller wants to use/copy it directly - Promise promise; - }; - - PriorityMultiLock(int concurrency, std::string launchLimits) - : PriorityMultiLock(concurrency, parseStringToVector(launchLimits, ',')) {} - - PriorityMultiLock(int concurrency, std::vector launchLimitsByPriority) - : concurrency(concurrency), available(concurrency), waiting(0), totalActiveLaunchLimits(0) { - - priorities.resize(launchLimitsByPriority.size()); - for (int i = 0; i < priorities.size(); ++i) { - priorities[i].launchLimit = launchLimitsByPriority[i]; - } - - fRunner = runner(this); - } - - ~PriorityMultiLock() {} - - Future lock(int priority = 0) { - Priority& p = priorities[priority]; - Queue& q = p.queue; - Waiter w; - - // If this priority currently has no waiters - if (q.empty()) { - // Add this priority's launch limit to totalLimits - totalActiveLaunchLimits += p.launchLimit; - - // If there are slots available and the priority has capacity then don't make the caller wait - if (available > 0 && p.runners < currentCapacity(p.launchLimit)) { - // Remove this priority's launch limit from the total since it will remain empty - totalActiveLaunchLimits -= p.launchLimit; - - // Return a Lock to the caller - Lock lock; - addRunner(lock, &p); - - pml_debug_printf("lock nowait line %d priority %d %s\n", __LINE__, priority, toString().c_str()); - return lock; - } - } - q.push_back(w); - ++waiting; - - pml_debug_printf("lock wait line %d priority %d %s\n", __LINE__, priority, toString().c_str()); - return w.lockPromise.getFuture(); - } - - void kill() { - for (int i = 0; i < runners.size(); ++i) { - if (!runners[i].isReady()) { - runners[i].cancel(); - } - } - runners.clear(); - brokenOnDestruct.sendError(broken_promise()); - waiting = 0; - priorities.clear(); - } - - std::string toString() const { - int runnersDone = 0; - for (int i = 0; i < runners.size(); ++i) { - if (runners[i].isReady()) { - ++runnersDone; - } - } - - std::string s = - format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d runnersDone=%d ", - this, - concurrency, - available, - concurrency - available, - waiting, - runners.size(), - runnersDone); - - for (int i = 0; i < priorities.size(); ++i) { - s += format("p%d:{%s} ", i, priorities[i].toString(this).c_str()); - } - - s += "}"; - return s; - } - int maxPriority() const { return priorities.size() - 1; } - - int totalWaiters() const { return waiting; } - - int numWaiters(const unsigned int priority) const { - ASSERT(priority < priorities.size()); - return priorities[priority].queue.size(); - } - - int totalRunners() const { return concurrency - available; } - - int numRunners(const unsigned int priority) const { - ASSERT(priority < priorities.size()); - return priorities[priority].runners; - } - -private: - struct Waiter { - Waiter() : queuedTime(now()) {} - Promise lockPromise; - double queuedTime; - }; - - // Total execution slots allowed across all priorities - int concurrency; - // Current available execution slots - int available; - // Total waiters across all priorities - int waiting; - // Sum of launch limits for all priorities with 1 or more waiters - int totalActiveLaunchLimits; - - typedef Deque Queue; - - struct Priority { - Priority() : runners(0), launchLimit(0) {} - - // Queue of waiters at this priority - Queue queue; - // Number of runners at this priority - int runners; - // Configured launch limit for this priority - int launchLimit; - - std::string toString(const PriorityMultiLock* pml) const { - return format("limit=%d run=%d wait=%d cap=%d", - launchLimit, - runners, - queue.size(), - queue.empty() ? 0 : pml->currentCapacity(launchLimit)); - } - }; - - std::vector priorities; - - // Current or recent (ended) runners - Deque> runners; - - Future fRunner; - AsyncTrigger wakeRunner; - Promise brokenOnDestruct; - - ACTOR static Future handleRelease(PriorityMultiLock* self, Future f, Priority* priority) { - try { - wait(f); - } catch (Error& e) { - } - - ++self->available; - priority->runners -= 1; - - pml_debug_printf("lock release line %d priority %d %s\n", - __LINE__, - (int)(priority - &self->priorities.front()), - self->toString().c_str()); - - // If there are any waiters or if the runners array is getting large, trigger the runner loop - if (self->waiting > 0 || self->runners.size() > 1000) { - self->wakeRunner.trigger(); - } - return Void(); - } - - void addRunner(Lock& lock, Priority* p) { - p->runners += 1; - --available; - runners.push_back(handleRelease(this, lock.promise.getFuture(), p)); - } - - // Current maximum running tasks for the specified priority, which must have waiters - // or the result is undefined - int currentCapacity(int launchLimit) const { - // The total concurrency allowed for this priority at present is the total concurrency times - // priority's launch limit divided by the total launch limits for all priorities with waiters. - return ceil((float)launchLimit / totalActiveLaunchLimits * concurrency); - } - - ACTOR static Future runner(PriorityMultiLock* self) { - state int sinceYield = 0; - state Future error = self->brokenOnDestruct.getFuture(); - - // Priority to try to run tasks from next - state int priority = 0; - - loop { - pml_debug_printf( - "runner loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str()); - - // Cleanup finished runner futures at the front of the runner queue. - while (!self->runners.empty() && self->runners.front().isReady()) { - self->runners.pop_front(); - } - - // Wait for a runner to release its lock - pml_debug_printf( - "runner loop waitTrigger line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str()); - wait(self->wakeRunner.onTrigger()); - pml_debug_printf( - "runner loop wake line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str()); - - if (++sinceYield == 100) { - sinceYield = 0; - pml_debug_printf( - " runner waitDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str()); - wait(delay(0)); - pml_debug_printf( - " runner afterDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str()); - } - - // While there are available slots and there are waiters, launch tasks - while (self->available > 0 && self->waiting > 0) { - pml_debug_printf( - " launch loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str()); - - Priority* pPriority; - - // Find the next priority with waiters and capacity. There must be at least one. - loop { - // Rotate to next priority - if (++priority == self->priorities.size()) { - priority = 0; - } - - pPriority = &self->priorities[priority]; - - pml_debug_printf(" launch loop scan line %d priority=%d %s\n", - __LINE__, - priority, - self->toString().c_str()); - - if (!pPriority->queue.empty() && - pPriority->runners < self->currentCapacity(pPriority->launchLimit)) { - break; - } - } - - Queue& queue = pPriority->queue; - - Waiter w = queue.front(); - queue.pop_front(); - - // If this priority is now empty, subtract its launch limit from totalLimits - if (queue.empty()) { - self->totalActiveLaunchLimits -= pPriority->launchLimit; - - pml_debug_printf(" emptied priority line %d priority=%d %s\n", - __LINE__, - priority, - self->toString().c_str()); - } - - --self->waiting; - Lock lock; - - w.lockPromise.send(lock); - - // Self may have been destructed during the lock callback - if (error.isReady()) { - throw error.getError(); - } - - // If the lock was not already released, add it to the runners future queue - if (lock.promise.canBeSet()) { - self->addRunner(lock, pPriority); - } - - pml_debug_printf(" launched line %d alreadyDone=%d priority=%d %s\n", - __LINE__, - !lock.promise.canBeSet(), - priority, - self->toString().c_str()); - } - } - } -}; - -#include "flow/unactorcompiler.h" - -#endif diff --git a/flow/include/flow/genericactors.actor.h b/flow/include/flow/genericactors.actor.h index 8bdbcdbd53..5aecc04215 100644 --- a/flow/include/flow/genericactors.actor.h +++ b/flow/include/flow/genericactors.actor.h @@ -100,21 +100,6 @@ T sorted(T range) { return range; } -template -std::vector parseStringToVector(std::string str, char delim) { - std::vector result; - std::stringstream stream(str); - std::string token; - while (stream.good()) { - getline(stream, token, delim); - std::istringstream tokenStream(token); - T item; - tokenStream >> item; - result.push_back(item); - } - return result; -} - template ErrorOr errorOr(T t) { return ErrorOr(t); diff --git a/flowbench/BenchPriorityMultiLock.actor.cpp b/flowbench/BenchPriorityMultiLock.actor.cpp deleted file mode 100644 index c1b517dedc..0000000000 --- a/flowbench/BenchPriorityMultiLock.actor.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * BenchStream.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "benchmark/benchmark.h" - -#include "flow/flow.h" -#include "flow/ThreadHelper.actor.h" -#include "flow/PriorityMultiLock.actor.h" -#include -#include "flow/actorcompiler.h" // This must be the last #include. - -ACTOR static Future benchPriorityMultiLock(benchmark::State* benchState) { - state std::vector priorities; - - // Set up priority list with limits 10, 20, 30, ... - while (priorities.size() < benchState->range(0)) { - priorities.push_back(10 * (priorities.size() + 1)); - } - - state int concurrency = priorities.size() * 10; - state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities); - state std::vector counts; - counts.resize(priorities.size(), 0); - - // Clog the lock buy taking concurrency locks - state std::deque> lockFutures; - for (int j = 0; j < concurrency; ++j) { - lockFutures.push_back(pml->lock(j % priorities.size())); - } - - // Wait for all of the initial locks to be taken - // This will work regardless of their priorities as there are only n = concurrency of them - wait(waitForAll(std::vector>(lockFutures.begin(), lockFutures.end()))); - - // For each iteration of the loop, one new lock user is created, for a total of - // concurrency + 1 users. The new user replaces an old one, which is then waited - // on. This will succeed regardless of the lock priorities used because prior to - // new user there were only n = concurrency users so they will all be served before - // the new user. - state int p = 0; - state int i = 0; - while (benchState->KeepRunning()) { - // Get and replace the i'th lock future with a new lock waiter - Future f = lockFutures[i]; - lockFutures[i] = pml->lock(p); - - PriorityMultiLock::Lock lock = wait(f); - - // Rotate to another priority - if (++p == priorities.size()) { - p = 0; - } - - // Rotate to next lock index - if (++i == lockFutures.size()) { - i = 0; - } - } - - benchState->SetItemsProcessed(static_cast(benchState->iterations())); - - delete pml; - return Void(); -} - -static void bench_priorityMultiLock(benchmark::State& benchState) { - onMainThread([&benchState]() { return benchPriorityMultiLock(&benchState); }).blockUntilReady(); -} - -BENCHMARK(bench_priorityMultiLock)->DenseRange(1, 8)->ReportAggregatesOnly(true); From df2c1374cb923e8da5aa9949839ef62ee0d36b91 Mon Sep 17 00:00:00 2001 From: Dan Adkins <105679810+sfc-gh-dadkins@users.noreply.github.com> Date: Sun, 25 Sep 2022 01:52:13 -0400 Subject: [PATCH 26/34] Correct a couple of comments in simulator and simulated workloads. (#8310) --- fdbrpc/include/fdbrpc/simulator.h | 4 ++-- fdbserver/workloads/Rollback.actor.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbrpc/include/fdbrpc/simulator.h b/fdbrpc/include/fdbrpc/simulator.h index 494de44452..09badc0626 100644 --- a/fdbrpc/include/fdbrpc/simulator.h +++ b/fdbrpc/include/fdbrpc/simulator.h @@ -453,8 +453,8 @@ public: int listenersPerProcess; // We won't kill machines in this set, but we might reboot - // them. This is a conservatie mechanism to prevent the - // simulator from killing off imporant processes and rendering + // them. This is a conservative mechanism to prevent the + // simulator from killing off important processes and rendering // the cluster unrecoverable, e.g. a quorum of coordinators. std::set protectedAddresses; diff --git a/fdbserver/workloads/Rollback.actor.cpp b/fdbserver/workloads/Rollback.actor.cpp index 0b91c8e432..a2ff27cafe 100644 --- a/fdbserver/workloads/Rollback.actor.cpp +++ b/fdbserver/workloads/Rollback.actor.cpp @@ -102,11 +102,12 @@ struct RollbackWorkload : FailureInjectionWorkload { wait(delay(self->clogDuration / 3)); system = self->dbInfo->get(); - // Kill the proxy and clog the unclogged tlog if (self->enableFailures) { + // Reboot the proxy and clog the unclogged tlog. g_simulator->rebootProcess(g_simulator->getProcessByAddress(proxy.address()), ISimulator::Reboot); g_simulator->clogInterface(uncloggedTLog.ip, self->clogDuration, ClogAll); } else { + // Alternatively, if we're not injecting machine failures, clog the proxy and the unclogged tlog. g_simulator->clogInterface(proxy.address().ip, self->clogDuration, ClogAll); g_simulator->clogInterface(uncloggedTLog.ip, self->clogDuration, ClogAll); } From f95c22c6a2a998e0cf9b3a8be1b9efaf6000d0b5 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Mon, 26 Sep 2022 04:15:10 -0500 Subject: [PATCH 27/34] Blob Granule Database API tests (#8276) * adding verify * Adding error test cases * more fixes to bg api tests --- .../TesterBlobGranuleCorrectnessWorkload.cpp | 53 ++++- .../TesterBlobGranuleErrorsWorkload.cpp | 185 +++++++++++++++++- bindings/c/test/fdb_api.hpp | 42 +++- 3 files changed, 266 insertions(+), 14 deletions(-) diff --git a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp index 5f35c7b7ff..f6164296da 100644 --- a/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp +++ b/bindings/c/test/apitester/TesterBlobGranuleCorrectnessWorkload.cpp @@ -44,7 +44,8 @@ private: OP_GET_GRANULES, OP_SUMMARIZE, OP_GET_BLOB_RANGES, - OP_LAST = OP_GET_BLOB_RANGES + OP_VERIFY, + OP_LAST = OP_VERIFY }; std::vector excludedOpTypes; @@ -156,6 +157,12 @@ private: } void randomSummarizeOp(TTaskFct cont, std::optional tenantId) { + if (!seenReadSuccess) { + // tester can't handle this throwing bg_txn_too_old, so just don't call it unless we have already seen a + // read success + schedule(cont); + return; + } fdb::Key begin = randomKeyName(); fdb::Key end = randomKeyName(); if (begin > end) { @@ -174,11 +181,9 @@ private: true); }, [this, begin, end, results, cont]() { - if (seenReadSuccess) { - ASSERT(results->size() > 0); - ASSERT(results->front().keyRange.beginKey <= begin); - ASSERT(results->back().keyRange.endKey >= end); - } + ASSERT(results->size() > 0); + ASSERT(results->front().keyRange.beginKey <= begin); + ASSERT(results->back().keyRange.endKey >= end); for (int i = 0; i < results->size(); i++) { // TODO: could do validation of subsequent calls and ensure snapshot version never decreases @@ -254,6 +259,39 @@ private: /* failOnError = */ false); } + void randomVerifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + + auto verifyVersion = std::make_shared(false); + // info("Verify op starting"); + + execOperation( + [begin, end, verifyVersion](auto ctx) { + fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType(); + ctx->continueAfter(f, [ctx, verifyVersion, f]() { + *verifyVersion = f.get(); + ctx->done(); + }); + }, + [this, begin, end, verifyVersion, cont]() { + if (*verifyVersion == -1) { + ASSERT(!seenReadSuccess); + } else { + if (!seenReadSuccess) { + info("BlobGranuleCorrectness::randomVerifyOp first success"); + } + seenReadSuccess = true; + } + // info(fmt::format("verify op done @ {}", *verifyVersion)); + schedule(cont); + }, + /* failOnError = */ false); + } + void randomOperation(TTaskFct cont) { std::optional tenantId = randomTenant(); @@ -284,6 +322,9 @@ private: case OP_GET_BLOB_RANGES: randomGetBlobRangesOp(cont); break; + case OP_VERIFY: + randomVerifyOp(cont); + break; } } }; diff --git a/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp index 386f8b43cd..b4bcaacdc6 100644 --- a/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp +++ b/bindings/c/test/apitester/TesterBlobGranuleErrorsWorkload.cpp @@ -34,10 +34,21 @@ private: OP_READ_NO_MATERIALIZE, OP_READ_FILE_LOAD_ERROR, OP_READ_TOO_OLD, - OP_CANCEL_RANGES, - OP_LAST = OP_CANCEL_RANGES + OP_PURGE_UNALIGNED, + OP_BLOBBIFY_UNALIGNED, + OP_UNBLOBBIFY_UNALIGNED, + OP_CANCEL_GET_GRANULES, + OP_CANCEL_GET_RANGES, + OP_CANCEL_VERIFY, + OP_CANCEL_SUMMARIZE, + OP_CANCEL_BLOBBIFY, + OP_CANCEL_UNBLOBBIFY, + OP_CANCEL_PURGE, + OP_LAST = OP_CANCEL_PURGE }; + // could add summarize too old and verify too old as ops if desired but those are lower value + // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet // FIXME: should still guarantee a read succeeds eventually somehow bool seenReadSuccess = false; @@ -74,9 +85,6 @@ private: error(fmt::format("Operation succeeded in error test!")); } ASSERT(err.code() != error_code_success); - if (err.code() != error_code_blob_granule_transaction_too_old) { - seenReadSuccess = true; - } if (err.code() != expectedError) { info(fmt::format("incorrect error. Expected {}, Got {}", expectedError, err.code())); if (err.code() == error_code_blob_granule_transaction_too_old) { @@ -86,6 +94,9 @@ private: ctx->onError(err); } } else { + if (err.code() != error_code_blob_granule_transaction_too_old) { + seenReadSuccess = true; + } ctx->done(); } }, @@ -107,7 +118,55 @@ private: doErrorOp(cont, "", true, 1, error_code_blob_granule_transaction_too_old); } - void randomCancelGetRangesOp(TTaskFct cont) { + void randomPurgeUnalignedOp(TTaskFct cont) { + // blobbify/unblobbify need to be aligned to blob range boundaries, so this should always fail + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [this, begin, end](auto ctx) { + fdb::Future f = ctx->db().purgeBlobGranules(begin, end, -2, false).eraseType(); + ctx->continueAfter( + f, + [this, ctx, f]() { + info(fmt::format("unaligned purge got {}", f.error().code())); + ASSERT(f.error().code() == error_code_unsupported_operation); + ctx->done(); + }, + true); + }, + [this, cont]() { schedule(cont); }); + } + + void randomBlobbifyUnalignedOp(bool blobbify, TTaskFct cont) { + // blobbify/unblobbify need to be aligned to blob range boundaries, so this should always return false + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + auto success = std::make_shared(false); + execOperation( + [begin, end, blobbify, success](auto ctx) { + fdb::Future f = blobbify ? ctx->db().blobbifyRange(begin, end).eraseType() + : ctx->db().unblobbifyRange(begin, end).eraseType(); + ctx->continueAfter( + f, + [ctx, f, success]() { + *success = f.get(); + ctx->done(); + }, + true); + }, + [this, cont, success]() { + ASSERT(!(*success)); + schedule(cont); + }); + } + + void randomCancelGetGranulesOp(TTaskFct cont) { fdb::Key begin = randomKeyName(); fdb::Key end = randomKeyName(); if (begin > end) { @@ -121,6 +180,90 @@ private: [this, cont]() { schedule(cont); }); } + void randomCancelGetRangesOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelVerifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelSummarizeOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execTransaction( + [begin, end](auto ctx) { + fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2, 1000).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelBlobbifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().blobbifyRange(begin, end).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelUnblobbifyOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().unblobbifyRange(begin, end).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + + void randomCancelPurgeOp(TTaskFct cont) { + fdb::Key begin = randomKeyName(); + fdb::Key end = randomKeyName(); + if (begin > end) { + std::swap(begin, end); + } + execOperation( + [begin, end](auto ctx) { + fdb::Future f = ctx->db().purgeBlobGranules(begin, end, -2, false).eraseType(); + ctx->done(); + }, + [this, cont]() { schedule(cont); }); + } + void randomOperation(TTaskFct cont) override { OpType txType = (OpType)Random::get().randomInt(0, OP_LAST); switch (txType) { @@ -133,9 +276,37 @@ private: case OP_READ_TOO_OLD: randomOpReadTooOld(cont); break; - case OP_CANCEL_RANGES: + case OP_PURGE_UNALIGNED: + // gets the correct error but it doesn't propagate properly in the test + // randomPurgeUnalignedOp(cont); + break; + case OP_BLOBBIFY_UNALIGNED: + randomBlobbifyUnalignedOp(true, cont); + break; + case OP_UNBLOBBIFY_UNALIGNED: + randomBlobbifyUnalignedOp(false, cont); + break; + case OP_CANCEL_GET_GRANULES: + randomCancelGetGranulesOp(cont); + break; + case OP_CANCEL_GET_RANGES: randomCancelGetRangesOp(cont); break; + case OP_CANCEL_VERIFY: + randomCancelVerifyOp(cont); + break; + case OP_CANCEL_SUMMARIZE: + randomCancelSummarizeOp(cont); + break; + case OP_CANCEL_BLOBBIFY: + randomCancelBlobbifyOp(cont); + break; + case OP_CANCEL_UNBLOBBIFY: + randomCancelUnblobbifyOp(cont); + break; + case OP_CANCEL_PURGE: + randomCancelPurgeOp(cont); + break; } } }; diff --git a/bindings/c/test/fdb_api.hpp b/bindings/c/test/fdb_api.hpp index 0d755c9fc0..5403f59b78 100644 --- a/bindings/c/test/fdb_api.hpp +++ b/bindings/c/test/fdb_api.hpp @@ -155,6 +155,13 @@ struct None { struct Type {}; static Error extract(native::FDBFuture*, Type&) noexcept { return Error(0); } }; +struct Bool { + using Type = native::fdb_bool_t; + static Error extract(native::FDBFuture* f, Type& out) noexcept { + auto err = native::fdb_future_get_bool(f, &out); + return Error(err); + } +}; struct Int64 { using Type = int64_t; static Error extract(native::FDBFuture* f, Type& out) noexcept { @@ -775,10 +782,43 @@ public: TypedFuture listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) { if (!db) - throw std::runtime_error("list_blobbified_ranges from null database"); + throw std::runtime_error("listBlobbifiedRanges from null database"); return native::fdb_database_list_blobbified_ranges( db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit); } + + TypedFuture verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) { + if (!db) + throw std::runtime_error("verifyBlobRange from null database"); + return native::fdb_database_verify_blob_range( + db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version); + } + + TypedFuture blobbifyRange(KeyRef begin, KeyRef end) { + if (!db) + throw std::runtime_error("blobbifyRange from null database"); + return native::fdb_database_blobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + } + + TypedFuture unblobbifyRange(KeyRef begin, KeyRef end) { + if (!db) + throw std::runtime_error("unblobbifyRange from null database"); + return native::fdb_database_unblobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end)); + } + + TypedFuture purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) { + if (!db) + throw std::runtime_error("purgeBlobGranules from null database"); + native::fdb_bool_t forceBool = force; + return native::fdb_database_purge_blob_granules( + db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool); + } + + TypedFuture waitPurgeGranulesComplete(KeyRef purgeKey) { + if (!db) + throw std::runtime_error("purgeBlobGranules from null database"); + return native::fdb_database_wait_purge_granules_complete(db.get(), purgeKey.data(), intSize(purgeKey)); + } }; inline Error selectApiVersionNothrow(int version) { From dc828d8407c48cb3a7ec281b428f7baed7b5cadd Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 26 Sep 2022 09:14:15 -0700 Subject: [PATCH 28/34] Port #7123 (#8315) Only run valgrind on the binary under test --- contrib/TestHarness2/test_harness/run.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/contrib/TestHarness2/test_harness/run.py b/contrib/TestHarness2/test_harness/run.py index f144ded0da..2cd24575fb 100644 --- a/contrib/TestHarness2/test_harness/run.py +++ b/contrib/TestHarness2/test_harness/run.py @@ -335,7 +335,12 @@ class TestRun: command: List[str] = [] env: Dict[str, str] = os.environ.copy() valgrind_file: Path | None = None - if self.use_valgrind: + if self.use_valgrind and self.binary == config.binary: + # Only run the binary under test under valgrind. There's nothing we + # can do about valgrind errors in old binaries anyway, and it makes + # the test take longer. Also old binaries weren't built with + # USE_VALGRIND=ON, and we have seen false positives with valgrind in + # such binaries. command.append('valgrind') valgrind_file = self.temp_path / Path('valgrind-{}.xml'.format(self.random_seed)) dbg_path = os.getenv('FDB_VALGRIND_DBGPATH') From 080acd811fdbf2452a0bc4b0865e0e07abc38dca Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Mon, 26 Sep 2022 09:18:37 -0700 Subject: [PATCH 29/34] tuple: user defined type support (#8300) * tuple: add support for usertype as last part of tuple The expectation is user defined types come at the end as there is no delimiter standard. In that case, if we encounter a user defined type, we append it to the end of the tuple and allow users to transform that type by parsing raw string returned. * blob: use Tuple::unpackUserType() for key alignment We want blob granule key alignment to be able to understand user data types. This upgrades support and allows user data types to be the last key in a blob granule and for us to be able to parse that properly. --- fdbclient/Tuple.cpp | 105 +++++++++++++++++- fdbclient/include/fdbclient/Tuple.h | 17 ++- fdbserver/BlobManager.actor.cpp | 2 +- .../BlobGranuleCorrectnessWorkload.actor.cpp | 12 +- 4 files changed, 124 insertions(+), 12 deletions(-) diff --git a/fdbclient/Tuple.cpp b/fdbclient/Tuple.cpp index 1575a565ab..212223abd2 100644 --- a/fdbclient/Tuple.cpp +++ b/fdbclient/Tuple.cpp @@ -22,6 +22,8 @@ #include "flow/UnitTest.h" const uint8_t VERSIONSTAMP_96_CODE = 0x33; +const uint8_t USER_TYPE_START = 0x40; +const uint8_t USER_TYPE_END = 0x4f; // TODO: Many functions copied from bindings/flow/Tuple.cpp. Merge at some point. static float bigEndianFloat(float orig) { @@ -59,7 +61,7 @@ static void adjustFloatingPoint(uint8_t* bytes, size_t size, bool encode) { } } -Tuple::Tuple(StringRef const& str, bool exclude_incomplete) { +Tuple::Tuple(StringRef const& str, bool exclude_incomplete, bool include_user_type) { data.append(data.arena(), str.begin(), str.size()); size_t i = 0; @@ -80,6 +82,9 @@ Tuple::Tuple(StringRef const& str, bool exclude_incomplete) { i += 1; } else if (data[i] == VERSIONSTAMP_96_CODE) { i += VERSIONSTAMP_TUPLE_SIZE + 1; + } else if (include_user_type && isUserType(data[i])) { + // User defined codes must come at the end of a Tuple and are not delimited. + i = data.size(); } else { throw invalid_tuple_data_type(); } @@ -94,6 +99,14 @@ Tuple Tuple::unpack(StringRef const& str, bool exclude_incomplete) { return Tuple(str, exclude_incomplete); } +Tuple Tuple::unpackUserType(StringRef const& str, bool exclude_incomplete) { + return Tuple(str, exclude_incomplete, true); +} + +bool Tuple::isUserType(uint8_t code) const { + return code >= USER_TYPE_START && code <= USER_TYPE_END; +} + Tuple& Tuple::append(Tuple const& tuple) { for (size_t offset : tuple.offsets) { offsets.push_back(offset + data.size()); @@ -218,6 +231,15 @@ Tuple& Tuple::appendNull() { return append(nullptr); } +Tuple& Tuple::append(Tuple::UserTypeStr const& udt) { + offsets.push_back(data.size()); + ASSERT(isUserType(udt.code)); + data.push_back(data.arena(), udt.code); + data.append(data.arena(), udt.str.begin(), udt.str.size()); + + return *this; +} + Tuple::ElementType Tuple::getType(size_t index) const { if (index >= offsets.size()) { throw invalid_tuple_index(); @@ -241,6 +263,8 @@ Tuple::ElementType Tuple::getType(size_t index) const { return ElementType::BOOL; } else if (code == VERSIONSTAMP_96_CODE) { return ElementType::VERSIONSTAMP; + } else if (isUserType(code)) { + return ElementType::USER_TYPE; } else { throw invalid_tuple_data_type(); } @@ -401,6 +425,29 @@ Versionstamp Tuple::getVersionstamp(size_t index) const { return Versionstamp(StringRef(data.begin() + offsets[index] + 1, VERSIONSTAMP_TUPLE_SIZE)); } +Tuple::UserTypeStr Tuple::getUserType(size_t index) const { + // Valid index. + if (index >= offsets.size()) { + throw invalid_tuple_index(); + } + + // Valid user type code. + ASSERT_LT(offsets[index], data.size()); + uint8_t code = data[offsets[index]]; + if (!isUserType(code)) { + throw invalid_tuple_data_type(); + } + + size_t start = offsets[index] + 1; + + Standalone str; + VectorRef staging; + staging.append(str.arena(), data.begin() + start, data.size() - start); + str.StringRef::operator=(StringRef(staging.begin(), staging.size())); + + return Tuple::UserTypeStr(code, str); +} + KeyRange Tuple::range(Tuple const& tuple) const { VectorRef begin; VectorRef end; @@ -440,9 +487,16 @@ StringRef Tuple::subTupleRawString(size_t index) const { return StringRef(data.begin() + offsets[index], endPos - offsets[index]); } -TEST_CASE("fdbclient/Tuple/makeTuple") { - Tuple t1 = Tuple::makeTuple( - 1, 1.0f, 1.0, false, "byteStr"_sr, Tuple::UnicodeStr("str"_sr), nullptr, Versionstamp("000000000000"_sr)); +TEST_CASE("/fdbclient/Tuple/makeTuple") { + Tuple t1 = Tuple::makeTuple(1, + 1.0f, + 1.0, + false, + "byteStr"_sr, + Tuple::UnicodeStr("str"_sr), + nullptr, + Versionstamp("000000000000"_sr), + Tuple::UserTypeStr(0x41, "12345678"_sr)); Tuple t2 = Tuple() .append(1) .append(1.0f) @@ -451,7 +505,8 @@ TEST_CASE("fdbclient/Tuple/makeTuple") { .append("byteStr"_sr) .append(Tuple::UnicodeStr("str"_sr)) .append(nullptr) - .append(Versionstamp("000000000000"_sr)); + .append(Versionstamp("000000000000"_sr)) + .append(Tuple::UserTypeStr(0x41, "12345678"_sr)); ASSERT(t1.pack() == t2.pack()); ASSERT(t1.getType(0) == Tuple::INT); @@ -462,7 +517,45 @@ TEST_CASE("fdbclient/Tuple/makeTuple") { ASSERT(t1.getType(5) == Tuple::UTF8); ASSERT(t1.getType(6) == Tuple::NULL_TYPE); ASSERT(t1.getType(7) == Tuple::VERSIONSTAMP); - ASSERT(t1.size() == 8); + ASSERT(t1.getType(8) == Tuple::USER_TYPE); + ASSERT(t1.size() == 9); + + return Void(); +} + +TEST_CASE("/fdbclient/Tuple/unpack") { + Tuple t1 = Tuple::makeTuple(1, + 1.0f, + 1.0, + false, + "byteStr"_sr, + Tuple::UnicodeStr("str"_sr), + nullptr, + Versionstamp("000000000000"_sr), + Tuple::UserTypeStr(0x41, "12345678"_sr)); + + Standalone packed = t1.pack(); + Tuple t2 = Tuple::unpackUserType(packed); + ASSERT(t2.pack() == t1.pack()); + ASSERT(t2.getInt(0) == t1.getInt(0)); + ASSERT(t2.getFloat(1) == t1.getFloat(1)); + ASSERT(t2.getDouble(2) == t1.getDouble(2)); + ASSERT(t2.getBool(3) == t1.getBool(3)); + ASSERT(t2.getString(4) == t1.getString(4)); + ASSERT(t2.getString(5) == t1.getString(5)); + ASSERT(t2.getType(6) == Tuple::NULL_TYPE); + ASSERT(t2.getVersionstamp(7) == t1.getVersionstamp(7)); + ASSERT(t2.getUserType(8) == t1.getUserType(8)); + ASSERT(t2.size() == 9); + + try { + Tuple t3 = Tuple::unpack(packed); + ASSERT(false); + } catch (Error& e) { + if (e.code() != error_code_invalid_tuple_data_type) { + throw e; + } + } return Void(); } diff --git a/fdbclient/include/fdbclient/Tuple.h b/fdbclient/include/fdbclient/Tuple.h index b9de705d91..bf997e4309 100644 --- a/fdbclient/include/fdbclient/Tuple.h +++ b/fdbclient/include/fdbclient/Tuple.h @@ -33,6 +33,14 @@ struct Tuple { explicit UnicodeStr(StringRef str) : str(str) {} }; + struct UserTypeStr { + uint8_t code; + Standalone str; + UserTypeStr(uint8_t code, StringRef str) : code(code), str(str) {} + + bool operator==(const UserTypeStr& other) const { return (code == other.code && str == other.str); } + }; + Tuple() {} // Tuple parsing normally does not care of the final value is a numeric type and is incomplete. @@ -40,6 +48,7 @@ struct Tuple { // Note that strings can't be incomplete because they are parsed such that the end of the packed // byte string is considered the end of the string in lieu of a specific end. static Tuple unpack(StringRef const& str, bool exclude_incomplete = false); + static Tuple unpackUserType(StringRef const& str, bool exclude_incomplete = false); Tuple& append(Tuple const& tuple); @@ -55,6 +64,7 @@ struct Tuple { Tuple& append(std::nullptr_t); Tuple& appendNull(); Tuple& append(Versionstamp const&); + Tuple& append(UserTypeStr const&); Standalone pack() const { return Standalone(StringRef(data.begin(), data.size()), data.arena()); @@ -65,7 +75,9 @@ struct Tuple { return append(t); } - enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE, VERSIONSTAMP }; + enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE, VERSIONSTAMP, USER_TYPE }; + + bool isUserType(uint8_t code) const; // this is number of elements, not length of data size_t size() const { return offsets.size(); } @@ -85,6 +97,7 @@ struct Tuple { bool getBool(size_t index) const; float getFloat(size_t index) const; double getDouble(size_t index) const; + Tuple::UserTypeStr getUserType(size_t index) const; KeyRange range(Tuple const& tuple = Tuple()) const; @@ -107,7 +120,7 @@ struct Tuple { } private: - Tuple(const StringRef& data, bool exclude_incomplete = false); + Tuple(const StringRef& data, bool exclude_incomplete = false, bool exclude_user_type = false); Standalone> data; std::vector offsets; }; diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index 2a0b36c58f..dea4befdae 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -508,7 +508,7 @@ static void alignKeyBoundary(Reference bmData, alignedKey = alignedKey.removePrefix(tenantData->entry.prefix); } try { - t = Tuple::unpack(alignedKey, true); + t = Tuple::unpackUserType(alignedKey, true); if (t.size() > offset) { t2 = t.subTuple(0, t.size() - offset); alignedKey = t2.pack(); diff --git a/fdbserver/workloads/BlobGranuleCorrectnessWorkload.actor.cpp b/fdbserver/workloads/BlobGranuleCorrectnessWorkload.actor.cpp index b94b50ffe7..e185592e1e 100644 --- a/fdbserver/workloads/BlobGranuleCorrectnessWorkload.actor.cpp +++ b/fdbserver/workloads/BlobGranuleCorrectnessWorkload.actor.cpp @@ -129,7 +129,13 @@ struct ThreadData : ReferenceCounted, NonCopyable { } // TODO could make keys variable length? - Key getKey(uint32_t key, uint32_t id) { return Tuple().append((int64_t)key).append((int64_t)id).pack(); } + Key getKey(uint32_t key, uint32_t id) { + std::stringstream ss; + ss << std::setw(32) << std::setfill('0') << id; + Standalone str(ss.str()); + Tuple::UserTypeStr udt(0x41, str); + return Tuple::makeTuple((int64_t)key, udt).pack(); + } void validateGranuleBoundary(Key k, Key e, Key lastKey) { if (k == allKeys.begin || k == allKeys.end) { @@ -138,11 +144,11 @@ struct ThreadData : ReferenceCounted, NonCopyable { // Fully formed tuples are inserted. The expectation is boundaries should be a // sub-tuple of the inserted key. - Tuple t = Tuple::unpack(k, true); + Tuple t = Tuple::unpackUserType(k, true); if (SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET) { Tuple t2; try { - t2 = Tuple::unpack(lastKey); + t2 = Tuple::unpackUserType(lastKey); } catch (Error& e) { // Ignore being unable to parse lastKey as it may be a dummy key. } From 0f8820e03d675b22504013636f0c97bc4553a677 Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Mon, 26 Sep 2022 09:19:21 -0700 Subject: [PATCH 30/34] Fix RocksDB link issue (#8296) * Fix the RocksDB compile issue with clang By default, RocksDB is using its own compile/link flags, no matter how FDB flags are. This led to the issue that if FDB decides to use clang/ldd/libc++, RocksDB will pick up the compiler/linker but still use libstdc++, which is incompatible to libc++, causing Symobl Missing error during the link stage. With this patch, if FDB uses libc++, then the information is stored in CMAKE_CXX_FLAGS and being forwarded to RocksDB. RocksDB will then use libc++ and compatible with FDB. * fixup! update the indentation --- cmake/CompileRocksDB.cmake | 83 ++++++++++++++++------------------- cmake/ConfigureCompiler.cmake | 12 ++--- 2 files changed, 44 insertions(+), 51 deletions(-) diff --git a/cmake/CompileRocksDB.cmake b/cmake/CompileRocksDB.cmake index 3a67e0b017..3fdea389ab 100644 --- a/cmake/CompileRocksDB.cmake +++ b/cmake/CompileRocksDB.cmake @@ -4,31 +4,42 @@ find_package(RocksDB 6.27.3) include(ExternalProject) -if (RocksDB_FOUND) +set(RocksDB_CMAKE_ARGS + -DUSE_RTTI=1 + -DPORTABLE=${PORTABLE_ROCKSDB} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_SHARED_LINKER_FLAGS=${CMAKE_SHARED_LINKER_FLAGS} + -DCMAKE_STATIC_LINKER_FLAGS=${CMAKE_STATIC_LINKER_FLAGS} + -DCMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DFAIL_ON_WARNINGS=OFF + -DWITH_GFLAGS=OFF + -DWITH_TESTS=OFF + -DWITH_TOOLS=OFF + -DWITH_CORE_TOOLS=OFF + -DWITH_BENCHMARK_TOOLS=OFF + -DWITH_BZ2=OFF + -DWITH_LZ4=ON + -DWITH_SNAPPY=OFF + -DWITH_ZLIB=OFF + -DWITH_ZSTD=OFF + -DWITH_LIBURING=${WITH_LIBURING} + -DWITH_TSAN=${USE_TSAN} + -DWITH_ASAN=${USE_ASAN} + -DWITH_UBSAN=${USE_UBSAN} + -DROCKSDB_BUILD_SHARED=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=True +) + +if(ROCKSDB_FOUND) ExternalProject_Add(rocksdb SOURCE_DIR "${RocksDB_ROOT}" DOWNLOAD_COMMAND "" - CMAKE_ARGS -DUSE_RTTI=1 -DPORTABLE=${PORTABLE_ROCKSDB} - -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DFAIL_ON_WARNINGS=OFF - -DWITH_GFLAGS=OFF - -DWITH_TESTS=OFF - -DWITH_TOOLS=OFF - -DWITH_CORE_TOOLS=OFF - -DWITH_BENCHMARK_TOOLS=OFF - -DWITH_BZ2=OFF - -DWITH_LZ4=ON - -DWITH_SNAPPY=OFF - -DWITH_ZLIB=OFF - -DWITH_ZSTD=OFF - -DWITH_LIBURING=${WITH_LIBURING} - -DWITH_TSAN=${USE_TSAN} - -DWITH_ASAN=${USE_ASAN} - -DWITH_UBSAN=${USE_UBSAN} - -DROCKSDB_BUILD_SHARED=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=True + CMAKE_ARGS ${RocksDB_CMAKE_ARGS} BUILD_BYPRODUCTS /librocksdb.a INSTALL_COMMAND "" ) @@ -38,29 +49,9 @@ if (RocksDB_FOUND) ${BINARY_DIR}/librocksdb.a) else() ExternalProject_Add(rocksdb - URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz - URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58 - CMAKE_ARGS -DUSE_RTTI=1 -DPORTABLE=${PORTABLE_ROCKSDB} - -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DFAIL_ON_WARNINGS=OFF - -DWITH_GFLAGS=OFF - -DWITH_TESTS=OFF - -DWITH_TOOLS=OFF - -DWITH_CORE_TOOLS=OFF - -DWITH_BENCHMARK_TOOLS=OFF - -DWITH_BZ2=OFF - -DWITH_LZ4=ON - -DWITH_SNAPPY=OFF - -DWITH_ZLIB=OFF - -DWITH_ZSTD=OFF - -DWITH_LIBURING=${WITH_LIBURING} - -DWITH_TSAN=${USE_TSAN} - -DWITH_ASAN=${USE_ASAN} - -DWITH_UBSAN=${USE_UBSAN} - -DROCKSDB_BUILD_SHARED=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=True + URL https://github.com/facebook/rocksdb/archive/refs/tags/v6.27.3.tar.gz + URL_HASH SHA256=ee29901749b9132692b26f0a6c1d693f47d1a9ed8e3771e60556afe80282bf58 + CMAKE_ARGS ${RocksDB_CMAKE_ARGS} BUILD_BYPRODUCTS /librocksdb.a INSTALL_COMMAND "" ) @@ -70,7 +61,7 @@ else() ${BINARY_DIR}/librocksdb.a) ExternalProject_Get_Property(rocksdb SOURCE_DIR) - set (ROCKSDB_INCLUDE_DIR "${SOURCE_DIR}/include") + set(ROCKSDB_INCLUDE_DIR "${SOURCE_DIR}/include") set(ROCKSDB_FOUND TRUE) endif() diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index e59129457a..bb31ba66c0 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -292,19 +292,21 @@ else() #add_compile_options(-fno-builtin-memcpy) if (CLANG OR ICX) - add_compile_options() if (APPLE OR USE_LIBCXX) - add_compile_options($<$:-stdlib=libc++>) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") if (NOT APPLE) if (STATIC_LINK_LIBCXX) - add_link_options(-static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -nostdlib++ -Wl,-Bstatic -lc++ -lc++abi -Wl,-Bdynamic") endif() - add_link_options(-stdlib=libc++ -Wl,-build-id=sha1) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -Wl,-build-id=sha1") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++ -Wl,-build-id=sha1") endif() endif() if (NOT APPLE AND NOT USE_LIBCXX) message(STATUS "Linking libatomic") - add_link_options(-latomic) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -latomic") endif() if (OPEN_FOR_IDE) add_compile_options( From a900d8dfa971f916b97af94183b647539059c1e0 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 26 Sep 2022 09:19:38 -0700 Subject: [PATCH 31/34] increase the target blob worker lag to account for the slow startup time when a blob worker reboots (#8294) --- fdbclient/ServerKnobs.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 067e5f624f..c0b9a38d0a 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -688,8 +688,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( BW_THROTTLING_ENABLED, true ); bool buggifySmallBWLag = randomize && BUGGIFY; - init( TARGET_BW_LAG, 50.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0; - init( TARGET_BW_LAG_BATCH, 30.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0; + init( TARGET_BW_LAG, 240.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0; + init( TARGET_BW_LAG_BATCH, 200.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0; init( TARGET_BW_LAG_UPDATE, 9.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_UPDATE = 1.0; init( MIN_BW_HISTORY, 10 ); init( BW_ESTIMATION_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_ESTIMATION_INTERVAL = 2.0; From 86c27754fb281633853b0331f8934e9206dd80be Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 26 Sep 2022 09:19:53 -0700 Subject: [PATCH 32/34] Fixed a bug where rollbackVersion >= data->storageVersion() (#8290) * fix: the log router could see popped() versions between recoveredAt and the recovery txn version which cause remote tlogs to be stopped at a version in this range, which causes storage servers to read at versions in this range * fix: a spurious correctness bug resulted from the consistency check running during a recovery --- fdbserver/ClusterRecovery.actor.cpp | 2 +- fdbserver/LogRouter.actor.cpp | 2 +- fdbserver/workloads/ConsistencyCheck.actor.cpp | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fdbserver/ClusterRecovery.actor.cpp b/fdbserver/ClusterRecovery.actor.cpp index d211f16060..946a5913b7 100644 --- a/fdbserver/ClusterRecovery.actor.cpp +++ b/fdbserver/ClusterRecovery.actor.cpp @@ -1112,7 +1112,7 @@ ACTOR Future readTransactionSystemState(Reference sel self->recoveryTransactionVersion += deterministicRandom()->randomInt64(0, 10000000); } - TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERED_EVENT_NAME).c_str(), + TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_RECOVERING_EVENT_NAME).c_str(), self->dbgid) .detail("LastEpochEnd", self->lastEpochEnd) .detail("RecoveryTransactionVersion", self->recoveryTransactionVersion); diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 08485b4722..2ee606e6fd 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -365,7 +365,7 @@ ACTOR Future pullAsyncData(LogRouterData* self) { if (!foundMessage) { ver--; // ver is the next possible version we will get data for - if (ver > self->version.get()) { + if (ver > self->version.get() && ver >= r->popped()) { wait(waitForVersion(self, ver)); self->version.set(ver); diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index b159905480..bf1d110e10 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -1109,6 +1109,7 @@ struct ConsistencyCheckWorkload : TestWorkload { } if (foundExtraDataStore) { + wait(delay(10)); // let the cluster get to fully_recovered after the reboot before retrying self->testFailure("Extra data stores present on workers"); return false; } From cc5ed5526335def1d77ddbaee30479e9ec18f8c1 Mon Sep 17 00:00:00 2001 From: Dan Adkins <105679810+sfc-gh-dadkins@users.noreply.github.com> Date: Mon, 26 Sep 2022 12:38:03 -0400 Subject: [PATCH 33/34] Change constructor initialization order to silence -Wreoder-ctor. (#7984) --- fdbserver/include/fdbserver/DataDistribution.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/include/fdbserver/DataDistribution.actor.h b/fdbserver/include/fdbserver/DataDistribution.actor.h index 67e3b57f67..bb53c23a63 100644 --- a/fdbserver/include/fdbserver/DataDistribution.actor.h +++ b/fdbserver/include/fdbserver/DataDistribution.actor.h @@ -248,7 +248,7 @@ FDB_DECLARE_BOOLEAN_PARAM(MoveKeyRangeOutPhysicalShard); class PhysicalShardCollection : public ReferenceCounted { public: - PhysicalShardCollection() : requireTransition(false), lastTransitionStartTime(now()) {} + PhysicalShardCollection() : lastTransitionStartTime(now()), requireTransition(false) {} enum class PhysicalShardCreationTime { DDInit, DDRelocator }; From 1d46e88ea28a9524cf1d62a74f97123242a0dc13 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Mon, 26 Sep 2022 11:39:01 -0500 Subject: [PATCH 34/34] fixing protocol versions for restarting tests (#8093) --- flow/ProtocolVersions.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flow/ProtocolVersions.cmake b/flow/ProtocolVersions.cmake index b66ca6f674..64a830ee21 100644 --- a/flow/ProtocolVersions.cmake +++ b/flow/ProtocolVersions.cmake @@ -81,11 +81,11 @@ set(FDB_PV_NETWORK_ADDRESS_HOSTNAME_FLAG "0x0FDB00B071010000LL") set(FDB_PV_STORAGE_METADATA "0x0FDB00B071010000LL") set(FDB_PV_PERPETUAL_WIGGLE_METADATA "0x0FDB00B071010000LL") set(FDB_PV_STORAGE_INTERFACE_READINESS "0x0FDB00B071010000LL") -set(FDB_PV_RESOLVER_PRIVATE_MUTATIONS "0x0FDB00B071010000LL") +set(FDB_PV_TENANTS "0x0FDB00B071010000LL") +set(FDB_PV_RESOLVER_PRIVATE_MUTATIONS "0x0FDB00B072000000LL") set(FDB_PV_OTEL_SPAN_CONTEXT "0x0FDB00B072000000LL") set(FDB_PV_SW_VERSION_TRACKING "0x0FDB00B072000000LL") set(FDB_PV_ENCRYPTION_AT_REST "0x0FDB00B072000000LL") set(FDB_PV_SHARD_ENCODE_LOCATION_METADATA "0x0FDB00B072000000LL") -set(FDB_PV_TENANTS "0x0FDB00B072000000LL") set(FDB_PV_BLOB_GRANULE_FILE "0x0FDB00B072000000LL") -set(FDB_PV_CLUSTER_ID_SPECIAL_KEY "0x0FDB00B072000000LL") \ No newline at end of file +set(FDB_PV_CLUSTER_ID_SPECIAL_KEY "0x0FDB00B072000000LL")