From 5a717b3ba96859a41a809cb56c6de85a93ce17fb Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Tue, 24 Nov 2020 16:27:21 -0800 Subject: [PATCH 01/18] Change dd_tracker_cancelled to 1215 to avoid conflict This is safe because it's not been in a release, and it doesn't get serialized. --- flow/error_definitions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 6caa76050f..17abec2a8e 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -86,7 +86,7 @@ ERROR( please_reboot_delete, 1208, "Reboot of server process requested, with del ERROR( master_proxy_failed, 1209, "Master terminating because a Proxy failed" ) ERROR( master_resolver_failed, 1210, "Master terminating because a Resolver failed" ) ERROR( server_overloaded, 1211, "Server is under too much load and cannot respond" ) -ERROR(dd_tracker_cancelled, 1212, "The data distribution tracker has been cancelled") +ERROR( dd_tracker_cancelled, 1215, "The data distribution tracker has been cancelled" ) // 15xx Platform errors ERROR( platform_error, 1500, "Platform error" ) From 78c82dc891c2eb68b5147b7c01de3db4f93a1c37 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 25 Nov 2020 23:05:26 -0800 Subject: [PATCH 02/18] Skip transactions that queued for a long time --- fdbserver/MasterProxyServer.actor.cpp | 35 +++++++++++++++++++++------ 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 0387a27582..b14fe4abc9 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -612,6 +612,7 @@ ACTOR Future commitBatch( if (debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.Before"); + state double timeStart = g_network->timer(); if(localBatchNumber-self->latestLocalCommitBatchResolving.get()>SERVER_KNOBS->RESET_MASTER_BATCHES && now()-self->lastMasterReset>SERVER_KNOBS->RESET_MASTER_DELAY) { TraceEvent(SevWarnAlways, "ConnectionResetMaster", self->dbgid) @@ -625,6 +626,23 @@ ACTOR Future commitBatch( /////// Phase 1: Pre-resolution processing (CPU bound except waiting for a version # which is separately pipelined and *should* be available by now (unless empty commit); ordered; currently atomic but could yield) TEST(self->latestLocalCommitBatchResolving.get() < localBatchNumber-1); // Queuing pre-resolution commit processing wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1)); + double queuingDelay = g_network->timer() - timeStart; + state bool rejectBatch = false; + if (queuingDelay > SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND || + (BUGGIFY && g_network->isSimulated() && deterministicRandom()->random01() < 0.01)) { + TEST(true); // Reject transactions in the batch + rejectBatch = true; + + for (const auto& tr : trs) { + tr.reply.sendError(not_committed()); + } + self->latestLocalCommitBatchResolving.set(localBatchNumber); + self->latestLocalCommitBatchLogging.set(localBatchNumber); + ++self->stats.commitBatchOut; + self->stats.txnCommitOut += trs.size(); + return Void(); + } + state Future releaseDelay = delay(std::min(SERVER_KNOBS->MAX_PROXY_COMPUTE, batchOperations*self->commitComputePerOperation[latencyBucket]), TaskPriority::ProxyMasterVersionReply); if (debugID.present()) @@ -654,7 +672,7 @@ ACTOR Future commitBatch( ResolutionRequestBuilder requests( self, commitVersion, prevVersion, self->version ); int conflictRangeCount = 0; state int64_t maxTransactionBytes = 0; - for (int t = 0; tdbgid).detail("Snapshot", trs[t].transaction.read_snapshot); @@ -767,9 +785,9 @@ ACTOR Future commitBatch( // Determine which transactions actually committed (conservatively) by combining results from the resolvers state vector committed(trs.size()); - ASSERT(transactionResolverMap.size() == committed.size()); + ASSERT(rejectBatch || transactionResolverMap.size() == committed.size()); vector nextTr(resolution.size()); - for (int t = 0; t commitBatch( state bool locked = lockedKey.present() && lockedKey.get().size(); state Optional mustContainSystemKey = self->txnStateStore->readValue(mustContainSystemMutationsKey).get(); - if(mustContainSystemKey.present() && mustContainSystemKey.get().size()) { + if(mustContainSystemKey.present() && mustContainSystemKey.get().size() && !rejectBatch) { for (int t = 0; t commitBatch( // This first pass through committed transactions deals with "metadata" effects (modifications of txnStateStore, changes to storage servers' responsibilities) int t; state int commitCount = 0; - for (t = 0; t < trs.size() && !forceRecovery; t++) + for (t = 0; t < trs.size() && !forceRecovery && !rejectBatch; t++) { if (committed[t] == ConflictBatch::TransactionCommitted && (!locked || trs[t].isLockAware())) { commitCount++; @@ -819,13 +837,14 @@ ACTOR Future commitBatch( if(firstStateMutations) { ASSERT(committed[t] == ConflictBatch::TransactionCommitted); firstStateMutations = false; - forceRecovery = false; + // forceRecovery = false; } } - if (forceRecovery) { + if (forceRecovery || rejectBatch) { for (; tdbgid).detail("Stage", "AwaitCommit"); + if (forceRecovery) + TraceEvent(SevWarn, "RestartingTxnSubsystem", self->dbgid).detail("Stage", "AwaitCommit"); } lockedKey = self->txnStateStore->readValue(databaseLockedKey).get(); From 5cb0b138beb1b1539f225ad20db4afb15b4421f1 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 27 Nov 2020 09:45:52 -0800 Subject: [PATCH 03/18] Don't reject transactions for the recovery transaction Otherwise, the recovery just keeps repeating. --- fdbserver/MasterProxyServer.actor.cpp | 34 +++++++++++++++------------ 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index b14fe4abc9..5f3e75504f 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -627,19 +627,24 @@ ACTOR Future commitBatch( TEST(self->latestLocalCommitBatchResolving.get() < localBatchNumber-1); // Queuing pre-resolution commit processing wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1)); double queuingDelay = g_network->timer() - timeStart; - state bool rejectBatch = false; - if (queuingDelay > SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND || - (BUGGIFY && g_network->isSimulated() && deterministicRandom()->random01() < 0.01)) { + if (queuingDelay > (double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND || + (BUGGIFY && g_network->isSimulated() && deterministicRandom()->random01() < 0.01 && trs.size() > 0 && + !trs[0].transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff")))) { + // Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries. TEST(true); // Reject transactions in the batch - rejectBatch = true; + TraceEvent("ProxyReject", self->dbgid).detail("Delay", queuingDelay).detail("N", trs.size()); + for (const auto m : trs[0].transaction.mutations) { + TraceEvent("ProxyReject", self->dbgid).detail("Mutation", m.toString()); + } + ASSERT(self->latestLocalCommitBatchResolving.get() == localBatchNumber - 1); + self->latestLocalCommitBatchResolving.set(localBatchNumber); + wait(self->latestLocalCommitBatchLogging.whenAtLeast(localBatchNumber-1)); + self->latestLocalCommitBatchLogging.set(localBatchNumber); for (const auto& tr : trs) { tr.reply.sendError(not_committed()); } - self->latestLocalCommitBatchResolving.set(localBatchNumber); - self->latestLocalCommitBatchLogging.set(localBatchNumber); ++self->stats.commitBatchOut; - self->stats.txnCommitOut += trs.size(); return Void(); } @@ -672,7 +677,7 @@ ACTOR Future commitBatch( ResolutionRequestBuilder requests( self, commitVersion, prevVersion, self->version ); int conflictRangeCount = 0; state int64_t maxTransactionBytes = 0; - for (int t = 0; tdbgid).detail("Snapshot", trs[t].transaction.read_snapshot); @@ -785,9 +790,9 @@ ACTOR Future commitBatch( // Determine which transactions actually committed (conservatively) by combining results from the resolvers state vector committed(trs.size()); - ASSERT(rejectBatch || transactionResolverMap.size() == committed.size()); + ASSERT(transactionResolverMap.size() == committed.size()); vector nextTr(resolution.size()); - for (int t = 0; t commitBatch( state bool locked = lockedKey.present() && lockedKey.get().size(); state Optional mustContainSystemKey = self->txnStateStore->readValue(mustContainSystemMutationsKey).get(); - if(mustContainSystemKey.present() && mustContainSystemKey.get().size() && !rejectBatch) { + if(mustContainSystemKey.present() && mustContainSystemKey.get().size()) { for (int t = 0; t commitBatch( // This first pass through committed transactions deals with "metadata" effects (modifications of txnStateStore, changes to storage servers' responsibilities) int t; state int commitCount = 0; - for (t = 0; t < trs.size() && !forceRecovery && !rejectBatch; t++) + for (t = 0; t < trs.size() && !forceRecovery; t++) { if (committed[t] == ConflictBatch::TransactionCommitted && (!locked || trs[t].isLockAware())) { commitCount++; @@ -840,11 +845,10 @@ ACTOR Future commitBatch( // forceRecovery = false; } } - if (forceRecovery || rejectBatch) { + if (forceRecovery) { for (; tdbgid).detail("Stage", "AwaitCommit"); + TraceEvent(SevWarn, "RestartingTxnSubsystem", self->dbgid).detail("Stage", "AwaitCommit"); } lockedKey = self->txnStateStore->readValue(databaseLockedKey).get(); From df5293e2bebb60e8bdcfb3e5f35cf5d430acd333 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Sat, 28 Nov 2020 19:58:39 -0800 Subject: [PATCH 04/18] Add a knob PROXY_REJECT_BATCH_QUEUED_TOO_LONG Disable the proxy rejection feature for backup workload, because of the ApplyMutationsError. --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/MasterProxyServer.actor.cpp | 20 +++++++++++++------ fdbserver/workloads/AtomicRestore.actor.cpp | 4 ++++ .../workloads/AtomicSwitchover.actor.cpp | 5 +++++ fdbserver/workloads/BackupToDBAbort.actor.cpp | 5 +++++ .../workloads/BackupToDBCorrectness.actor.cpp | 5 +++++ .../workloads/BackupToDBUpgrade.actor.cpp | 5 +++++ 8 files changed, 40 insertions(+), 6 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 8431be5149..79b4986dce 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -344,6 +344,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula init( MAX_PROXY_COMPUTE, 2.0 ); init( PROXY_COMPUTE_BUCKETS, 20000 ); init( PROXY_COMPUTE_GROWTH_RATE, 0.01 ); + init( PROXY_REJECT_BATCH_QUEUED_TOO_LONG, true ); init( RESET_MASTER_BATCHES, 200 ); init( RESET_RESOLVER_BATCHES, 200 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 0a5168397d..2047a0439d 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -289,6 +289,7 @@ public: double MAX_PROXY_COMPUTE; int PROXY_COMPUTE_BUCKETS; double PROXY_COMPUTE_GROWTH_RATE; + bool PROXY_REJECT_BATCH_QUEUED_TOO_LONG; int RESET_MASTER_BATCHES; int RESET_RESOLVER_BATCHES; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 5f3e75504f..63df826cbb 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -627,19 +627,27 @@ ACTOR Future commitBatch( TEST(self->latestLocalCommitBatchResolving.get() < localBatchNumber-1); // Queuing pre-resolution commit processing wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1)); double queuingDelay = g_network->timer() - timeStart; - if (queuingDelay > (double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND || - (BUGGIFY && g_network->isSimulated() && deterministicRandom()->random01() < 0.01 && trs.size() > 0 && - !trs[0].transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff")))) { + if ((queuingDelay > (double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND || + (BUGGIFY && g_network->isSimulated() && deterministicRandom()->random01() < 0.01)) && + SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && + trs.size() > 0 && !trs[0].transaction.mutations.empty() && !trs[0].transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff"))) { // Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries. TEST(true); // Reject transactions in the batch - TraceEvent("ProxyReject", self->dbgid).detail("Delay", queuingDelay).detail("N", trs.size()); - for (const auto m : trs[0].transaction.mutations) { - TraceEvent("ProxyReject", self->dbgid).detail("Mutation", m.toString()); + TraceEvent("ProxyReject", self->dbgid).detail("Delay", queuingDelay).detail("N", trs.size()).detail("BatchNumber", localBatchNumber); + int i = 0; + for (const auto tr : trs) { + int j = 0; + for (const auto& m : tr.transaction.mutations) { + TraceEvent("ProxyReject", self->dbgid).detail("T", i).detail("M", j).detail("Mutation", m.toString()); + j++; + } + i++; } ASSERT(self->latestLocalCommitBatchResolving.get() == localBatchNumber - 1); self->latestLocalCommitBatchResolving.set(localBatchNumber); wait(self->latestLocalCommitBatchLogging.whenAtLeast(localBatchNumber-1)); + ASSERT(self->latestLocalCommitBatchLogging.get() == localBatchNumber - 1); self->latestLocalCommitBatchLogging.set(localBatchNumber); for (const auto& tr : trs) { tr.reply.sendError(not_committed()); diff --git a/fdbserver/workloads/AtomicRestore.actor.cpp b/fdbserver/workloads/AtomicRestore.actor.cpp index 4537970f7a..bcd1e78d25 100644 --- a/fdbserver/workloads/AtomicRestore.actor.cpp +++ b/fdbserver/workloads/AtomicRestore.actor.cpp @@ -61,6 +61,9 @@ struct AtomicRestoreWorkload : TestWorkload { ACTOR static Future _start(Database cx, AtomicRestoreWorkload* self) { state FileBackupAgent backupAgent; + // Disable proxy rejection + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; + wait( delay(self->startAfter * deterministicRandom()->random01()) ); TraceEvent("AtomicRestore_Start"); @@ -105,6 +108,7 @@ struct AtomicRestoreWorkload : TestWorkload { } TraceEvent("AtomicRestore_Done"); + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; return Void(); } }; diff --git a/fdbserver/workloads/AtomicSwitchover.actor.cpp b/fdbserver/workloads/AtomicSwitchover.actor.cpp index 02fcb77866..dc36a82a86 100644 --- a/fdbserver/workloads/AtomicSwitchover.actor.cpp +++ b/fdbserver/workloads/AtomicSwitchover.actor.cpp @@ -152,6 +152,9 @@ struct AtomicSwitchoverWorkload : TestWorkload { state DatabaseBackupAgent backupAgent(cx); state DatabaseBackupAgent restoreAgent(self->extraDB); + // Disable proxy rejection to avoid ApplyMutationsError + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; + TraceEvent("AS_Wait1"); wait(success( backupAgent.waitBackup(self->extraDB, BackupAgentBase::getDefaultTag(), false) )); TraceEvent("AS_Ready1"); @@ -177,6 +180,8 @@ struct AtomicSwitchoverWorkload : TestWorkload { g_simulator.drAgents = ISimulator::NoBackupAgents; } + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; + return Void(); } }; diff --git a/fdbserver/workloads/BackupToDBAbort.actor.cpp b/fdbserver/workloads/BackupToDBAbort.actor.cpp index 7d242360ec..20cfb7ae1e 100644 --- a/fdbserver/workloads/BackupToDBAbort.actor.cpp +++ b/fdbserver/workloads/BackupToDBAbort.actor.cpp @@ -22,6 +22,8 @@ #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/Knobs.h" + #include "flow/actorcompiler.h" // This must be the last #include. struct BackupToDBAbort : TestWorkload { @@ -54,6 +56,8 @@ struct BackupToDBAbort : TestWorkload { ACTOR static Future _setup(BackupToDBAbort* self, Database cx) { state DatabaseBackupAgent backupAgent(cx); try { + // Disable proxy rejection to avoid ApplyMutationsError + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; TraceEvent("BDBA_Submit1"); wait( backupAgent.submitBackup(self->extraDB, BackupAgentBase::getDefaultTag(), self->backupRanges, false, StringRef(), StringRef(), true) ); TraceEvent("BDBA_Submit2"); @@ -61,6 +65,7 @@ struct BackupToDBAbort : TestWorkload { if( e.code() != error_code_backup_duplicate ) throw; } + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; return Void(); } diff --git a/fdbserver/workloads/BackupToDBCorrectness.actor.cpp b/fdbserver/workloads/BackupToDBCorrectness.actor.cpp index bb1ced25eb..11fd9642d9 100644 --- a/fdbserver/workloads/BackupToDBCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupToDBCorrectness.actor.cpp @@ -442,6 +442,9 @@ struct BackupToDBCorrectnessWorkload : TestWorkload { TraceEvent("BARW_Arguments").detail("BackupTag", printable(self->backupTag)).detail("BackupAfter", self->backupAfter) .detail("AbortAndRestartAfter", self->abortAndRestartAfter).detail("DifferentialAfter", self->stopDifferentialAfter); + // Disable proxy rejection to avoid ApplyMutationsError + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; + state UID randomID = nondeterministicRandom()->randomUniqueID(); // Increment the backup agent requets @@ -575,6 +578,8 @@ struct BackupToDBCorrectnessWorkload : TestWorkload { throw; } + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; + return Void(); } }; diff --git a/fdbserver/workloads/BackupToDBUpgrade.actor.cpp b/fdbserver/workloads/BackupToDBUpgrade.actor.cpp index 3f2afcc717..3a324e6b1f 100644 --- a/fdbserver/workloads/BackupToDBUpgrade.actor.cpp +++ b/fdbserver/workloads/BackupToDBUpgrade.actor.cpp @@ -346,6 +346,9 @@ struct BackupToDBUpgradeWorkload : TestWorkload { state UID logUid; state Version commitVersion; + // Disable proxy rejection to avoid ApplyMutationsError + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; + state Future stopDifferential = delay(self->stopDifferentialAfter); state Future waitUpgrade = backupAgent.waitUpgradeToLatestDrVersion(self->extraDB, self->backupTag); wait(success(stopDifferential) && success(waitUpgrade)); @@ -462,6 +465,8 @@ struct BackupToDBUpgradeWorkload : TestWorkload { throw; } + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; + return Void(); } }; From 589ee0197930061b91ffe8aaaf3d975390f28d28 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Sun, 29 Nov 2020 09:45:55 -0800 Subject: [PATCH 05/18] Fix backup workload w.r.t. proxy rejection --- fdbserver/MasterProxyServer.actor.cpp | 10 ---------- fdbserver/workloads/BackupCorrectness.actor.cpp | 2 ++ 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 63df826cbb..2ff54a6249 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -634,15 +634,6 @@ ACTOR Future commitBatch( // Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries. TEST(true); // Reject transactions in the batch TraceEvent("ProxyReject", self->dbgid).detail("Delay", queuingDelay).detail("N", trs.size()).detail("BatchNumber", localBatchNumber); - int i = 0; - for (const auto tr : trs) { - int j = 0; - for (const auto& m : tr.transaction.mutations) { - TraceEvent("ProxyReject", self->dbgid).detail("T", i).detail("M", j).detail("Mutation", m.toString()); - j++; - } - i++; - } ASSERT(self->latestLocalCommitBatchResolving.get() == localBatchNumber - 1); self->latestLocalCommitBatchResolving.set(localBatchNumber); @@ -850,7 +841,6 @@ ACTOR Future commitBatch( if(firstStateMutations) { ASSERT(committed[t] == ConflictBatch::TransactionCommitted); firstStateMutations = false; - // forceRecovery = false; } } if (forceRecovery) { diff --git a/fdbserver/workloads/BackupCorrectness.actor.cpp b/fdbserver/workloads/BackupCorrectness.actor.cpp index ddf7e879f3..bf318055c0 100644 --- a/fdbserver/workloads/BackupCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupCorrectness.actor.cpp @@ -364,6 +364,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { state FileBackupAgent backupAgent; state Future extraBackup; state bool extraTasks = false; + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; TraceEvent("BARW_Arguments").detail("BackupTag", printable(self->backupTag)).detail("PerformRestore", self->performRestore) .detail("BackupAfter", self->backupAfter).detail("RestoreAfter", self->restoreAfter) .detail("AbortAndRestartAfter", self->abortAndRestartAfter).detail("DifferentialAfter", self->stopDifferentialAfter); @@ -661,6 +662,7 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { TraceEvent(SevError, "BackupAndRestoreCorrectness").error(e).GetLastError(); throw; } + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; return Void(); } }; From da55d7388558105d907c863b7034113a230c39d2 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Sun, 29 Nov 2020 14:07:01 -0800 Subject: [PATCH 06/18] Fix BackupToDBAbort workload w.r.t. proxy rejection --- fdbserver/workloads/BackupToDBAbort.actor.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbserver/workloads/BackupToDBAbort.actor.cpp b/fdbserver/workloads/BackupToDBAbort.actor.cpp index 20cfb7ae1e..60c73620b7 100644 --- a/fdbserver/workloads/BackupToDBAbort.actor.cpp +++ b/fdbserver/workloads/BackupToDBAbort.actor.cpp @@ -56,8 +56,6 @@ struct BackupToDBAbort : TestWorkload { ACTOR static Future _setup(BackupToDBAbort* self, Database cx) { state DatabaseBackupAgent backupAgent(cx); try { - // Disable proxy rejection to avoid ApplyMutationsError - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; TraceEvent("BDBA_Submit1"); wait( backupAgent.submitBackup(self->extraDB, BackupAgentBase::getDefaultTag(), self->backupRanges, false, StringRef(), StringRef(), true) ); TraceEvent("BDBA_Submit2"); @@ -65,7 +63,6 @@ struct BackupToDBAbort : TestWorkload { if( e.code() != error_code_backup_duplicate ) throw; } - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; return Void(); } @@ -77,6 +74,9 @@ struct BackupToDBAbort : TestWorkload { ACTOR static Future _start(BackupToDBAbort* self, Database cx) { state DatabaseBackupAgent backupAgent(cx); + // Disable proxy rejection to avoid ApplyMutationsError + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; + TraceEvent("BDBA_Start").detail("Delay", self->abortDelay); wait(delay(self->abortDelay)); TraceEvent("BDBA_Wait"); @@ -94,6 +94,7 @@ struct BackupToDBAbort : TestWorkload { g_simulator.drAgents = ISimulator::NoBackupAgents; } + const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; return Void(); } From 6bf1e5f3f918f35f0c891b23b485a6c2e6a7fffe Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Sun, 29 Nov 2020 14:16:10 -0800 Subject: [PATCH 07/18] Update release notes --- .../sphinx/source/release-notes/release-notes-620.rst | 1 + fdbserver/MasterProxyServer.actor.cpp | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-620.rst b/documentation/sphinx/source/release-notes/release-notes-620.rst index 1d74690caf..82f417717e 100644 --- a/documentation/sphinx/source/release-notes/release-notes-620.rst +++ b/documentation/sphinx/source/release-notes/release-notes-620.rst @@ -8,6 +8,7 @@ Release Notes ====== * Fix invalid memory access on data distributor when snapshotting large clusters. `(PR #4076) `_ * Add human-readable DateTime to trace events `(PR #4087) `_ +* Proxy rejects transaction batch that exceeds MVCC window `(PR #4113) `_ 6.2.28 ====== diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 2ff54a6249..fcb8561510 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -633,7 +633,10 @@ ACTOR Future commitBatch( trs.size() > 0 && !trs[0].transaction.mutations.empty() && !trs[0].transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff"))) { // Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries. TEST(true); // Reject transactions in the batch - TraceEvent("ProxyReject", self->dbgid).detail("Delay", queuingDelay).detail("N", trs.size()).detail("BatchNumber", localBatchNumber); + TraceEvent(SevWarnAlways, "ProxyReject", self->dbgid) + .detail("QDelay", queuingDelay) + .detail("Transactions", trs.size()) + .detail("BatchNumber", localBatchNumber); ASSERT(self->latestLocalCommitBatchResolving.get() == localBatchNumber - 1); self->latestLocalCommitBatchResolving.set(localBatchNumber); @@ -644,6 +647,7 @@ ACTOR Future commitBatch( tr.reply.sendError(not_committed()); } ++self->stats.commitBatchOut; + self->stats.txnConflicts += trs.size(); return Void(); } From 3d9ac0b050f92af0e9e203ce06d6761a0842eb39 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Mon, 30 Nov 2020 09:29:18 -0800 Subject: [PATCH 08/18] Address review comments --- fdbserver/MasterProxyServer.actor.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index fcb8561510..7b40ec7a51 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -612,7 +612,7 @@ ACTOR Future commitBatch( if (debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.Before"); - state double timeStart = g_network->timer(); + state double timeStart = now(); if(localBatchNumber-self->latestLocalCommitBatchResolving.get()>SERVER_KNOBS->RESET_MASTER_BATCHES && now()-self->lastMasterReset>SERVER_KNOBS->RESET_MASTER_DELAY) { TraceEvent(SevWarnAlways, "ConnectionResetMaster", self->dbgid) @@ -628,12 +628,13 @@ ACTOR Future commitBatch( wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1)); double queuingDelay = g_network->timer() - timeStart; if ((queuingDelay > (double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND || - (BUGGIFY && g_network->isSimulated() && deterministicRandom()->random01() < 0.01)) && - SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && - trs.size() > 0 && !trs[0].transaction.mutations.empty() && !trs[0].transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff"))) { + (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01))) && + SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && trs.size() > 0 && !trs[0].transaction.mutations.empty() && + !trs[0].transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff"))) { // Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries. TEST(true); // Reject transactions in the batch TraceEvent(SevWarnAlways, "ProxyReject", self->dbgid) + .suppressFor(0.1) .detail("QDelay", queuingDelay) .detail("Transactions", trs.size()) .detail("BatchNumber", localBatchNumber); @@ -644,7 +645,7 @@ ACTOR Future commitBatch( ASSERT(self->latestLocalCommitBatchLogging.get() == localBatchNumber - 1); self->latestLocalCommitBatchLogging.set(localBatchNumber); for (const auto& tr : trs) { - tr.reply.sendError(not_committed()); + tr.reply.sendError(transaction_too_old()); } ++self->stats.commitBatchOut; self->stats.txnConflicts += trs.size(); From 52d238967f5ffb29e72c631fd74f1c590b3eaf5d Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Mon, 30 Nov 2020 21:06:56 -0800 Subject: [PATCH 09/18] Allow backup's "ApplyMutations" to bypass rejection These mutations should always commit. --- fdbserver/MasterProxyServer.actor.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 7b40ec7a51..fcf9393ab2 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -567,6 +567,19 @@ ACTOR Future releaseResolvingAfter(ProxyCommitData* self, Future rel return Void(); } +// Try to identify recovery transaction and backup's apply mutations. +// Both cannot be rejected and are approximated by looking at first mutation +// starting with 0xff. +bool canReject(const std::vector& trs) { + for (const auto& tr : trs) { + if (tr.transaction.mutations.empty()) continue; + if (tr.transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff"))) { + return false; + } + } + return true; +} + ACTOR Future commitBatch( ProxyCommitData* self, vector trs, @@ -629,8 +642,7 @@ ACTOR Future commitBatch( double queuingDelay = g_network->timer() - timeStart; if ((queuingDelay > (double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND || (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01))) && - SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && trs.size() > 0 && !trs[0].transaction.mutations.empty() && - !trs[0].transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff"))) { + SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && canReject(trs)) { // Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries. TEST(true); // Reject transactions in the batch TraceEvent(SevWarnAlways, "ProxyReject", self->dbgid) From 245cf28ace876a76f02f7bc20910e9e9bb97594e Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Thu, 3 Dec 2020 02:12:55 -0500 Subject: [PATCH 10/18] Updated docker repositories to Vault repos --- build/Dockerfile | 34 ++++++++++++++++++++++------------ build/Dockerfile.devel | 6 +++--- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index b4cf28ebc0..b230eb4de4 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -1,17 +1,27 @@ FROM centos:6 +# Clean yum cache, disable default Base repo and enable Vault +RUN yum clean all &&\ + sed -i -e 's/gpgcheck=1/enabled=0/g' /etc/yum.repos.d/CentOS-Base.repo &&\ + sed -i -e 's/enabled=0/enabled=1/g' /etc/yum.repos.d/CentOS-Vault.repo &&\ + sed -i -n '/6.1/q;p' /etc/yum.repos.d/CentOS-Vault.repo &&\ + sed -i -e "s/6\.0/$(cut -d\ -f3 /etc/redhat-release)/g" /etc/yum.repos.d/CentOS-Vault.repo &&\ + yum install -y yum-utils &&\ + yum-config-manager --enable rhel-server-rhscl-7-rpms &&\ + yum -y install centos-release-scl-rh epel-release \ + http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\ + sed -i -e 's/#baseurl=/baseurl=/g' -e 's/mirror.centos.org/vault.centos.org/g' \ + -e 's/mirrorlist=/#mirrorlist=/g' /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo &&\ + yum clean all + # Install dependencies for developer tools, bindings,\ # documentation, actorcompiler, and packaging tools\ -RUN yum install -y yum-utils &&\ - yum-config-manager --enable rhel-server-rhscl-7-rpms &&\ - yum -y install centos-release-scl epel-release \ - http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\ - yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \ - devtoolset-8-gcc-8.3.1 devtoolset-8-gcc-c++-8.3.1 \ - devtoolset-8-libubsan-devel devtoolset-8-libasan-devel devtoolset-8-valgrind-devel \ - rh-python36-python-devel rh-ruby24 golang python27 rpm-build \ - mono-core debbuild python-pip dos2unix valgrind-devel ccache \ - distcc wget git lz4 lz4-devel lz4-static &&\ +RUN yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \ + devtoolset-8-gcc-8.3.1 devtoolset-8-gcc-c++-8.3.1 \ + devtoolset-8-libubsan-devel devtoolset-8-libasan-devel devtoolset-8-valgrind-devel \ + rh-python36-python-devel rh-ruby24 golang python27 rpm-build \ + mono-core debbuild python-pip dos2unix valgrind-devel ccache \ + distcc wget git lz4 lz4-devel lz4-static &&\ pip install boto3==1.1.1 USER root @@ -61,8 +71,8 @@ RUN cd /opt/ && curl -L https://github.com/facebook/rocksdb/archive/v6.10.1.tar. ARG TIMEZONEINFO=America/Los_Angeles RUN rm -f /etc/localtime && ln -s /usr/share/zoneinfo/${TIMEZONEINFO} /etc/localtime -LABEL version=0.1.19 -ENV DOCKER_IMAGEVER=0.1.19 +LABEL version=0.1.20 +ENV DOCKER_IMAGEVER=0.1.20 ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0 ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/g++ diff --git a/build/Dockerfile.devel b/build/Dockerfile.devel index 33c6d03e36..9de4c1245d 100644 --- a/build/Dockerfile.devel +++ b/build/Dockerfile.devel @@ -1,4 +1,4 @@ -FROM foundationdb/foundationdb-build:0.1.19 +FROM foundationdb/foundationdb-build:0.1.20 USER root @@ -50,8 +50,8 @@ RUN cp -iv /usr/local/bin/clang++ /usr/local/bin/clang++.deref &&\ ldconfig &&\ rm -rf /mnt/artifacts -LABEL version=0.11.10 -ENV DOCKER_IMAGEVER=0.11.10 +LABEL version=0.11.11 +ENV DOCKER_IMAGEVER=0.11.11 ENV CLANGCC=/usr/local/bin/clang.de8a65ef ENV CLANGCXX=/usr/local/bin/clang++.de8a65ef From ae50dc9dae7a55723b225c8b3aa24b13c461145c Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Thu, 3 Dec 2020 02:16:24 -0500 Subject: [PATCH 11/18] Updated docker compose version --- build/docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/docker-compose.yaml b/build/docker-compose.yaml index 98f0ac691e..048d468dab 100644 --- a/build/docker-compose.yaml +++ b/build/docker-compose.yaml @@ -2,7 +2,7 @@ version: "3" services: common: &common - image: foundationdb/foundationdb-build:0.1.19 + image: foundationdb/foundationdb-build:0.1.20 build-setup: &build-setup <<: *common From beccd6e3c2595cf5790ea24513cb28bddb3a32e5 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 3 Dec 2020 11:02:41 -0800 Subject: [PATCH 12/18] Do not reject blind writes, i.e., empty read conflict range --- fdbserver/MasterProxyServer.actor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index fcf9393ab2..0615ca26c4 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -567,13 +567,14 @@ ACTOR Future releaseResolvingAfter(ProxyCommitData* self, Future rel return Void(); } -// Try to identify recovery transaction and backup's apply mutations. +// Try to identify recovery transaction and backup's apply mutations (blind writes). // Both cannot be rejected and are approximated by looking at first mutation // starting with 0xff. bool canReject(const std::vector& trs) { for (const auto& tr : trs) { if (tr.transaction.mutations.empty()) continue; - if (tr.transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff"))) { + if (tr.transaction.mutations[0].param1.startsWith(LiteralStringRef("\xff")) || + tr.transaction.read_conflict_ranges.empty()) { return false; } } From e499e3ad70141b2dd6d1ce3cacc69f18f4e049f4 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 3 Dec 2020 15:06:33 -0800 Subject: [PATCH 13/18] Remove unneeded knob change for workloads --- fdbserver/workloads/AtomicRestore.actor.cpp | 4 ---- fdbserver/workloads/AtomicSwitchover.actor.cpp | 5 ----- fdbserver/workloads/BackupCorrectness.actor.cpp | 2 -- fdbserver/workloads/BackupToDBAbort.actor.cpp | 6 ------ fdbserver/workloads/BackupToDBCorrectness.actor.cpp | 5 ----- fdbserver/workloads/BackupToDBUpgrade.actor.cpp | 5 ----- 6 files changed, 27 deletions(-) diff --git a/fdbserver/workloads/AtomicRestore.actor.cpp b/fdbserver/workloads/AtomicRestore.actor.cpp index bcd1e78d25..4537970f7a 100644 --- a/fdbserver/workloads/AtomicRestore.actor.cpp +++ b/fdbserver/workloads/AtomicRestore.actor.cpp @@ -61,9 +61,6 @@ struct AtomicRestoreWorkload : TestWorkload { ACTOR static Future _start(Database cx, AtomicRestoreWorkload* self) { state FileBackupAgent backupAgent; - // Disable proxy rejection - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; - wait( delay(self->startAfter * deterministicRandom()->random01()) ); TraceEvent("AtomicRestore_Start"); @@ -108,7 +105,6 @@ struct AtomicRestoreWorkload : TestWorkload { } TraceEvent("AtomicRestore_Done"); - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; return Void(); } }; diff --git a/fdbserver/workloads/AtomicSwitchover.actor.cpp b/fdbserver/workloads/AtomicSwitchover.actor.cpp index dc36a82a86..02fcb77866 100644 --- a/fdbserver/workloads/AtomicSwitchover.actor.cpp +++ b/fdbserver/workloads/AtomicSwitchover.actor.cpp @@ -152,9 +152,6 @@ struct AtomicSwitchoverWorkload : TestWorkload { state DatabaseBackupAgent backupAgent(cx); state DatabaseBackupAgent restoreAgent(self->extraDB); - // Disable proxy rejection to avoid ApplyMutationsError - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; - TraceEvent("AS_Wait1"); wait(success( backupAgent.waitBackup(self->extraDB, BackupAgentBase::getDefaultTag(), false) )); TraceEvent("AS_Ready1"); @@ -180,8 +177,6 @@ struct AtomicSwitchoverWorkload : TestWorkload { g_simulator.drAgents = ISimulator::NoBackupAgents; } - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; - return Void(); } }; diff --git a/fdbserver/workloads/BackupCorrectness.actor.cpp b/fdbserver/workloads/BackupCorrectness.actor.cpp index bf318055c0..ddf7e879f3 100644 --- a/fdbserver/workloads/BackupCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupCorrectness.actor.cpp @@ -364,7 +364,6 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { state FileBackupAgent backupAgent; state Future extraBackup; state bool extraTasks = false; - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; TraceEvent("BARW_Arguments").detail("BackupTag", printable(self->backupTag)).detail("PerformRestore", self->performRestore) .detail("BackupAfter", self->backupAfter).detail("RestoreAfter", self->restoreAfter) .detail("AbortAndRestartAfter", self->abortAndRestartAfter).detail("DifferentialAfter", self->stopDifferentialAfter); @@ -662,7 +661,6 @@ struct BackupAndRestoreCorrectnessWorkload : TestWorkload { TraceEvent(SevError, "BackupAndRestoreCorrectness").error(e).GetLastError(); throw; } - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; return Void(); } }; diff --git a/fdbserver/workloads/BackupToDBAbort.actor.cpp b/fdbserver/workloads/BackupToDBAbort.actor.cpp index 60c73620b7..7d242360ec 100644 --- a/fdbserver/workloads/BackupToDBAbort.actor.cpp +++ b/fdbserver/workloads/BackupToDBAbort.actor.cpp @@ -22,8 +22,6 @@ #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/workloads/workloads.actor.h" -#include "fdbserver/Knobs.h" - #include "flow/actorcompiler.h" // This must be the last #include. struct BackupToDBAbort : TestWorkload { @@ -74,9 +72,6 @@ struct BackupToDBAbort : TestWorkload { ACTOR static Future _start(BackupToDBAbort* self, Database cx) { state DatabaseBackupAgent backupAgent(cx); - // Disable proxy rejection to avoid ApplyMutationsError - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; - TraceEvent("BDBA_Start").detail("Delay", self->abortDelay); wait(delay(self->abortDelay)); TraceEvent("BDBA_Wait"); @@ -94,7 +89,6 @@ struct BackupToDBAbort : TestWorkload { g_simulator.drAgents = ISimulator::NoBackupAgents; } - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; return Void(); } diff --git a/fdbserver/workloads/BackupToDBCorrectness.actor.cpp b/fdbserver/workloads/BackupToDBCorrectness.actor.cpp index 11fd9642d9..bb1ced25eb 100644 --- a/fdbserver/workloads/BackupToDBCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupToDBCorrectness.actor.cpp @@ -442,9 +442,6 @@ struct BackupToDBCorrectnessWorkload : TestWorkload { TraceEvent("BARW_Arguments").detail("BackupTag", printable(self->backupTag)).detail("BackupAfter", self->backupAfter) .detail("AbortAndRestartAfter", self->abortAndRestartAfter).detail("DifferentialAfter", self->stopDifferentialAfter); - // Disable proxy rejection to avoid ApplyMutationsError - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; - state UID randomID = nondeterministicRandom()->randomUniqueID(); // Increment the backup agent requets @@ -578,8 +575,6 @@ struct BackupToDBCorrectnessWorkload : TestWorkload { throw; } - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; - return Void(); } }; diff --git a/fdbserver/workloads/BackupToDBUpgrade.actor.cpp b/fdbserver/workloads/BackupToDBUpgrade.actor.cpp index 3a324e6b1f..3f2afcc717 100644 --- a/fdbserver/workloads/BackupToDBUpgrade.actor.cpp +++ b/fdbserver/workloads/BackupToDBUpgrade.actor.cpp @@ -346,9 +346,6 @@ struct BackupToDBUpgradeWorkload : TestWorkload { state UID logUid; state Version commitVersion; - // Disable proxy rejection to avoid ApplyMutationsError - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = false; - state Future stopDifferential = delay(self->stopDifferentialAfter); state Future waitUpgrade = backupAgent.waitUpgradeToLatestDrVersion(self->extraDB, self->backupTag); wait(success(stopDifferential) && success(waitUpgrade)); @@ -465,8 +462,6 @@ struct BackupToDBUpgradeWorkload : TestWorkload { throw; } - const_cast(SERVER_KNOBS)->PROXY_REJECT_BATCH_QUEUED_TOO_LONG = true; - return Void(); } }; From 5ad08782546be042d8d53511ba6e72df995d4dfd Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 4 Dec 2020 14:27:49 -0800 Subject: [PATCH 14/18] Update fdbserver/MasterProxyServer.actor.cpp Co-authored-by: A.J. Beamon --- fdbserver/MasterProxyServer.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 0615ca26c4..bce4b056d4 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -640,7 +640,7 @@ ACTOR Future commitBatch( /////// Phase 1: Pre-resolution processing (CPU bound except waiting for a version # which is separately pipelined and *should* be available by now (unless empty commit); ordered; currently atomic but could yield) TEST(self->latestLocalCommitBatchResolving.get() < localBatchNumber-1); // Queuing pre-resolution commit processing wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1)); - double queuingDelay = g_network->timer() - timeStart; + double queuingDelay = g_network->now() - timeStart; if ((queuingDelay > (double)SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS / SERVER_KNOBS->VERSIONS_PER_SECOND || (g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01))) && SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && canReject(trs)) { From bf59ab684e0cdf78304639f2e9489531566e0cf4 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 4 Dec 2020 16:35:46 -0800 Subject: [PATCH 15/18] Revert change to forceRecovery --- fdbserver/MasterProxyServer.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index bce4b056d4..04ffbec49c 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -859,6 +859,7 @@ ACTOR Future commitBatch( if(firstStateMutations) { ASSERT(committed[t] == ConflictBatch::TransactionCommitted); firstStateMutations = false; + forceRecovery = false; } } if (forceRecovery) { From 75836d6114bcc7ca67728b807b748ce6cccfac8f Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 7 Dec 2020 09:01:19 -0800 Subject: [PATCH 16/18] Fix double counting of range reads in TransactionMetrics --- fdbclient/NativeAPI.actor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 0be7548c76..da4c362378 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1902,7 +1902,6 @@ ACTOR Future> getRange( Database cx, ReferencetransactionPhysicalReads; - ++cx->transactionGetRangeRequests; state GetKeyValuesReply rep; try { if (CLIENT_BUGGIFY) { From 207a96ad5a3e315f854a6d1fec54b746b55ee0ef Mon Sep 17 00:00:00 2001 From: Russell Sears Date: Mon, 30 Nov 2020 16:18:30 -0800 Subject: [PATCH 17/18] Add boringssl and ctest2junit to docker; move environment-independent CCACHE flags from Dockerfile.devel to Dockerfile --- build/Dockerfile | 43 ++++++++++++++++++++++++++++++++++++--- build/Dockerfile.devel | 10 ++++----- build/docker-compose.yaml | 2 +- 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index b230eb4de4..4ced205435 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -21,7 +21,7 @@ RUN yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \ devtoolset-8-libubsan-devel devtoolset-8-libasan-devel devtoolset-8-valgrind-devel \ rh-python36-python-devel rh-ruby24 golang python27 rpm-build \ mono-core debbuild python-pip dos2unix valgrind-devel ccache \ - distcc wget git lz4 lz4-devel lz4-static &&\ + distcc wget libxslt git lz4 lz4-devel lz4-static &&\ pip install boto3==1.1.1 USER root @@ -51,6 +51,8 @@ RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.13.4/cmake-3.1 # install Ninja RUN cd /tmp && curl -L https://github.com/ninja-build/ninja/archive/v1.9.0.zip -o ninja.zip &&\ + echo "8e2e654a418373f10c22e4cc9bdbe9baeca8527ace8d572e0b421e9d9b85b7ef ninja.zip" > /tmp/ninja-sha.txt &&\ + sha256sum -c /tmp/ninja-sha.txt &&\ unzip ninja.zip && cd ninja-1.9.0 && scl enable devtoolset-8 -- ./configure.py --bootstrap && cp ninja /usr/bin &&\ cd .. && rm -rf ninja-1.9.0 ninja.zip @@ -67,13 +69,48 @@ RUN cd /opt/ && curl -L https://github.com/facebook/rocksdb/archive/v6.10.1.tar. echo "d573d2f15cdda883714f7e0bc87b814a8d4a53a82edde558f08f940e905541ee rocksdb.tar.gz" > rocksdb-sha.txt &&\ sha256sum -c rocksdb-sha.txt && tar xf rocksdb.tar.gz && rm -rf rocksdb.tar.gz rocksdb-sha.txt +RUN cd /opt/ && curl -L https://github.com/manticoresoftware/manticoresearch/raw/master/misc/junit/ctest2junit.xsl -o ctest2junit.xsl + +# Setting this environment variable switches from OpenSSL to BoringSSL +#ENV OPENSSL_ROOT_DIR=/opt/boringssl + +# install BoringSSL: TODO: They don't seem to have releases(?) I picked today's master SHA. +RUN cd /opt &&\ + git clone https://boringssl.googlesource.com/boringssl &&\ + cd boringssl &&\ + git checkout e796cc65025982ed1fb9ef41b3f74e8115092816 &&\ + mkdir build + +# ninja doesn't respect CXXFLAGS, and the boringssl CMakeLists doesn't expose an option to define __STDC_FORMAT_MACROS +# also, enable -fPIC. +# this is moderately uglier than creating a patchfile, but easier to maintain. +RUN cd /opt/boringssl &&\ + for f in crypto/fipsmodule/rand/fork_detect_test.cc \ + include/openssl/bn.h \ + ssl/test/bssl_shim.cc ; do \ + perl -p -i -e 's/#include /#define __STDC_FORMAT_MACROS 1\n#include /g;' $f ; \ + done &&\ + perl -p -i -e 's/-Werror/-Werror -fPIC/' CMakeLists.txt &&\ + git diff + +RUN cd /opt/boringssl/build &&\ + scl enable devtoolset-8 rh-python36 rh-ruby24 -- cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. &&\ + scl enable devtoolset-8 rh-python36 rh-ruby24 -- ninja &&\ + ./ssl/ssl_test &&\ + mkdir -p ../lib && cp crypto/libcrypto.a ssl/libssl.a ../lib + # Localize time zone ARG TIMEZONEINFO=America/Los_Angeles RUN rm -f /etc/localtime && ln -s /usr/share/zoneinfo/${TIMEZONEINFO} /etc/localtime -LABEL version=0.1.20 -ENV DOCKER_IMAGEVER=0.1.20 +LABEL version=0.1.21 +ENV DOCKER_IMAGEVER=0.1.21 ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0 ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/g++ + +ENV CCACHE_NOHASHDIR=true +ENV CCACHE_UMASK=0000 +ENV CCACHE_SLOPPINESS="file_macro,time_macros,include_file_mtime,include_file_ctime,file_stat_matches" + CMD scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash diff --git a/build/Dockerfile.devel b/build/Dockerfile.devel index 9de4c1245d..884eae7000 100644 --- a/build/Dockerfile.devel +++ b/build/Dockerfile.devel @@ -1,4 +1,5 @@ -FROM foundationdb/foundationdb-build:0.1.20 +ARG IMAGE_TAG=0.1.21 +FROM foundationdb/foundationdb-build:${IMAGE_TAG} USER root @@ -50,8 +51,8 @@ RUN cp -iv /usr/local/bin/clang++ /usr/local/bin/clang++.deref &&\ ldconfig &&\ rm -rf /mnt/artifacts -LABEL version=0.11.11 -ENV DOCKER_IMAGEVER=0.11.11 +LABEL version=0.11.12 +ENV DOCKER_IMAGEVER=0.11.12 ENV CLANGCC=/usr/local/bin/clang.de8a65ef ENV CLANGCXX=/usr/local/bin/clang++.de8a65ef @@ -63,8 +64,5 @@ ENV CC=/usr/local/bin/clang.de8a65ef ENV CXX=/usr/local/bin/clang++.de8a65ef ENV USE_LD=LLD ENV USE_LIBCXX=1 -ENV CCACHE_NOHASHDIR=true -ENV CCACHE_UMASK=0000 -ENV CCACHE_SLOPPINESS="file_macro,time_macros,include_file_mtime,include_file_ctime,file_stat_matches" CMD scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash diff --git a/build/docker-compose.yaml b/build/docker-compose.yaml index 048d468dab..ad1984b8a3 100644 --- a/build/docker-compose.yaml +++ b/build/docker-compose.yaml @@ -2,7 +2,7 @@ version: "3" services: common: &common - image: foundationdb/foundationdb-build:0.1.20 + image: foundationdb/foundationdb-build:0.1.21 build-setup: &build-setup <<: *common From c68b62f89b3473793802408cdd447e17e1e212f2 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 8 Dec 2020 16:57:59 -0800 Subject: [PATCH 18/18] Update txnCommitOut --- fdbserver/MasterProxyServer.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 04ffbec49c..cc5f9a2e86 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -661,6 +661,7 @@ ACTOR Future commitBatch( tr.reply.sendError(transaction_too_old()); } ++self->stats.commitBatchOut; + self->stats.txnCommitOut += trs.size(); self->stats.txnConflicts += trs.size(); return Void(); }