From 6881a8fc6958cbbca14e2af96fbe5ce8c9d86c26 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 2 Apr 2019 15:19:44 -0700 Subject: [PATCH 1/3] Added more release notes for 6.1.0 --- documentation/sphinx/source/release-notes.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index fb0fc7d2b1..f209b4c269 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -19,7 +19,6 @@ Features * Backup ``status`` and ``describe`` commands now have a ``--json`` output option. `(PR #1248) `_ * Separated data distribution from the master into its own role. `(PR #1062) `_ * Separated ratekeeper from the master into its own role. `(PR #1176) `_ - * Added a ``CompareAndClear`` atomic op that clears a key if its value matches the supplied value. `(PR #1105) `_ * Added support for IPv6. `(PR #1176) `_ * FDB can now simultaneously listen to TLS and unencrypted ports to facilitate smoother migration to and from TLS. `(PR #1157) `_ @@ -29,7 +28,7 @@ Features * Deprecated transaction option ``TRANSACTION_LOGGING_ENABLE``. Added two new transaction options ``DEBUG_TRANSACTION_IDENTIFIER`` and ``LOG_TRANSACTION`` that sets an identifier for the transaction and logs the transaction to the trace file respectively. `(PR #1200) `_ * Clients can now specify default transaction timeouts and retry limits for all transactions through a database option. `(Issue #775) `_ * The "timeout", "max retry delay", and "retry limit" transaction options are no longer reset when the transaction is reset after a call to ``onError`` (as of API version 610). `(Issue #775) `_ -* Added the ``force_recovery_with_data_loss`` command to fdbcli. When a cluster is configured with usable_regions=2, this command will force the database to recover in the remote region. 
`(PR #1168) `_ +* Added the ``force_recovery_with_data_loss`` command to ``fdbcli``. When a cluster is configured with usable_regions=2, this command will force the database to recover in the remote region. `(PR #1168) `_ * Added a limit to the number of status requests the cluster controller will handle. `(PR #1093) `_ (submitted by tclinken) * Added a ``coordinator`` process class. Processes with this class can only be used as a coordinator, and ``coordinators auto`` will prefer to choose processes of this class. `(PR #1069) `_ (submitted by tclinken) * The ``consistencycheck`` fdbserver role will check the entire database at most once every week. `(PR #1126) `_ @@ -39,7 +38,8 @@ Features * The ``memory`` storage engine configuration now uses the ssd engine for transaction log spilling. Transaction log spilling only happens when the transaction logs are using too much memory, so using the memory storage engine for this purpose can cause the process to run out of memory. Existing clusters will NOT automatically change their configuration. `(PR #1314) `_ * Trace logs can be output as JSON instead of XML using the ``--trace_format`` command line option. `(PR #976) `_ (by atn34) * Added ``modify`` command to fdbbackup for modifying parameters of a running backup. `(PR #1237) `_ -* Added 'header' parameter to blobstore backup URLs for setting custom HTTP headers. `(PR #1237) `_ +* Added ``header`` parameter to blobstore backup URLs for setting custom HTTP headers. `(PR #1237) `_ +* Added the ``maintenance`` command to ``fdbcli``. This command will stop data distribution from moving data away from processes with a specified zoneID. `(PR #1397) `_ Performance ----------- @@ -64,6 +64,8 @@ Fixes ``1.0.0.10``). `(PR #1121) `_ * Restore could crash when reading a file that ends on a block boundary (1MB default). `(PR #1205) `_ * Java: Successful commits and range reads no longer create ``FDBException`` objects, which avoids wasting resources and reduces memory pressure. 
`(Issue #1235) `_ +* Windows: Fixed a crash when deleting files. `(Issue #1380) `_ (by KrzysFR) +* Starting a restore on a tag already in use would hang and the process would eventually run out of memory. `(PR #1394) `_ Status ------ From 4d6334eb906a11527346ed9a18e74e19ddc3a63c Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 2 Apr 2019 15:23:55 -0700 Subject: [PATCH 2/3] removed a known limitation about force_recovery_with_data_loss --- documentation/sphinx/source/configuration.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/documentation/sphinx/source/configuration.rst b/documentation/sphinx/source/configuration.rst index f5298b3b76..d226bc60fb 100644 --- a/documentation/sphinx/source/configuration.rst +++ b/documentation/sphinx/source/configuration.rst @@ -688,8 +688,6 @@ To drop the dead datacenter do the follow steps: If you are running in a configuration without a satellite datacenter, or you have lost all machines in a region simultaneously, the ``force_recovery_with_data_loss`` command from ``fdbcli`` allows you to force a recovery to the other region. This will discard the portion of the mutation log which did not make it across the WAN. Once the database has recovered, immediately follow the previous steps to drop the dead region the normal way. -.. warning:: In 6.0 the ``force_recovery_with_data_loss`` command from ``fdbcli`` can cause data inconsistencies if it is used when processes from both non-satellite datacenters are still in the cluster. In general this command has not be tested to same degree as the rest of the codebase, and should only be used in extreme emergencies. - Region change safety -------------------- @@ -743,8 +741,6 @@ The 6.0 release still has a number of rough edges related to region configuratio * While a datacenter has failed, the maximum write throughput of the cluster will be roughly 1/3 of normal performance. 
- * ``force_recovery_with_data_loss`` can cause data inconsistencies if it is used when processes from both non-satellite datacenters are still in the cluster. - .. _guidelines-process-class-config: Guidelines for setting process class From 31ed73d9f5033c6bd3f49715bfab7a804a318385 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 2 Apr 2019 15:27:37 -0700 Subject: [PATCH 3/3] Ported the bug fix https://github.com/apple/foundationdb/pull/1379 to OldTLogServer_6_0 --- fdbserver/OldTLogServer_6_0.actor.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 200583a5f2..6ac0f68c4a 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1103,7 +1103,7 @@ ACTOR Future watchDegraded(TLogData* self) { return Void(); } -ACTOR Future doQueueCommit( TLogData* self, Reference logData ) { +ACTOR Future doQueueCommit( TLogData* self, Reference logData, std::vector> missingFinalCommit ) { state Version ver = logData->version.get(); state Version commitNumber = self->queueCommitBegin+1; state Version knownCommittedVersion = logData->knownCommittedVersion; @@ -1144,6 +1144,11 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData ) { logData->queueCommittedVersion.set(ver); self->queueCommitEnd.set(commitNumber); + for(auto& it : missingFinalCommit) { + TraceEvent("TLogCommitMissingFinalCommit", self->dbgid).detail("LogId", logData->logId).detail("Version", it->version.get()).detail("QueueVer", it->queueCommittedVersion.get()); + TEST(true); //A TLog was replaced before having a chance to commit its queue + it->queueCommittedVersion.set(it->version.get()); + } return Void(); } @@ -1152,11 +1157,14 @@ ACTOR Future commitQueue( TLogData* self ) { loop { int foundCount = 0; + state std::vector> missingFinalCommit; for(auto it : self->id_data) { if(!it.second->stopped) { logData = it.second; foundCount++; - } + } 
else if(it.second->version.get() > std::max(it.second->queueCommittingVersion, it.second->queueCommittedVersion.get())) { + missingFinalCommit.push_back(it.second); + } } ASSERT(foundCount < 2); @@ -1181,7 +1189,8 @@ ACTOR Future commitQueue( TLogData* self ) { while( self->queueCommitBegin != self->queueCommitEnd.get() && !self->largeDiskQueueCommitBytes.get() ) { wait( self->queueCommitEnd.whenAtLeast(self->queueCommitBegin) || self->largeDiskQueueCommitBytes.onChange() ); } - self->sharedActors.send(doQueueCommit(self, logData)); + self->sharedActors.send(doQueueCommit(self, logData, missingFinalCommit)); + missingFinalCommit.clear(); } when(wait(self->newLogData.onTrigger())) {} }