From d1e1fea42dee01aefda8410d3a4030f3c1acfbdb Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 28 Feb 2020 09:35:21 -0800 Subject: [PATCH 01/16] Our binaries that act like clients (fdbcli, backup and DR binaries) were reporting an unknown client version. Clients did not react if the list of supported versions changed. --- fdbclient/ClusterInterface.h | 1 + fdbclient/MonitorLeader.actor.cpp | 6 +++--- fdbclient/MonitorLeader.h | 2 +- fdbclient/NativeAPI.actor.cpp | 22 +++++++++++++++++++--- fdbclient/NativeAPI.actor.h | 9 ++------- flow/genericactors.actor.h | 30 ++++++++++++++++++++++++++++++ 6 files changed, 56 insertions(+), 14 deletions(-) diff --git a/fdbclient/ClusterInterface.h b/fdbclient/ClusterInterface.h index b0724e2b57..8e2839cfbb 100644 --- a/fdbclient/ClusterInterface.h +++ b/fdbclient/ClusterInterface.h @@ -93,6 +93,7 @@ struct ClientVersionRef { } ClientVersionRef(Arena &arena, ClientVersionRef const& cv) : clientVersion(arena, cv.clientVersion), sourceVersion(arena, cv.sourceVersion), protocolVersion(arena, cv.protocolVersion) {} + ClientVersionRef(StringRef clientVersion, StringRef sourceVersion, StringRef protocolVersion) : clientVersion(clientVersion), sourceVersion(sourceVersion), protocolVersion(protocolVersion) {} ClientVersionRef(std::string versionString) { size_t index = versionString.find(","); if(index == versionString.npos) { diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index a7364a77b0..6c0e5a0c65 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -691,7 +691,7 @@ void shrinkProxyList( ClientDBInfo& ni, std::vector& lastProxyUIDs, std::ve } // Leader is the process that will be elected by coordinators as the cluster controller -ACTOR Future monitorProxiesOneGeneration( Reference connFile, Reference> clientInfo, MonitorLeaderInfo info, Standalone> supportedVersions, Key traceLogGroup) { +ACTOR Future monitorProxiesOneGeneration( Reference connFile, 
Reference> clientInfo, MonitorLeaderInfo info, Reference>>> supportedVersions, Key traceLogGroup) { state ClusterConnectionString cs = info.intermediateConnFile->getConnectionString(); state vector addrs = cs.coordinators(); state int idx = 0; @@ -707,7 +707,7 @@ ACTOR Future monitorProxiesOneGeneration( Referenceget().id; - req.supportedVersions = supportedVersions; + req.supportedVersions = supportedVersions->get(); req.traceLogGroup = traceLogGroup; ClusterConnectionString fileConnectionString; @@ -760,7 +760,7 @@ ACTOR Future monitorProxiesOneGeneration( Reference monitorProxies( Reference>> connFile, Reference> clientInfo, Standalone> supportedVersions, Key traceLogGroup ) { +ACTOR Future monitorProxies( Reference>> connFile, Reference> clientInfo, Reference>>> supportedVersions, Key traceLogGroup ) { state MonitorLeaderInfo info(connFile->get()); loop { choose { diff --git a/fdbclient/MonitorLeader.h b/fdbclient/MonitorLeader.h index 0eae5151f7..3843847de7 100644 --- a/fdbclient/MonitorLeader.h +++ b/fdbclient/MonitorLeader.h @@ -57,7 +57,7 @@ Future monitorLeader( Reference const& connFile, Re Future monitorLeaderForProxies( Value const& key, vector const& coordinators, ClientData* const& clientData ); -Future monitorProxies( Reference>> const& connFile, Reference> const& clientInfo, Standalone> const& supportedVersions, Key const& traceLogGroup ); +Future monitorProxies( Reference>> const& connFile, Reference> const& clientInfo, Reference>>> const& supportedVersions, Key const& traceLogGroup ); void shrinkProxyList( ClientDBInfo& ni, std::vector& lastProxyUIDs, std::vector& lastProxies ); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index e30390cf22..b1a51ae009 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -78,6 +78,21 @@ static void initTLSPolicy() { #endif } +// The default values, TRACE_DEFAULT_ROLL_SIZE and TRACE_DEFAULT_MAX_LOGS_SIZE are located in Trace.h. 
+NetworkOptions::NetworkOptions() + : localAddress(""), clusterFile(""), traceDirectory(Optional()), + traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"), + traceFormat("xml"), slowTaskProfilingEnabled(false) { + + Standalone> defaultSupportedVersions; + + StringRef sourceVersion = StringRef((const uint8_t*)getHGVersion(), strlen(getHGVersion())); + std::string protocolVersionString = format("%llx", currentProtocolVersion.version()); + defaultSupportedVersions.push_back_deep(defaultSupportedVersions.arena(), ClientVersionRef(LiteralStringRef(FDB_VT_VERSION), sourceVersion, protocolVersionString)); + + supportedVersions = ReferencedObject>>::from(defaultSupportedVersions); +} + static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/"); static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/"); @@ -960,18 +975,19 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional valu ASSERT(g_network); ASSERT(value.present()); - networkOptions.supportedVersions.resize(networkOptions.supportedVersions.arena(), 0); + Standalone> supportedVersions; std::string versionString = value.get().toString(); size_t index = 0; size_t nextIndex = 0; while(nextIndex != versionString.npos) { nextIndex = versionString.find(';', index); - networkOptions.supportedVersions.push_back_deep(networkOptions.supportedVersions.arena(), ClientVersionRef(versionString.substr(index, nextIndex-index))); + supportedVersions.push_back_deep(supportedVersions.arena(), ClientVersionRef(versionString.substr(index, nextIndex-index))); index = nextIndex + 1; } - ASSERT(networkOptions.supportedVersions.size() > 0); + ASSERT(supportedVersions.size() > 0); + networkOptions.supportedVersions->set(supportedVersions); break; } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index f02be1b5b3..50f6591386 100644 --- a/fdbclient/NativeAPI.actor.h +++ 
b/fdbclient/NativeAPI.actor.h @@ -25,7 +25,6 @@ #elif !defined(FDBCLIENT_NATIVEAPI_ACTOR_H) #define FDBCLIENT_NATIVEAPI_ACTOR_H - #include "flow/flow.h" #include "flow/TDMetric.actor.h" #include "fdbclient/FDBTypes.h" @@ -59,14 +58,10 @@ struct NetworkOptions { std::string traceLogGroup; std::string traceFormat; Optional logClientInfo; - Standalone> supportedVersions; + Reference>>> supportedVersions; bool slowTaskProfilingEnabled; - // The default values, TRACE_DEFAULT_ROLL_SIZE and TRACE_DEFAULT_MAX_LOGS_SIZE are located in Trace.h. - NetworkOptions() - : localAddress(""), clusterFile(""), traceDirectory(Optional()), - traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"), - traceFormat("xml"), slowTaskProfilingEnabled(false) {} + NetworkOptions(); }; class Database { diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 7365220e97..e2f9eda32f 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -647,6 +647,36 @@ protected: } }; +template +class ReferencedObject : NonCopyable, public ReferenceCounted> { + public: + ReferencedObject() : value() {} + ReferencedObject(V const& v) : value(v) {} + ReferencedObject(ReferencedObject&& r) : value(std::move(r.value)) {} + void operator=(ReferencedObject&& r) { + value = std::move(r.value); + } + + V const& get() const { + return value; + } + + V& mutate() const { + return value; + } + + void set(V const& v) { + value = v; + } + + static Reference> from(V v) { + return Reference>(new ReferencedObject(v)); + } + + private: + V value; +}; + template class AsyncVar : NonCopyable, public ReferenceCounted> { public: From b0062f58d30de640368aca3c725317580799431a Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 28 Feb 2020 15:44:22 -0800 Subject: [PATCH 02/16] fix: blobstore needs to handshake tls connections --- fdbclient/BlobStore.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbclient/BlobStore.actor.cpp 
b/fdbclient/BlobStore.actor.cpp index 6e121491bb..4cba0297f0 100644 --- a/fdbclient/BlobStore.actor.cpp +++ b/fdbclient/BlobStore.actor.cpp @@ -507,6 +507,7 @@ ACTOR Future connect_impl(Referenceknobs.secure_connection ? "https" : "http"; state Reference conn = wait(INetworkConnections::net()->connect(b->host, service, b->knobs.secure_connection ? true : false)); + wait(conn->connectHandshake()); TraceEvent("BlobStoreEndpointNewConnection").suppressFor(60) .detail("RemoteEndpoint", conn->getPeerAddress()) From 9862aa8bed51640a9028e7216e86abf52fbc23e2 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 4 Mar 2020 11:15:32 -0800 Subject: [PATCH 03/16] Add support for setting knobs in fdbcli --- fdbcli/fdbcli.actor.cpp | 50 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index cf76fe7ee4..0697fe821d 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -69,7 +69,8 @@ enum { OPT_NO_STATUS, OPT_STATUS_FROM_JSON, OPT_VERSION, - OPT_TRACE_FORMAT + OPT_TRACE_FORMAT, + OPT_KNOB }; CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, @@ -87,12 +88,13 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, { OPT_VERSION, "--version", SO_NONE }, { OPT_VERSION, "-v", SO_NONE }, { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_KNOB, "--knob_", SO_REQ_SEP }, #ifndef TLS_DISABLED TLS_OPTION_FLAGS #endif - SO_END_OF_OPTIONS }; + SO_END_OF_OPTIONS }; void printAtCol(const char* text, int col) { const char* iter = text; @@ -423,6 +425,8 @@ static void printProgramUsage(const char* name) { #ifndef TLS_DISABLED TLS_HELP #endif + " --knob_KNOBNAME KNOBVALUE\n" + " Changes a knob option. 
KNOBNAME should be lowercase.\n" " -v, --version Print FoundationDB CLI version information and exit.\n" " -h, --help Display this help and exit.\n"); } @@ -2444,6 +2448,8 @@ struct CLIOptions { std::string tlsCAPath; std::string tlsPassword; + std::vector> knobs; + CLIOptions( int argc, char* argv[] ) : trace(false), exit_timeout(0), @@ -2467,9 +2473,37 @@ struct CLIOptions { } if (exit_timeout && !exec.present()) { fprintf(stderr, "ERROR: --timeout may only be specified with --exec\n"); - exit_code = 1; + exit_code = FDB_EXIT_ERROR; return; } + + delete FLOW_KNOBS; + FlowKnobs* flowKnobs = new FlowKnobs(true); + FLOW_KNOBS = flowKnobs; + + delete CLIENT_KNOBS; + ClientKnobs* clientKnobs = new ClientKnobs(true); + CLIENT_KNOBS = clientKnobs; + + for(auto k=knobs.begin(); k!=knobs.end(); ++k) { + try { + if (!flowKnobs->setKnob( k->first, k->second ) && + !clientKnobs->setKnob( k->first, k->second )) + { + fprintf(stderr, "ERROR: Unrecognized knob option '%s'\n", k->first.c_str()); + exit_code = FDB_EXIT_ERROR; + } + } catch (Error& e) { + if (e.code() == error_code_invalid_option_value) { + fprintf(stderr, "ERROR: Invalid value '%s' for knob option '%s'\n", k->second.c_str(), k->first.c_str()); + exit_code = FDB_EXIT_ERROR; + } + else { + fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", k->first.c_str(), e.what()); + exit_code = FDB_EXIT_ERROR; + } + } + } } int processArg(CSimpleOpt& args) { @@ -2536,6 +2570,16 @@ struct CLIOptions { } traceFormat = args.OptionArg(); break; + case OPT_KNOB: { + std::string syn = args.OptionSyntax(); + if (!StringRef(syn).startsWith(LiteralStringRef("--knob_"))) { + fprintf(stderr, "ERROR: unable to parse knob option '%s'\n", syn.c_str()); + return FDB_EXIT_ERROR; + } + syn = syn.substr(7); + knobs.push_back( std::make_pair( syn, args.OptionArg() ) ); + break; + } case OPT_VERSION: printVersion(); return FDB_EXIT_SUCCESS; From 181ca3fce07e0dd72455185bd26d2ec6e2bc7330 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 4 Mar 2020 11:18:00 -0800 Subject: [PATCH 04/16] Add release note. --- documentation/sphinx/source/release-notes.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 4bb82d8dcd..73250b270b 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -2,6 +2,14 @@ Release Notes ############# +6.2.18 +====== + +Features +-------- + +* Add support for setting knobs in fdbcli. `(PR #2773) <https://github.com/apple/foundationdb/pull/2773>`_. + 6.2.17 ====== From 6296465e079cb172b9cfb2c464f98588b135fc06 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 4 Mar 2020 14:07:32 -0800 Subject: [PATCH 05/16] Make the DD priority associated with populating a remote region lower than machine failures --- fdbserver/DataDistribution.actor.cpp | 9 ++++++--- fdbserver/DataDistributionQueue.actor.cpp | 10 ++++++---- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/Status.actor.cpp | 15 ++++++++------- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 3d1e8ecdfc..ef16359a74 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -2870,7 +2870,9 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea lastWrongConfiguration = anyWrongConfiguration; state int lastPriority = team->getPriority(); - if( serversLeft < self->configuration.storageTeamSize ) { + if(team->size() == 0) { + team->setPriority( SERVER_KNOBS->PRIORITY_POPULATE_REGION ); + } else if( serversLeft < self->configuration.storageTeamSize ) { if( serversLeft == 0 ) team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_0_LEFT ); else if( serversLeft == 1 ) @@ -2887,10 +2889,11 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY ); } } - else if( anyUndesired ) + else if( 
anyUndesired ) { team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER ); - else + } else { team->setPriority( SERVER_KNOBS->PRIORITY_TEAM_HEALTHY ); + } if(lastPriority != team->getPriority()) { self->priority_teams[lastPriority]--; diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index c534cda824..43626ea426 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -57,7 +57,8 @@ struct RelocateData { rs.priority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT), interval("QueuedRelocation") {} static bool isHealthPriority(int priority) { - return priority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || + return priority == SERVER_KNOBS->PRIORITY_POPULATE_REGION || + priority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || priority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT || priority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || priority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT || @@ -394,7 +395,7 @@ struct DDQueueData { // ensure a team remover will not start before the previous one finishes removing a team and move away data // NOTE: split and merge shard have higher priority. If they have to wait for unhealthyRelocations = 0, // deadlock may happen: split/merge shard waits for unhealthyRelocations, while blocks team_redundant. 
- if (healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT || + if (healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) { unhealthyRelocations++; rawProcessingUnhealthy->set(true); @@ -402,7 +403,7 @@ struct DDQueueData { priority_relocations[priority]++; } void finishRelocation(int priority, int healthPriority) { - if (healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT || + if (healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT || healthPriority == SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) { unhealthyRelocations--; ASSERT(unhealthyRelocations >= 0); @@ -927,7 +928,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd while( tciIndex < self->teamCollections.size() ) { double inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_HEALTHY; if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_UNHEALTHY; - if(rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT; + if(rd.healthPriority == SERVER_KNOBS->PRIORITY_POPULATE_REGION || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || rd.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT) inflightPenalty = 
SERVER_KNOBS->INFLIGHT_PENALTY_ONE_LEFT; auto req = GetTeamRequest(rd.wantsNewServers, rd.priority == SERVER_KNOBS->PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, true, false, inflightPenalty); req.completeSources = rd.completeSources; @@ -1497,6 +1498,7 @@ ACTOR Future dataDistributionQueue( .detail( "PriorityTeamContainsUndesiredServer", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER] ) .detail( "PriorityTeamRedundant", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT] ) .detail( "PriorityMergeShard", self.priority_relocations[SERVER_KNOBS->PRIORITY_MERGE_SHARD] ) + .detail( "PriorityPopulateRegion", self.priority_relocations[SERVER_KNOBS->PRIORITY_POPULATE_REGION] ) .detail( "PriorityTeamUnhealthy", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY] ) .detail( "PriorityTeam2Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_2_LEFT] ) .detail( "PriorityTeam1Left", self.priority_relocations[SERVER_KNOBS->PRIORITY_TEAM_1_LEFT] ) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 2ce0aac021..04118e7e75 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -112,6 +112,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula init( PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER, 150 ); init( PRIORITY_TEAM_REDUNDANT, 200 ); init( PRIORITY_MERGE_SHARD, 340 ); + init( PRIORITY_POPULATE_REGION, 600 ); init( PRIORITY_TEAM_UNHEALTHY, 700 ); init( PRIORITY_TEAM_2_LEFT, 709 ); init( PRIORITY_TEAM_1_LEFT, 800 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index c5c41fc58f..06fb5af3a0 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -117,6 +117,7 @@ public: int PRIORITY_TEAM_CONTAINS_UNDESIRED_SERVER; int PRIORITY_TEAM_REDUNDANT; int PRIORITY_MERGE_SHARD; + int PRIORITY_POPULATE_REGION; int PRIORITY_TEAM_UNHEALTHY; int PRIORITY_TEAM_2_LEFT; int PRIORITY_TEAM_1_LEFT; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 
835848ce0f..d1cab6de22 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1430,29 +1430,30 @@ ACTOR static Future dataStatusFetcher(WorkerDetails ddWorker, stateSectionObj["description"] = "No replicas remain of some data"; stateSectionObj["min_replicas_remaining"] = 0; replicas = 0; - } - else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_1_LEFT) { + } else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_1_LEFT) { stateSectionObj["healthy"] = false; stateSectionObj["name"] = "healing"; stateSectionObj["description"] = "Only one replica remains of some data"; stateSectionObj["min_replicas_remaining"] = 1; replicas = 1; - } - else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) { + } else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_2_LEFT) { stateSectionObj["healthy"] = false; stateSectionObj["name"] = "healing"; stateSectionObj["description"] = "Only two replicas remain of some data"; stateSectionObj["min_replicas_remaining"] = 2; replicas = 2; - } - else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY) { + } else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_UNHEALTHY) { stateSectionObj["healthy"] = false; stateSectionObj["name"] = "healing"; stateSectionObj["description"] = "Restoring replication factor"; + } else if (highestPriority >= SERVER_KNOBS->PRIORITY_POPULATE_REGION) { + stateSectionObj["healthy"] = true; + stateSectionObj["name"] = "healthy_populating_region"; + stateSectionObj["description"] = "Populating remote region"; } else if (highestPriority >= SERVER_KNOBS->PRIORITY_MERGE_SHARD) { stateSectionObj["healthy"] = true; stateSectionObj["name"] = "healthy_repartitioning"; - stateSectionObj["description"] = "Repartitioning."; + stateSectionObj["description"] = "Repartitioning"; } else if (highestPriority >= SERVER_KNOBS->PRIORITY_TEAM_REDUNDANT) { stateSectionObj["healthy"] = true; stateSectionObj["name"] = "optimizing_team_collections"; From 125bd131987f96adc3f9a821b4b052800e98f924 
Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 4 Mar 2020 14:17:17 -0800 Subject: [PATCH 06/16] fix: in multi-region configurations, the data distribution queue could start too much work, expecting that the remote region would contribute to the read workload --- fdbserver/DataDistribution.actor.cpp | 2 +- fdbserver/DataDistribution.actor.h | 1 + fdbserver/DataDistributionQueue.actor.cpp | 30 +++++++++++------------ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 3d1e8ecdfc..6078565d3f 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4279,7 +4279,7 @@ ACTOR Future dataDistribution(Reference self) actors.push_back( pollMoveKeysLock(cx, lock) ); actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, self->ddId ), "DDTracker", self->ddId, &normalDDQueueErrors() ) ); - actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, &lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); + actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, self->ddId, storageTeamSize, configuration.storageTeamSize, &lastLimited ), "DDQueue", self->ddId, &normalDDQueueErrors() ) ); vector teamCollectionsPtrs; Reference primaryTeamCollection( new DDTeamCollection(cx, self->ddId, lock, output, shardsAffectedByTeamFailure, configuration, primaryDcId, configuration.usableRegions > 1 ? 
remoteDcIds : std::vector>(), readyToStart.getFuture(), zeroHealthyTeams[0], true, processingUnhealthy) ); diff --git a/fdbserver/DataDistribution.actor.h b/fdbserver/DataDistribution.actor.h index 005c1c56c2..c52c953e20 100644 --- a/fdbserver/DataDistribution.actor.h +++ b/fdbserver/DataDistribution.actor.h @@ -204,6 +204,7 @@ Future dataDistributionQueue( PromiseStream> const& getAverageShardBytes, UID const& distributorId, int const& teamSize, + int const& singleRegionTeamSize, double* const& lastLimited); //Holds the permitted size and IO Bounds for a shard diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index c534cda824..8540bde61d 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -285,29 +285,27 @@ struct Busyness { }; // find the "workFactor" for this, were it launched now -int getWorkFactor( RelocateData const& relocation ) { - // Avoid the divide by 0! - ASSERT( relocation.src.size() ); - +int getWorkFactor( RelocateData const& relocation, int singleRegionTeamSize ) { if( relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_1_LEFT || relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_0_LEFT ) return WORK_FULL_UTILIZATION / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER; else if( relocation.healthPriority == SERVER_KNOBS->PRIORITY_TEAM_2_LEFT ) return WORK_FULL_UTILIZATION / 2 / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER; else // for now we assume that any message at a lower priority can best be assumed to have a full team left for work - return WORK_FULL_UTILIZATION / relocation.src.size() / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER; + return WORK_FULL_UTILIZATION / singleRegionTeamSize / SERVER_KNOBS->RELOCATION_PARALLELISM_PER_SOURCE_SERVER; } // Data movement's resource control: Do not overload source servers used for the RelocateData // return true if servers are not too busy to launch the relocation 
-bool canLaunch( RelocateData & relocation, int teamSize, std::map & busymap, +bool canLaunch( RelocateData & relocation, int teamSize, int singleRegionTeamSize, std::map & busymap, std::vector cancellableRelocations ) { // assert this has not already been launched ASSERT( relocation.workFactor == 0 ); ASSERT( relocation.src.size() != 0 ); + ASSERT( teamSize >= singleRegionTeamSize ); // find the "workFactor" for this, were it launched now - int workFactor = getWorkFactor( relocation ); - int neededServers = std::max( 1, (int)relocation.src.size() - teamSize + 1 ); + int workFactor = getWorkFactor( relocation, singleRegionTeamSize ); + int neededServers = std::min( relocation.src.size(), teamSize - singleRegionTeamSize + 1 ); // see if each of the SS can launch this task for( int i = 0; i < relocation.src.size(); i++ ) { // For each source server for this relocation, copy and modify its busyness to reflect work that WOULD be cancelled @@ -328,9 +326,9 @@ bool canLaunch( RelocateData & relocation, int teamSize, std::map } // update busyness for each server -void launch( RelocateData & relocation, std::map & busymap ) { +void launch( RelocateData & relocation, std::map & busymap, int singleRegionTeamSize ) { // if we are here this means that we can launch and should adjust all the work the servers can do - relocation.workFactor = getWorkFactor( relocation ); + relocation.workFactor = getWorkFactor( relocation, singleRegionTeamSize ); for( int i = 0; i < relocation.src.size(); i++ ) busymap[ relocation.src[i] ].addWork( relocation.priority, relocation.workFactor ); } @@ -359,6 +357,7 @@ struct DDQueueData { int queuedRelocations; int64_t bytesWritten; int teamSize; + int singleRegionTeamSize; std::map busymap; @@ -415,10 +414,10 @@ struct DDQueueData { DDQueueData( UID mid, MoveKeysLock lock, Database cx, std::vector teamCollections, Reference sABTF, PromiseStream> getAverageShardBytes, - int teamSize, PromiseStream output, FutureStream input, PromiseStream 
getShardMetrics, double* lastLimited ) : + int teamSize, int singleRegionTeamSize, PromiseStream output, FutureStream input, PromiseStream getShardMetrics, double* lastLimited ) : activeRelocations( 0 ), queuedRelocations( 0 ), bytesWritten ( 0 ), teamCollections( teamCollections ), shardsAffectedByTeamFailure( sABTF ), getAverageShardBytes( getAverageShardBytes ), distributorId( mid ), lock( lock ), - cx( cx ), teamSize( teamSize ), output( output ), input( input ), getShardMetrics( getShardMetrics ), startMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ), + cx( cx ), teamSize( teamSize ), singleRegionTeamSize( singleRegionTeamSize ), output( output ), input( input ), getShardMetrics( getShardMetrics ), startMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ), finishMoveKeysParallelismLock( SERVER_KNOBS->DD_MOVE_KEYS_PARALLELISM ), lastLimited(lastLimited), suppressIntervals(0), lastInterval(0), unhealthyRelocations(0), rawProcessingUnhealthy( new AsyncVar(false) ) {} @@ -815,7 +814,7 @@ struct DDQueueData { // Data movement avoids overloading source servers in moving data. 
// SOMEDAY: the list of source servers may be outdated since they were fetched when the work was put in the queue // FIXME: we need spare capacity even when we're just going to be cancelling work via TEAM_HEALTHY - if( !canLaunch( rd, teamSize, busymap, cancellableRelocations ) ) { + if( !canLaunch( rd, teamSize, singleRegionTeamSize, busymap, cancellableRelocations ) ) { //logRelocation( rd, "SkippingQueuedRelocation" ); continue; } @@ -853,7 +852,7 @@ struct DDQueueData { RelocateData& rrs = inFlight.rangeContaining(ranges[r].begin)->value(); rrs.keys = ranges[r]; - launch( rrs, busymap ); + launch( rrs, busymap, singleRegionTeamSize ); activeRelocations++; startRelocation(rrs.priority, rrs.healthPriority); inFlightActors.insert( rrs.keys, dataDistributionRelocator( this, rrs ) ); @@ -1396,9 +1395,10 @@ ACTOR Future dataDistributionQueue( PromiseStream> getAverageShardBytes, UID distributorId, int teamSize, + int singleRegionTeamSize, double* lastLimited) { - state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, output, input, getShardMetrics, lastLimited ); + state DDQueueData self( distributorId, lock, cx, teamCollections, shardsAffectedByTeamFailure, getAverageShardBytes, teamSize, singleRegionTeamSize, output, input, getShardMetrics, lastLimited ); state std::set serversToLaunchFrom; state KeyRange keysToLaunchFrom; state RelocateData launchData; From 820957025fd1d470edf271f7902e1bd3dd91d10d Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 4 Mar 2020 14:24:57 -0800 Subject: [PATCH 07/16] accept connections in batches of 20 to improve performance --- fdbrpc/FlowTransport.actor.cpp | 6 +++++- flow/Knobs.cpp | 2 +- flow/Knobs.h | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 83d40b1753..a2baab3b3d 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -988,6 +988,7 
@@ ACTOR static Future connectionIncoming( TransportData* self, Reference listen( TransportData* self, NetworkAddress listenAddr ) { state ActorCollectionNoErrors incoming; // Actors monitoring incoming connections that haven't yet been associated with a peer state Reference listener = INetworkConnections::net()->listen( listenAddr ); + state int64_t connectionCount = 0; try { loop { Reference conn = wait( listener->accept() ); @@ -997,7 +998,10 @@ ACTOR static Future listen( TransportData* self, NetworkAddress listenAddr .detail("ListenAddress", listenAddr.toString()); incoming.add( connectionIncoming(self, conn) ); } - wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskPriority::WriteSocket)); + connectionCount++; + if( connectionCount%(FLOW_KNOBS->ACCEPT_BATCH_SIZE) == 0 ) { + wait(delay(0, TaskPriority::AcceptSocket)); + } } } catch (Error& e) { TraceEvent(SevError, "ListenError").error(e); diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index aa714551a0..751a8cd05b 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -67,7 +67,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( MAX_RECONNECTION_TIME, 0.5 ); init( RECONNECTION_TIME_GROWTH_RATE, 1.2 ); init( RECONNECTION_RESET_TIME, 5.0 ); - init( CONNECTION_ACCEPT_DELAY, 0.5 ); + init( ACCEPT_BATCH_SIZE, 20 ); init( USE_OBJECT_SERIALIZER, 1 ); init( TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY, 5.0 ); init( TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT, 20.0 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index 358fc82be0..a3bdd1572f 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -87,7 +87,7 @@ public: double MAX_RECONNECTION_TIME; double RECONNECTION_TIME_GROWTH_RATE; double RECONNECTION_RESET_TIME; - double CONNECTION_ACCEPT_DELAY; + int ACCEPT_BATCH_SIZE; int USE_OBJECT_SERIALIZER; int TLS_CERT_REFRESH_DELAY_SECONDS; From da579faf62bb9c70223ee971b2a896a87a16b03c Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 4 Mar 2020 14:25:30 -0800 Subject: [PATCH 08/16] add missing task priority --- 
flow/network.h | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/network.h b/flow/network.h index 127d765bba..bbefb0d146 100644 --- a/flow/network.h +++ b/flow/network.h @@ -44,6 +44,7 @@ enum class TaskPriority { DiskIOComplete = 9150, LoadBalancedEndpoint = 9000, ReadSocket = 9000, + AcceptSocket = 8950, Handshake = 8900, CoordinationReply = 8810, Coordination = 8800, From 35a1ac648255415e7039298ae847d6a720d57666 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 4 Mar 2020 14:26:01 -0800 Subject: [PATCH 09/16] prepare net2 for new versions of boost --- flow/Net2.actor.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index be6aecedd5..23bd3f724e 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -451,12 +451,13 @@ private: }; class Listener : public IListener, ReferenceCounted { + boost::asio::io_context& io_service; NetworkAddress listenAddress; tcp::acceptor acceptor; public: - Listener( boost::asio::io_service& io_service, NetworkAddress listenAddress ) - : listenAddress(listenAddress), acceptor( io_service, tcpEndpoint( listenAddress ) ) + Listener( boost::asio::io_context& io_service, NetworkAddress listenAddress ) + : io_service(io_service), listenAddress(listenAddress), acceptor( io_service, tcpEndpoint( listenAddress ) ) { platform::setCloseOnExec(acceptor.native_handle()); } @@ -473,7 +474,7 @@ public: private: ACTOR static Future> doAccept( Listener* self ) { - state Reference conn( new Connection( self->acceptor.get_io_service() ) ); + state Reference conn( new Connection( self->io_service ) ); state tcp::acceptor::endpoint_type peer_endpoint; try { BindPromise p("N2_AcceptError", UID()); @@ -785,13 +786,14 @@ private: }; class SSLListener : public IListener, ReferenceCounted { + boost::asio::io_context& io_service; NetworkAddress listenAddress; tcp::acceptor acceptor; boost::asio::ssl::context* context; public: - SSLListener( boost::asio::io_service& 
io_service, boost::asio::ssl::context* context, NetworkAddress listenAddress ) - : listenAddress(listenAddress), acceptor( io_service, tcpEndpoint( listenAddress ) ), context(context) + SSLListener( boost::asio::io_context& io_service, boost::asio::ssl::context* context, NetworkAddress listenAddress ) + : io_service(io_service), listenAddress(listenAddress), acceptor( io_service, tcpEndpoint( listenAddress ) ), context(context) { platform::setCloseOnExec(acceptor.native_handle()); } @@ -808,7 +810,7 @@ public: private: ACTOR static Future> doAccept( SSLListener* self ) { - state Reference conn( new SSLConnection( self->acceptor.get_io_service(), *self->context) ); + state Reference conn( new SSLConnection( self->io_service, *self->context) ); state tcp::acceptor::endpoint_type peer_endpoint; try { BindPromise p("N2_AcceptError", UID()); @@ -861,7 +863,7 @@ Net2::Net2(bool useThreadPool, bool useMetrics, Reference tlsPolicy, tlsPolicy(tlsPolicy), tlsParams(tlsParams) #ifndef TLS_DISABLED - ,sslContext(boost::asio::ssl::context(boost::asio::ssl::context::tlsv12)) + ,sslContext(boost::asio::ssl::context(boost::asio::ssl::context::tls)) #endif { From 7cbabca124475eedac9e645f0f1f376893106879 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 4 Mar 2020 15:06:22 -0800 Subject: [PATCH 10/16] remove printing to stderr from initTLS because that could cause problems on clients --- flow/Net2.actor.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 23bd3f724e..420289b0a7 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -954,8 +954,7 @@ void Net2::initTLS() { sslContext.add_certificate_authority(boost::asio::buffer(cert.data(), cert.size())); } catch (Error& e) { - fprintf(stderr, "Error reading CA file %s: %s\n", tlsParams.tlsCAPath.c_str(), e.what()); - TraceEvent("Net2TLSReadCAError").error(e); + TraceEvent("Net2TLSReadCAError").error(e).detail("CAPath", tlsParams.tlsCAPath); throw 
tls_error(); } } @@ -978,7 +977,6 @@ void Net2::initTLS() { } } } catch(boost::system::system_error e) { - fprintf(stderr, "Error initializing TLS: %s\n", e.what()); TraceEvent("Net2TLSInitError").detail("Message", e.what()); throw tls_error(); } From b3c3f8aa5f5f52db5196cb1d6a272b2a722c4183 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 4 Mar 2020 15:35:51 -0800 Subject: [PATCH 11/16] Update flow/genericactors.actor.h Pass by reference --- flow/genericactors.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index e2f9eda32f..6fdb646fb7 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -669,7 +669,7 @@ class ReferencedObject : NonCopyable, public ReferenceCounted> from(V v) { + static Reference> from(V const& v) { return Reference>(new ReferencedObject(v)); } From 58e621eca184ae5a94feebc2b27f135b9570d7d9 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 4 Mar 2020 15:50:04 -0800 Subject: [PATCH 12/16] Invalid knobs or knob values are treated as warnings rather than errors. Apply this change to backup as well. --- documentation/sphinx/source/release-notes.rst | 5 +++++ fdbbackup/backup.actor.cpp | 11 +++++++---- fdbcli/fdbcli.actor.cpp | 9 +++++---- fdbserver/fdbserver.actor.cpp | 4 +++- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 73250b270b..67b54a7867 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -10,6 +10,11 @@ Features * Add support for setting knobs in fdbcli. `(PR #2773) `_. +Other Changes +------------- + +* Setting invalid knobs in backup and DR binaries is now a warning instead of an error and will not result in the application being terminated. `(PR #2773) `_. 
+ 6.2.17 ====== diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 2ea44f1a99..e523de965e 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -3196,14 +3196,17 @@ int main(int argc, char* argv[]) { if (!flowKnobs->setKnob( k->first, k->second ) && !clientKnobs->setKnob( k->first, k->second )) { - fprintf(stderr, "Unrecognized knob option '%s'\n", k->first.c_str()); - return FDB_EXIT_ERROR; + fprintf(stderr, "WARNING: Unrecognized knob option '%s'\n", k->first.c_str()); + TraceEvent(SevWarnAlways, "UnrecognizedKnobOption").detail("Knob", printable(k->first)); } } catch (Error& e) { if (e.code() == error_code_invalid_option_value) { - fprintf(stderr, "Invalid value '%s' for option '%s'\n", k->second.c_str(), k->first.c_str()); - return FDB_EXIT_ERROR; + fprintf(stderr, "WARNING: Invalid value '%s' for knob option '%s'\n", k->second.c_str(), k->first.c_str()); + TraceEvent(SevWarnAlways, "InvalidKnobValue").detail("Knob", printable(k->first)).detail("Value", printable(k->second)); } + + fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", k->first.c_str(), e.what()); + TraceEvent(SevError, "FailedToSetKnob").detail("Knob", printable(k->first)).detail("Value", printable(k->second)).error(e); throw; } } diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 0697fe821d..15319d1431 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -2490,16 +2490,17 @@ struct CLIOptions { if (!flowKnobs->setKnob( k->first, k->second ) && !clientKnobs->setKnob( k->first, k->second )) { - fprintf(stderr, "ERROR: Unrecognized knob option '%s'\n", k->first.c_str()); - exit_code = FDB_EXIT_ERROR; + fprintf(stderr, "WARNING: Unrecognized knob option '%s'\n", k->first.c_str()); + TraceEvent(SevWarnAlways, "UnrecognizedKnobOption").detail("Knob", printable(k->first)); } } catch (Error& e) { if (e.code() == error_code_invalid_option_value) { - fprintf(stderr, "ERROR: Invalid value '%s' for knob option 
'%s'\n", k->second.c_str(), k->first.c_str()); - exit_code = FDB_EXIT_ERROR; + fprintf(stderr, "WARNING: Invalid value '%s' for knob option '%s'\n", k->second.c_str(), k->first.c_str()); + TraceEvent(SevWarnAlways, "InvalidKnobValue").detail("Knob", printable(k->first)).detail("Value", printable(k->second)); } else { fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", k->first.c_str(), e.what()); + TraceEvent(SevError, "FailedToSetKnob").detail("Knob", printable(k->first)).detail("Value", printable(k->second)).error(e); exit_code = FDB_EXIT_ERROR; } } diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 9f43cd2bde..70b0cc7346 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1476,9 +1476,11 @@ int main(int argc, char* argv[]) { } } catch (Error& e) { if (e.code() == error_code_invalid_option_value) { - fprintf(stderr, "WARNING: Invalid value '%s' for option '%s'\n", k->second.c_str(), k->first.c_str()); + fprintf(stderr, "WARNING: Invalid value '%s' for knob option '%s'\n", k->second.c_str(), k->first.c_str()); TraceEvent(SevWarnAlways, "InvalidKnobValue").detail("Knob", printable(k->first)).detail("Value", printable(k->second)); } else { + fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", k->first.c_str(), e.what()); + TraceEvent(SevError, "FailedToSetKnob").detail("Knob", printable(k->first)).detail("Value", printable(k->second)).error(e); throw; } } From cdcb81686688336a819b8603123f129aa49dd8ad Mon Sep 17 00:00:00 2001 From: Evan Tschannen <36455792+etschannen@users.noreply.github.com> Date: Wed, 4 Mar 2020 16:08:45 -0800 Subject: [PATCH 13/16] Update fdbbackup/backup.actor.cpp --- fdbbackup/backup.actor.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index e523de965e..afc27f7681 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -3204,10 +3204,11 @@ int main(int argc, 
char* argv[]) { fprintf(stderr, "WARNING: Invalid value '%s' for knob option '%s'\n", k->second.c_str(), k->first.c_str()); TraceEvent(SevWarnAlways, "InvalidKnobValue").detail("Knob", printable(k->first)).detail("Value", printable(k->second)); } - - fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", k->first.c_str(), e.what()); - TraceEvent(SevError, "FailedToSetKnob").detail("Knob", printable(k->first)).detail("Value", printable(k->second)).error(e); - throw; + else { + fprintf(stderr, "ERROR: Failed to set knob option '%s': %s\n", k->first.c_str(), e.what()); + TraceEvent(SevError, "FailedToSetKnob").detail("Knob", printable(k->first)).detail("Value", printable(k->second)).error(e); + throw; + } } } From 976c2fc7a834aee4100cc489c91435f81cdb6257 Mon Sep 17 00:00:00 2001 From: Evan Tschannen <36455792+etschannen@users.noreply.github.com> Date: Wed, 4 Mar 2020 16:13:59 -0800 Subject: [PATCH 14/16] Update fdbrpc/FlowTransport.actor.cpp Co-Authored-By: Alex Miller <35046903+alexmiller-apple@users.noreply.github.com> --- fdbrpc/FlowTransport.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index a2baab3b3d..5041489d67 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -988,7 +988,7 @@ ACTOR static Future connectionIncoming( TransportData* self, Reference listen( TransportData* self, NetworkAddress listenAddr ) { state ActorCollectionNoErrors incoming; // Actors monitoring incoming connections that haven't yet been associated with a peer state Reference listener = INetworkConnections::net()->listen( listenAddr ); - state int64_t connectionCount = 0; + state uint64_t connectionCount = 0; try { loop { Reference conn = wait( listener->accept() ); From 6d6f184e2f96934d681254ea650aeb0ab4ad60be Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 4 Mar 2020 16:23:49 -0800 Subject: [PATCH 15/16] added a knob which reverts the new queue 
behavior --- fdbserver/DataDistributionQueue.actor.cpp | 3 +++ fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + 3 files changed, 5 insertions(+) diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index 8540bde61d..b33349b2b2 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -306,6 +306,9 @@ bool canLaunch( RelocateData & relocation, int teamSize, int singleRegionTeamSiz // find the "workFactor" for this, were it launched now int workFactor = getWorkFactor( relocation, singleRegionTeamSize ); int neededServers = std::min( relocation.src.size(), teamSize - singleRegionTeamSize + 1 ); + if(SERVER_KNOBS->USE_OLD_NEEDED_SERVERS) { + neededServers = std::max( 1, (int)relocation.src.size() - teamSize + 1 ); + } // see if each of the SS can launch this task for( int i = 0; i < relocation.src.size(); i++ ) { // For each source server for this relocation, copy and modify its busyness to reflect work that WOULD be cancelled diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 2ce0aac021..783f8f319f 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -104,6 +104,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs, bool isSimula init( INFLIGHT_PENALTY_HEALTHY, 1.0 ); init( INFLIGHT_PENALTY_UNHEALTHY, 500.0 ); init( INFLIGHT_PENALTY_ONE_LEFT, 1000.0 ); + init( USE_OLD_NEEDED_SERVERS, false ); init( PRIORITY_RECOVER_MOVE, 110 ); init( PRIORITY_REBALANCE_UNDERUTILIZED_TEAM, 120 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index c5c41fc58f..fc54a7b065 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -104,6 +104,7 @@ public: double INFLIGHT_PENALTY_REDUNDANT; double INFLIGHT_PENALTY_UNHEALTHY; double INFLIGHT_PENALTY_ONE_LEFT; + bool USE_OLD_NEEDED_SERVERS; // Higher priorities are executed first // Priority/100 is the "priority group"/"superpriority". 
Priority inversion From b353ea1fd1e765b6fb20e3e6bfee3b5bbc231f86 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 4 Mar 2020 17:40:59 -0800 Subject: [PATCH 16/16] updated documentation --- documentation/sphinx/source/downloads.rst | 24 +++++++++---------- documentation/sphinx/source/release-notes.rst | 14 ++++++++++- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/documentation/sphinx/source/downloads.rst b/documentation/sphinx/source/downloads.rst index 6659b6a27f..c9084089ab 100644 --- a/documentation/sphinx/source/downloads.rst +++ b/documentation/sphinx/source/downloads.rst @@ -10,38 +10,38 @@ macOS The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server. -* `FoundationDB-6.2.17.pkg `_ +* `FoundationDB-6.2.18.pkg `_ Ubuntu ------ The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x. -* `foundationdb-clients-6.2.17-1_amd64.deb `_ -* `foundationdb-server-6.2.17-1_amd64.deb `_ (depends on the clients package) +* `foundationdb-clients-6.2.18-1_amd64.deb `_ +* `foundationdb-server-6.2.18-1_amd64.deb `_ (depends on the clients package) RHEL/CentOS EL6 --------------- The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x. -* `foundationdb-clients-6.2.17-1.el6.x86_64.rpm `_ -* `foundationdb-server-6.2.17-1.el6.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.18-1.el6.x86_64.rpm `_ +* `foundationdb-server-6.2.18-1.el6.x86_64.rpm `_ (depends on the clients package) RHEL/CentOS EL7 --------------- The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x. 
-* `foundationdb-clients-6.2.17-1.el7.x86_64.rpm `_ -* `foundationdb-server-6.2.17-1.el7.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.18-1.el7.x86_64.rpm `_ +* `foundationdb-server-6.2.18-1.el7.x86_64.rpm `_ (depends on the clients package) Windows ------- The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server. -* `foundationdb-6.2.17-x64.msi `_ +* `foundationdb-6.2.18-x64.msi `_ API Language Bindings ===================== @@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package: -* `foundationdb-6.2.17.tar.gz `_ +* `foundationdb-6.2.18.tar.gz `_ Ruby 1.9.3/2.0.0+ ----------------- -* `fdb-6.2.17.gem `_ +* `fdb-6.2.18.gem `_ Java 8+ ------- -* `fdb-java-6.2.17.jar `_ -* `fdb-java-6.2.17-javadoc.jar `_ +* `fdb-java-6.2.18.jar `_ +* `fdb-java-6.2.18-javadoc.jar `_ Go 1.11+ -------- diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 67b54a7867..cc632dab6f 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -5,6 +5,18 @@ Release Notes 6.2.18 ====== +Fixes +----- + +* When configuring a cluster to usable_regions=2, data distribution would not react to machine failures while copying data to the remote region. `(PR #2774) `_. +* When a cluster is configured with usable_regions=2, data distribution could push a cluster into saturation by relocating too many shards simultaneously. `(PR #2776) `_. +* Backup could not establish TLS connections (broken in 6.2.16). `(PR #2775) `_. + +Performance +----------- + +* Improved the efficiency of establishing large numbers of network connections. `(PR #2777) `_. 
+ Features -------- @@ -21,7 +33,7 @@ Other Changes Fixes ----- -* Restored the ability to set TLS configuration using environment variables. `(PR #2755) `_. +* Restored the ability to set TLS configuration using environment variables (broken in 6.2.16). `(PR #2755) `_. 6.2.16 ======