From 8425f53fc5b3f271ce6246156ffd345db5183155 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Sun, 28 Jul 2019 23:52:29 -0700 Subject: [PATCH 1/4] clients only connect to three proxies --- fdbclient/Knobs.cpp | 1 + fdbclient/Knobs.h | 1 + fdbclient/MonitorLeader.actor.cpp | 9 ++++++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index 788686023c..1c62b8fe26 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -45,6 +45,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( COORDINATOR_RECONNECTION_DELAY, 1.0 ); init( CLIENT_EXAMPLE_AMOUNT, 20 ); init( MAX_CLIENT_STATUS_AGE, 1.0 ); + init( MAX_CLIENT_PROXY_CONNECTIONS, 3 ); if( randomize && BUGGIFY ) MAX_CLIENT_PROXY_CONNECTIONS = 1; // wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin diff --git a/fdbclient/Knobs.h b/fdbclient/Knobs.h index 099bbb4306..eb40e8d7f3 100644 --- a/fdbclient/Knobs.h +++ b/fdbclient/Knobs.h @@ -44,6 +44,7 @@ public: double COORDINATOR_RECONNECTION_DELAY; int CLIENT_EXAMPLE_AMOUNT; double MAX_CLIENT_STATUS_AGE; + int MAX_CLIENT_PROXY_CONNECTIONS; // wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. 
dumping the database after a test) diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 15735a26ed..20d2c62d0c 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -601,8 +601,15 @@ ACTOR Future getClientInfoFromLeader( ReferenceclientInfo->get().id; choose { - when( ClientDBInfo ni = wait( brokenPromiseToNever( knownLeader->get().get().clientInterface.openDatabase.getReply( req ) ) ) ) { + when( state ClientDBInfo ni = wait( brokenPromiseToNever( knownLeader->get().get().clientInterface.openDatabase.getReply( req ) ) ) ) { TraceEvent("MonitorLeaderForProxiesGotClientInfo", knownLeader->get().get().clientInterface.id()).detail("Proxy0", ni.proxies.size() ? ni.proxies[0].id() : UID()).detail("ClientID", ni.id); + if(ni.proxies.size() > CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS) { + deterministicRandom()->randomShuffle(ni.proxies); + ni.proxies.resize(CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS); + for(int i = 0; i < ni.proxies.size(); i++) { + TraceEvent("ClientConnectedProxy", knownLeader->get().get().clientInterface.id()).detail("Proxy", ni.proxies[i].id()); + } + } clientData->clientInfo->set(ni); } when( wait( knownLeader->onChange() ) ) {} From 7aece7398baffddc4f10287435cba29b54136d74 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 30 Jul 2019 17:15:24 -0700 Subject: [PATCH 2/4] fix: it was reducing the list of proxies on the coordinators, which would have made all the clients talking to that coordinator connect to the same set of proxies; also optimized the code to avoid re-randomizing the same list of proxies --- fdbclient/Knobs.cpp | 2 +- fdbclient/MonitorLeader.actor.cpp | 25 ++++++++++++++++++------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index 1c62b8fe26..d9777a1f1e 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -45,7 +45,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( 
COORDINATOR_RECONNECTION_DELAY, 1.0 ); init( CLIENT_EXAMPLE_AMOUNT, 20 ); init( MAX_CLIENT_STATUS_AGE, 1.0 ); - init( MAX_CLIENT_PROXY_CONNECTIONS, 3 ); if( randomize && BUGGIFY ) MAX_CLIENT_PROXY_CONNECTIONS = 1; + init( MAX_CLIENT_PROXY_CONNECTIONS, 5 ); if( randomize && BUGGIFY ) MAX_CLIENT_PROXY_CONNECTIONS = 1; // wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 20d2c62d0c..4223569569 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -603,13 +603,6 @@ ACTOR Future getClientInfoFromLeader( Referenceget().get().clientInterface.openDatabase.getReply( req ) ) ) ) { TraceEvent("MonitorLeaderForProxiesGotClientInfo", knownLeader->get().get().clientInterface.id()).detail("Proxy0", ni.proxies.size() ? ni.proxies[0].id() : UID()).detail("ClientID", ni.id); - if(ni.proxies.size() > CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS) { - deterministicRandom()->randomShuffle(ni.proxies); - ni.proxies.resize(CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS); - for(int i = 0; i < ni.proxies.size(); i++) { - TraceEvent("ClientConnectedProxy", knownLeader->get().get().clientInterface.id()).detail("Proxy", ni.proxies[i].id()); - } - } clientData->clientInfo->set(ni); } when( wait( knownLeader->onChange() ) ) {} @@ -674,6 +667,8 @@ ACTOR Future monitorProxiesOneGeneration( Reference incorrectTime; + state std::vector lastProxyUIDs; + deterministicRandom()->randomShuffle(addrs); loop { state ClientLeaderRegInterface clientLeaderServer( addrs[idx] ); @@ -723,6 +718,22 @@ ACTOR Future monitorProxiesOneGeneration( ReferencenotifyConnected(); + auto& ni = rep.get(); + if(ni.proxies.size() > CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS) { + std::vector proxyUIDs; + for(auto& proxy : ni.proxies) { + proxyUIDs.push_back(proxy.id()); + } + if(proxyUIDs != lastProxyUIDs) { + lastProxyUIDs = proxyUIDs; + 
deterministicRandom()->randomShuffle(ni.proxies); + ni.proxies.resize(CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS); + for(int i = 0; i < ni.proxies.size(); i++) { + TraceEvent("ClientConnectedProxy", knownLeader->get().get().clientInterface.id()).detail("Proxy", ni.proxies[i].id()); + } + } + } + clientInfo->set( rep.get() ); successIdx = idx; } else if(idx == successIdx) { From 85767f2034293f41aa4241cd0583fc7245346303 Mon Sep 17 00:00:00 2001 From: Evan Tschannen <36455792+etschannen@users.noreply.github.com> Date: Tue, 30 Jul 2019 17:19:33 -0700 Subject: [PATCH 3/4] Update fdbclient/MonitorLeader.actor.cpp --- fdbclient/MonitorLeader.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 4223569569..d4140fb59a 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -601,7 +601,7 @@ ACTOR Future getClientInfoFromLeader( ReferenceclientInfo->get().id; choose { - when( state ClientDBInfo ni = wait( brokenPromiseToNever( knownLeader->get().get().clientInterface.openDatabase.getReply( req ) ) ) ) { + when( ClientDBInfo ni = wait( brokenPromiseToNever( knownLeader->get().get().clientInterface.openDatabase.getReply( req ) ) ) ) { TraceEvent("MonitorLeaderForProxiesGotClientInfo", knownLeader->get().get().clientInterface.id()).detail("Proxy0", ni.proxies.size() ? 
ni.proxies[0].id() : UID()).detail("ClientID", ni.id); clientData->clientInfo->set(ni); } From 54df2abe8e4f14abd11f08af7f32c61208f99c37 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 30 Jul 2019 17:52:53 -0700 Subject: [PATCH 4/4] fix: trace event did not compile --- fdbclient/MonitorLeader.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index d4140fb59a..c296807a09 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -729,7 +729,7 @@ ACTOR Future monitorProxiesOneGeneration( ReferencerandomShuffle(ni.proxies); ni.proxies.resize(CLIENT_KNOBS->MAX_CLIENT_PROXY_CONNECTIONS); for(int i = 0; i < ni.proxies.size(); i++) { - TraceEvent("ClientConnectedProxy", knownLeader->get().get().clientInterface.id()).detail("Proxy", ni.proxies[i].id()); + TraceEvent("ClientConnectedProxy").detail("Proxy", ni.proxies[i].id()); } } }