From e8f994965d00cc4db1dac85a74687fa5c6a531c0 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 29 Mar 2018 18:00:12 -0700 Subject: [PATCH 01/25] python: Post-API Version 620, @fdb.transactional on a generator will throw. Previously, writing code like @fdb.transactional def foo(tr): yield tr.get('a') yield tr.get('b') print(foo(db)) was accepted by the python bindings, but had surprising semantics. The function returned by @fdb.transactional opens a transaction, runs foo(), commits the transaction, and then returns the generator returned by foo(). This generator then uses the committed transaction. This worked before API version 410 (FDB 4.1), and hasn't worked since. This will also be a problem if a closure is returned from foo() that contains `tr`, but it's much harder to check that in Python. Rather than allow users to hit an unexpected and mysterious "Operation issued while a commit was outstanding" exception, it's nicer to explicitly highlight this problem as soon as we can. Unfortunately, we have no way to know that a function will return a generator until we call it, so that's the soonest we can give a more informative error. --- bindings/python/fdb/impl.py | 21 +++++++++++++-------- bindings/python/tests/tester.py | 10 ++++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index 77c7a74d13..e69bda22cb 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -22,16 +22,17 @@ import ctypes import ctypes.util +import datetime import functools +import inspect +import multiprocessing +import os +import platform +import sys import threading import traceback -import inspect -import datetime -import platform -import os -import sys -import multiprocessing +import fdb from fdb import six _network_thread = None @@ -203,7 +204,9 @@ def transactional(*tr_args, **tr_kwargs): It is important to note that the wrapped method may be called multiple times in the event of a commit failure, until the commit - succeeds. + succeeds. This restriction requires that the wrapped function + may not be a generator, or a function that returns a closure that + contains the `tr` object. If given a Transaction, the Transaction will be passed into the wrapped code, and WILL NOT be committed at completion of the @@ -247,7 +250,6 @@ def transactional(*tr_args, **tr_kwargs): except FDBError as e: yield asyncio.From(tr.on_error(e.code)) else: - @functools.wraps(func) def wrapper(*args, **kwargs): if isinstance(args[index], TransactionRead): @@ -269,6 +271,9 @@ def transactional(*tr_args, **tr_kwargs): except FDBError as e: tr.on_error(e.code).wait() + if fdb.get_api_version() >= 620 and isinstance(ret, types.GeneratorType): + raise ValueError("Generators can not be wrapped with fdb.transactional") + # now = datetime.datetime.now() # td = now - last # elapsed = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / float(10**6) diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 32ae2c01a3..c430d699f9 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -125,6 +125,16 @@ class Instruction: self.stack.push(self.index, val) +def test_fdb_transactional_generator(db): + try: + @fdb.transactional + def function_that_yields(tr): + yield 0 + assert fdb.get_api_version() < 620, "Generators post-6.2.0 should throw" + except ValueError as e: + pass + + def test_db_options(db): db.options.set_max_watches(100001) db.options.set_datacenter_id("dc_id") From 4098eb721e581f24ebee7d4213c3f7b1018db856 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Wed, 19 Jun 2019 02:35:07 -0700 Subject: [PATCH 02/25] Upon reflection, I think this test was doing nothing. --- bindings/python/tests/tester.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index c430d699f9..98e12dd61e 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -130,6 +130,7 @@ def test_fdb_transactional_generator(db): @fdb.transactional def function_that_yields(tr): yield 0 + function_that_yields(db) assert fdb.get_api_version() < 620, "Generators post-6.2.0 should throw" except ValueError as e: pass From 100afcd4f8dc67cdfc3b379464e5c71fdaed2135 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 27 Jun 2019 18:56:29 -0700 Subject: [PATCH 03/25] Remove @fdb.transactional from docs example. To reflect that this would fail if a database was passed in and not an open transaction. --- documentation/sphinx/source/developer-guide.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/documentation/sphinx/source/developer-guide.rst b/documentation/sphinx/source/developer-guide.rst index e17651a79c..f9ea20e6a7 100644 --- a/documentation/sphinx/source/developer-guide.rst +++ b/documentation/sphinx/source/developer-guide.rst @@ -463,7 +463,6 @@ In some situations, you may want to explicitly control the number of key-value p LIMIT = 100 # adjust according to data characteristics - @fdb.transactional def get_range_limited(tr, begin, end): keys_found = True while keys_found: From 40b290b7de8fc4ccff5f9bdfd35f6ffa92206a5e Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 27 Jun 2019 18:57:58 -0700 Subject: [PATCH 04/25] If `func` throws, then `ret` will never be assigned, and accessed later. Thus we now initiailize it with None so that it will always have a value. --- bindings/python/fdb/impl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index e69bda22cb..f11b15711a 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -264,6 +264,7 @@ def transactional(*tr_args, **tr_kwargs): # last = start while not committed: + ret = None try: ret = func(*largs, **kwargs) tr.commit().wait() From 4ceae8c83a3017e505f09b4c125eb5ec5b39f617 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 23 Jan 2020 16:09:56 -0800 Subject: [PATCH 05/25] Ban generators in 7.0.0 --- bindings/python/fdb/impl.py | 11 ++++++++--- bindings/python/tests/tester.py | 18 +++++++++++++++--- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index 490b10cc46..5641832970 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -252,6 +252,12 @@ def transactional(*tr_args, **tr_kwargs): else: @functools.wraps(func) def wrapper(*args, **kwargs): + # We can't throw this from the decorator, as when a user runs + # >>> import fdb ; fdb.api_version(700) + # the code above uses @transactional before the API version is set + if fdb.get_api_version() >= 700 and inspect.isgeneratorfunction(func): + raise ValueError("Generators can not be wrapped with fdb.transactional") + if isinstance(args[index], TransactionRead): return func(*args, **kwargs) @@ -267,14 +273,13 @@ def transactional(*tr_args, **tr_kwargs): ret = None try: ret = func(*largs, **kwargs) + if fdb.get_api_version() >= 700 and inspect.isgenerator(ret): + raise ValueError("Generators can not be wrapped with fdb.transactional") tr.commit().wait() committed = True except FDBError as e: tr.on_error(e.code).wait() - if fdb.get_api_version() >= 620 and isinstance(ret, types.GeneratorType): - raise ValueError("Generators can not be wrapped with fdb.transactional") - # now = datetime.datetime.now() # td = now - last # elapsed = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / float(10**6) diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 4376a29766..a32de5114b 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -132,10 +132,22 @@ def test_fdb_transactional_generator(db): @fdb.transactional def function_that_yields(tr): yield 0 - function_that_yields(db) - assert fdb.get_api_version() < 620, "Generators post-6.2.0 should throw" + assert fdb.get_api_version() < 700, "Pre-7.0, a decorators may wrap a function that yield" except ValueError as e: - pass + assert fdb.get_api_version() >= 700, "Post-7.0, a decorator should throw if wrapped function yields" + + +def test_fdb_transactional_returns_generator(db): + try: + def function_that_yields(tr): + yield 0 + @fdb.transactional + def function_that_returns(tr): + return function_that_yields(tr) + function_that_returns() + assert fdb.get_api_version() < 700, "Pre-7.0, returning a generator is allowed" + except ValueError as e: + assert fdb.get_api_version() >= 700, "Post-7.0, returning a generator should throw" def test_db_options(db): From d688536a7c2a3a41bc16e5066a6dfdc2825978e7 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 23 Jan 2020 16:53:02 -0800 Subject: [PATCH 06/25] Remove fdb.transactional from get_range_limited, due to generator --- documentation/sphinx/source/segmented-range-reads.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/documentation/sphinx/source/segmented-range-reads.rst b/documentation/sphinx/source/segmented-range-reads.rst index c5a7bfa69f..70068d8a9a 100644 --- a/documentation/sphinx/source/segmented-range-reads.rst +++ b/documentation/sphinx/source/segmented-range-reads.rst @@ -48,7 +48,6 @@ Code Here’s a basic function that successively reads sub-ranges of a size determined by the value of ``LIMIT``. :: - @fdb.transactional def get_range_limited(tr, begin, end): keys_found = True while keys_found: From 986905f3dda71fe5d7a9c79d179b1e33dec51f9f Mon Sep 17 00:00:00 2001 From: Alex Miller <35046903+alexmiller-apple@users.noreply.github.com> Date: Thu, 6 Feb 2020 16:57:14 -0800 Subject: [PATCH 07/25] Update bindings/python/tests/tester.py Co-Authored-By: A.J. Beamon --- bindings/python/tests/tester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index a32de5114b..5db96ac9a0 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -132,7 +132,7 @@ def test_fdb_transactional_generator(db): @fdb.transactional def function_that_yields(tr): yield 0 - assert fdb.get_api_version() < 700, "Pre-7.0, a decorators may wrap a function that yield" + assert fdb.get_api_version() < 700, "Pre-7.0, a decorator may wrap a function that yields" except ValueError as e: assert fdb.get_api_version() >= 700, "Post-7.0, a decorator should throw if wrapped function yields" From 9dc65547adbe14938a14be8581dc8f096b4b32ef Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 11 May 2020 11:24:19 -0700 Subject: [PATCH 08/25] Avoid setting failure status of local addresses in main thread --- fdbrpc/FailureMonitor.actor.cpp | 9 +++++++++ fdbrpc/FailureMonitor.h | 2 +- fdbrpc/FlowTransport.actor.cpp | 12 +++--------- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fdbrpc/FailureMonitor.actor.cpp b/fdbrpc/FailureMonitor.actor.cpp index fcea9ec014..f47dfc3d8a 100644 --- a/fdbrpc/FailureMonitor.actor.cpp +++ b/fdbrpc/FailureMonitor.actor.cpp @@ -64,6 +64,15 @@ Future IFailureMonitor::onFailedFor(Endpoint const& endpoint, double susta return waitForContinuousFailure(this, endpoint, sustainedFailureDuration, slope); } +SimpleFailureMonitor::SimpleFailureMonitor() : endpointKnownFailed() { + // Mark ourselves as avaiable in FailureMonitor + const auto& localAddresses = FlowTransport::transport().getLocalAddresses(); + addressStatus[localAddresses.address] = FailureStatus(false); + if (localAddresses.secondaryAddress.present()) { + addressStatus[localAddresses.secondaryAddress.get()] = FailureStatus(false); + } +} + void SimpleFailureMonitor::setStatus(NetworkAddress const& address, FailureStatus const& status) { // if (status.failed) diff --git a/fdbrpc/FailureMonitor.h b/fdbrpc/FailureMonitor.h index dd36517842..d6d11e6e3e 100644 --- a/fdbrpc/FailureMonitor.h +++ b/fdbrpc/FailureMonitor.h @@ -136,7 +136,7 @@ public: class SimpleFailureMonitor : public IFailureMonitor { public: - SimpleFailureMonitor() : endpointKnownFailed() {} + SimpleFailureMonitor(); void setStatus(NetworkAddress const& address, FailureStatus const& status); void endpointNotFound(Endpoint const&); virtual void notifyDisconnect(NetworkAddress const&); diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index e659602aca..aa8fc5f25a 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -586,6 +586,7 @@ ACTOR Future connectionKeeper( Reference self, if (firstConnFailedTime.present()) { if (now() - firstConnFailedTime.get() > FLOW_KNOBS->PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT) { TraceEvent(SevWarnAlways, "PeerUnavailableForLongTime", conn ? conn->getDebugID() : UID()) + .suppressFor(1.0) .detail("PeerAddr", self->destination); firstConnFailedTime = now() - FLOW_KNOBS->PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT/2.0; } @@ -1446,18 +1447,11 @@ bool FlowTransport::incompatibleOutgoingConnectionsPresent() { } void FlowTransport::createInstance(bool isClient, uint64_t transportId) { - g_network->setGlobal(INetwork::enFailureMonitor, (flowGlobalType) new SimpleFailureMonitor()); - g_network->setGlobal(INetwork::enClientFailureMonitor, isClient ? (flowGlobalType)1 : nullptr); g_network->setGlobal(INetwork::enFlowTransport, (flowGlobalType) new FlowTransport(transportId)); g_network->setGlobal(INetwork::enNetworkAddressFunc, (flowGlobalType) &FlowTransport::getGlobalLocalAddress); g_network->setGlobal(INetwork::enNetworkAddressesFunc, (flowGlobalType) &FlowTransport::getGlobalLocalAddresses); - - // Mark ourselves as avaiable in FailureMonitor - const auto& localAddresses = FlowTransport::transport().getLocalAddresses(); - IFailureMonitor::failureMonitor().setStatus(localAddresses.address, FailureStatus(false)); - if (localAddresses.secondaryAddress.present()) { - IFailureMonitor::failureMonitor().setStatus(localAddresses.secondaryAddress.get(), FailureStatus(false)); - } + g_network->setGlobal(INetwork::enFailureMonitor, (flowGlobalType) new SimpleFailureMonitor()); + g_network->setGlobal(INetwork::enClientFailureMonitor, isClient ? (flowGlobalType)1 : nullptr); } HealthMonitor* FlowTransport::healthMonitor() { From ffc36dcf45b7cd1db823b193ded648f2247774c9 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 11 May 2020 11:53:34 -0700 Subject: [PATCH 09/25] 7.0 became 6.3, so change API versions accordingly --- bindings/python/fdb/impl.py | 6 +++--- bindings/python/tests/tester.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bindings/python/fdb/impl.py b/bindings/python/fdb/impl.py index 58acec1150..3dd5e87077 100644 --- a/bindings/python/fdb/impl.py +++ b/bindings/python/fdb/impl.py @@ -253,9 +253,9 @@ def transactional(*tr_args, **tr_kwargs): @functools.wraps(func) def wrapper(*args, **kwargs): # We can't throw this from the decorator, as when a user runs - # >>> import fdb ; fdb.api_version(700) + # >>> import fdb ; fdb.api_version(630) # the code above uses @transactional before the API version is set - if fdb.get_api_version() >= 700 and inspect.isgeneratorfunction(func): + if fdb.get_api_version() >= 630 and inspect.isgeneratorfunction(func): raise ValueError("Generators can not be wrapped with fdb.transactional") if isinstance(args[index], TransactionRead): @@ -273,7 +273,7 @@ def transactional(*tr_args, **tr_kwargs): ret = None try: ret = func(*largs, **kwargs) - if fdb.get_api_version() >= 700 and inspect.isgenerator(ret): + if fdb.get_api_version() >= 630 and inspect.isgenerator(ret): raise ValueError("Generators can not be wrapped with fdb.transactional") tr.commit().wait() committed = True diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index bb564a188f..a2ae66dde6 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -132,9 +132,9 @@ def test_fdb_transactional_generator(db): @fdb.transactional def function_that_yields(tr): yield 0 - assert fdb.get_api_version() < 700, "Pre-7.0, a decorators may wrap a function that yield" + assert fdb.get_api_version() < 630, "Pre-6.3, a decorators may wrap a function that yield" except ValueError as e: - assert fdb.get_api_version() >= 700, "Post-7.0, a decorator should throw if wrapped function yields" + assert fdb.get_api_version() >= 630, "Post-6.3, a decorator should throw if wrapped function yields" def test_fdb_transactional_returns_generator(db): @@ -145,9 +145,9 @@ def test_fdb_transactional_returns_generator(db): def function_that_returns(tr): return function_that_yields(tr) function_that_returns() - assert fdb.get_api_version() < 700, "Pre-7.0, returning a generator is allowed" + assert fdb.get_api_version() < 630, "Pre-6.3, returning a generator is allowed" except ValueError as e: - assert fdb.get_api_version() >= 700, "Post-7.0, returning a generator should throw" + assert fdb.get_api_version() >= 630, "Post-6.3, returning a generator should throw" def test_db_options(db): From afe6bf19e2fc2ed4b33c8e8f6f8a4d6c05d13947 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 11 May 2020 12:53:19 -0700 Subject: [PATCH 10/25] Fix typos and whitespace --- fdbrpc/FailureMonitor.actor.cpp | 2 +- fdbrpc/FlowTransport.actor.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbrpc/FailureMonitor.actor.cpp b/fdbrpc/FailureMonitor.actor.cpp index f47dfc3d8a..799c4fda77 100644 --- a/fdbrpc/FailureMonitor.actor.cpp +++ b/fdbrpc/FailureMonitor.actor.cpp @@ -65,7 +65,7 @@ Future IFailureMonitor::onFailedFor(Endpoint const& endpoint, double susta } SimpleFailureMonitor::SimpleFailureMonitor() : endpointKnownFailed() { - // Mark ourselves as avaiable in FailureMonitor + // Mark ourselves as available in FailureMonitor const auto& localAddresses = FlowTransport::transport().getLocalAddresses(); addressStatus[localAddresses.address] = FailureStatus(false); if (localAddresses.secondaryAddress.present()) { diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index aa8fc5f25a..65ef314e9f 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -586,7 +586,7 @@ ACTOR Future connectionKeeper( Reference self, if (firstConnFailedTime.present()) { if (now() - firstConnFailedTime.get() > FLOW_KNOBS->PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT) { TraceEvent(SevWarnAlways, "PeerUnavailableForLongTime", conn ? conn->getDebugID() : UID()) - .suppressFor(1.0) + .suppressFor(1.0) .detail("PeerAddr", self->destination); firstConnFailedTime = now() - FLOW_KNOBS->PEER_UNAVAILABLE_FOR_LONG_TIME_TIMEOUT/2.0; } From c90ff6a6c3020619036e8742acc415279cc4a4bf Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 11 May 2020 18:17:57 -0700 Subject: [PATCH 11/25] revert loadBalanceDelay because it hurts read performance --- fdbserver/storageserver.actor.cpp | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 3f46be2958..74d87bc9cb 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -407,8 +407,6 @@ public: Reference> db; Database cx; ActorCollection actors; - Future loadBalanceDelay; - int sharedDelayCount; StorageServerMetrics metrics; CoalescedKeyRangeMap> byteSampleClears; @@ -604,7 +602,7 @@ public: rebootAfterDurableVersion(std::numeric_limits::max()), durableInProgress(Void()), versionLag(0), primaryLocality(tagLocalityInvalid), - updateEagerReads(0), sharedDelayCount(0), loadBalanceDelay(Void()), + updateEagerReads(0), shardChangeCounter(0), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), @@ -625,13 +623,6 @@ public: cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true); } - Future getLoadBalanceDelay() { - if(loadBalanceDelay.isReady() || ++sharedDelayCount > SERVER_KNOBS->MAX_SHARED_LOAD_BALANCE_DELAY) { - sharedDelayCount = 0; - loadBalanceDelay = delay(0, TaskPriority::DefaultEndpoint); - } - return loadBalanceDelay; - } //~StorageServer() { fclose(log); } // Puts the given shard into shards. The caller is responsible for adding shards @@ -926,7 +917,7 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( data->getLoadBalanceDelay() ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); @@ -1463,7 +1454,7 @@ ACTOR Future getKeyValuesQ( StorageServer* data, GetKeyValuesRequest req ) // // Placeholder for up-prioritizing fetches for important requests // taskType = TaskPriority::DefaultDelay; } else { - wait( data->getLoadBalanceDelay() ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); } try { @@ -1594,7 +1585,7 @@ ACTOR Future getKeyQ( StorageServer* data, GetKeyRequest req ) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( data->getLoadBalanceDelay() ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); try { state Version version = wait( waitForVersion( data, req.version ) ); From a8e0f1d581dc2e5074661808e3b96c1a495e9f5e Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 11 May 2020 18:20:46 -0700 Subject: [PATCH 12/25] removed knob --- fdbserver/Knobs.cpp | 1 - fdbserver/Knobs.h | 1 - 2 files changed, 2 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index ee804fff19..281b902213 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -546,7 +546,6 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( BEHIND_CHECK_COUNT, 2 ); init( BEHIND_CHECK_VERSIONS, 5 * VERSIONS_PER_SECOND ); init( WAIT_METRICS_WRONG_SHARD_CHANCE, isSimulated ? 1.0 : 0.1 ); - init( MAX_SHARED_LOAD_BALANCE_DELAY, 20 ); init( MIN_TAG_PAGES_READ_RATE, 1.0e4 ); if( randomize && BUGGIFY ) MIN_TAG_PAGES_READ_RATE = 0; init( READ_TAG_MEASUREMENT_INTERVAL, 30.0 ); if( randomize && BUGGIFY ) READ_TAG_MEASUREMENT_INTERVAL = 1.0; init( OPERATION_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) OPERATION_COST_BYTE_FACTOR = 4096; diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 781a0e8c8d..3fce535859 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -475,7 +475,6 @@ public: int BEHIND_CHECK_COUNT; int64_t BEHIND_CHECK_VERSIONS; double WAIT_METRICS_WRONG_SHARD_CHANCE; - int MAX_SHARED_LOAD_BALANCE_DELAY; int64_t MIN_TAG_PAGES_READ_RATE; double READ_TAG_MEASUREMENT_INTERVAL; int64_t OPERATION_COST_BYTE_FACTOR; From 696d95ebca203c25c2adbb35b81074746caae7f4 Mon Sep 17 00:00:00 2001 From: Xin Dong Date: Tue, 12 May 2020 08:12:15 -0700 Subject: [PATCH 13/25] Fix the possible dead loop when endKey got reset to equal to the beginKey. --- fdbserver/StorageMetrics.actor.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fdbserver/StorageMetrics.actor.h b/fdbserver/StorageMetrics.actor.h index a04007b6f9..41f51ac227 100644 --- a/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/StorageMetrics.actor.h @@ -436,7 +436,13 @@ struct StorageServerMetrics { IndexedSet::iterator endKey = byteSample.sample.index(byteSample.sample.sumTo(byteSample.sample.lower_bound(beginKey)) + baseChunkSize); while (endKey != byteSample.sample.end()) { - if (*endKey > shard.end) endKey = byteSample.sample.lower_bound(shard.end); + if (*endKey > shard.end) { + endKey = byteSample.sample.lower_bound(shard.end); + if (*endKey == beginKey) { + // No need to increment endKey since otherwise it would stuck here forever. + break; + } + } if (*endKey == beginKey) { ++endKey; continue; From f33af59a6fc5f5163578e8002e8d56955911cf7d Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Tue, 12 May 2020 07:45:12 -0700 Subject: [PATCH 14/25] Added support for building bindings via cmake --- build/docker-compose.yaml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/build/docker-compose.yaml b/build/docker-compose.yaml index 2f6fe49b42..a20dc2917e 100644 --- a/build/docker-compose.yaml +++ b/build/docker-compose.yaml @@ -60,15 +60,23 @@ services: snapshot-cmake: &snapshot-cmake <<: *build-setup - command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DCMAKE_COLOR_MAKEFILE=0 -DFDB_RELEASE=0 -DVALGRIND=0 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" "packages" "strip_targets" && cpack' + command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DFDB_RELEASE=0 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" "packages" "strip_targets" && cpack' prb-cmake: <<: *snapshot-cmake + snapshot-bindings-cmake: &snapshot-bindings-cmake + <<: *build-setup + command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DFDB_RELEASE=0 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" "bindings/all"' + + prb-bindings-cmake: + <<: *snapshot-bindings-cmake + + snapshot-ctest: &snapshot-ctest <<: *build-setup - command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DCMAKE_COLOR_MAKEFILE=0 -DFDB_RELEASE=1 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" && ctest -L fast -j "$${MAKEJOBS}" --output-on-failure' + command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DFDB_RELEASE=1 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" && ctest -L fast -j "$${MAKEJOBS}" --output-on-failure' prb-ctest: <<: *snapshot-ctest @@ -76,7 +84,7 @@ services: snapshot-correctness: &snapshot-correctness <<: *build-setup - command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DCMAKE_COLOR_MAKEFILE=0 -DFDB_RELEASE=1 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" && ctest -j "$${MAKEJOBS}" --output-on-failure' + command: scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash -c 'mkdir -p "$${BUILD_DIR}" && cd "$${BUILD_DIR}" && cmake -G "Ninja" -DFDB_RELEASE=1 /__this_is_some_very_long_name_dir_needed_to_fix_a_bug_with_debug_rpms__/foundationdb && ninja -v -j "$${MAKEJOBS}" && ctest -j "$${MAKEJOBS}" --output-on-failure' prb-correctness: <<: *snapshot-correctness From c496f6e21686ee844e8d1233d055dec402074aed Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Tue, 12 May 2020 07:45:37 -0700 Subject: [PATCH 15/25] Updated dockerfile to include newer version of git --- build/Dockerfile | 32 +++++++++++++++++--------------- build/docker-compose.yaml | 2 +- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index c8a84818b8..de7db14506 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -3,14 +3,16 @@ FROM centos:6 # Install dependencies for developer tools, bindings,\ # documentation, actorcompiler, and packaging tools\ RUN yum install -y yum-utils &&\ - yum-config-manager --enable rhel-server-rhscl-7-rpms &&\ - yum -y install centos-release-scl epel-release &&\ - yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \ - devtoolset-8-gcc-8.3.1-3.1.el6 devtoolset-8-gcc-c++-8.3.1-3.1.el6 \ - rh-python36-python-devel devtoolset-8-valgrind-devel \ - mono-core rh-ruby24 golang python27 rpm-build debbuild \ - python-pip dos2unix valgrind-devel ccache distcc devtoolset-8-libubsan-devel libubsan-devel &&\ - pip install boto3==1.1.1 + yum-config-manager --enable rhel-server-rhscl-7-rpms &&\ + yum -y install centos-release-scl epel-release &&\ + yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \ + devtoolset-8-gcc-8.3.1 devtoolset-8-gcc-c++-8.3.1 \ + devtoolset-8-libubsan-devel devtoolset-8-valgrind-devel \ + rh-python36-python-devel rh-ruby24 golang python27 rpm-build \ + mono-core debbuild python-pip dos2unix valgrind-devel ccache \ + distcc wget &&\ + yum install -y http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\ + pip install boto3==1.1.1 USER root @@ -24,8 +26,8 @@ RUN cd /opt/ &&\ tar -xjf boost_1_67_0.tar.bz2 &&\ rm -rf boost_1_67_0.tar.bz2 boost-sha-67.txt boost_1_67_0/libs &&\ curl -L https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.tar.bz2 -o boost_1_72_0.tar.bz2 &&\ - echo "59c9b274bc451cf91a9ba1dd2c7fdcaf5d60b1b3aa83f2c9fa143417cc660722 boost_1_72_0.tar.bz2" > boost-sha-72.txt &&\ - sha256sum -c boost-sha-72.txt &&\ + echo "59c9b274bc451cf91a9ba1dd2c7fdcaf5d60b1b3aa83f2c9fa143417cc660722 boost_1_72_0.tar.bz2" > boost-sha-72.txt &&\ + sha256sum -c boost-sha-72.txt &&\ tar -xjf boost_1_72_0.tar.bz2 &&\ rm -rf boost_1_72_0.tar.bz2 boost-sha-72.txt boost_1_72_0/libs @@ -34,8 +36,8 @@ RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.13.4/cmake-3.1 echo "563a39e0a7c7368f81bfa1c3aff8b590a0617cdfe51177ddc808f66cc0866c76 /tmp/cmake.tar.gz" > /tmp/cmake-sha.txt &&\ sha256sum -c /tmp/cmake-sha.txt &&\ cd /tmp && tar xf cmake.tar.gz &&\ - cp -r cmake-3.13.4-Linux-x86_64/* /usr/local/ &&\ - rm -rf cmake.tar.gz cmake-3.13.4-Linux-x86_64 cmake-sha.txt + cp -r cmake-3.13.4-Linux-x86_64/* /usr/local/ &&\ + rm -rf cmake.tar.gz cmake-3.13.4-Linux-x86_64 cmake-sha.txt # install Ninja RUN cd /tmp && curl -L https://github.com/ninja-build/ninja/archive/v1.9.0.zip -o ninja.zip &&\ @@ -48,11 +50,11 @@ RUN cd /tmp && curl -L https://www.openssl.org/source/openssl-1.1.1d.tar.gz -o o sha256sum -c openssl-sha.txt && tar -xzf openssl.tar.gz &&\ cd openssl-1.1.1d && scl enable devtoolset-8 -- ./config CFLAGS="-fPIC -O3" --prefix=/usr/local &&\ scl enable devtoolset-8 -- make -j`nproc` && scl enable devtoolset-8 -- make -j1 install &&\ - ln -sv /usr/local/lib64/lib*.so.1.1 /usr/lib64/ &&\ + ln -sv /usr/local/lib64/lib*.so.1.1 /usr/lib64/ &&\ cd /tmp/ && rm -rf /tmp/openssl-1.1.1d /tmp/openssl.tar.gz -LABEL version=0.1.12 -ENV DOCKER_IMAGEVER=0.1.12 +LABEL version=0.1.13 +ENV DOCKER_IMAGEVER=0.1.13 ENV JAVA_HOME=/usr/lib/jvm/java-1.8.0 ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/g++ diff --git a/build/docker-compose.yaml b/build/docker-compose.yaml index a20dc2917e..87774da9f4 100644 --- a/build/docker-compose.yaml +++ b/build/docker-compose.yaml @@ -2,7 +2,7 @@ version: "3" services: common: &common - image: foundationdb/foundationdb-build:0.1.12 + image: foundationdb/foundationdb-build:0.1.13 build-setup: &build-setup <<: *common From bdc58eaa11177ecf4dae508f2c0ba89fe977dd88 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 12 May 2020 11:16:35 -0700 Subject: [PATCH 16/25] Fix invalid memory access in ReadHotDetection test due to memory being stored in a non-standalone stringref. --- fdbserver/workloads/ReadHotDetection.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/workloads/ReadHotDetection.actor.cpp b/fdbserver/workloads/ReadHotDetection.actor.cpp index 5e00314765..d6c589e059 100644 --- a/fdbserver/workloads/ReadHotDetection.actor.cpp +++ b/fdbserver/workloads/ReadHotDetection.actor.cpp @@ -73,7 +73,7 @@ struct ReadHotDetectionWorkload : TestWorkload { loop { try { for (int i = 0; i < self->keyCount; i++) { - auto key = StringRef(format("testkey%08x", i)); + Standalone key = StringRef(format("testkey%08x", i)); if (key == self->readKey) { tr.set(key, largeValue); } else { From f148412a328abf6b19193addba2de97b5bffcc96 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 12 May 2020 16:59:20 -0700 Subject: [PATCH 17/25] Make UPDATE_STORAGE_BYTE_LIMIT the reference spill variety. Which is unrelated, but a change I was supposed to do a while ago and forgot. --- fdbserver/OldTLogServer_6_2.actor.cpp | 2 +- fdbserver/TLogServer.actor.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index c33d1c3471..d8a31ee999 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1105,7 +1105,7 @@ ACTOR Future updateStorage( TLogData* self ) { commitLockReleaser.release(); } - if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { + if( totalSize < SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT ) { wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 242d5f90cf..a2850f2dcf 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1197,7 +1197,7 @@ ACTOR Future updateStorage( TLogData* self ) { commitLockReleaser.release(); } - if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { + if( totalSize < SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT ) { wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { From 283fd3af271af9c9cbe6c3e8c599ce8c9c858097 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 12 May 2020 17:01:52 -0700 Subject: [PATCH 18/25] Add a knob which controls writing prefix compressed kvs mem snapshots. Which will be set to on by default in 7.0 --- fdbserver/KeyValueStoreMemory.actor.cpp | 2 +- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fdbserver/KeyValueStoreMemory.actor.cpp b/fdbserver/KeyValueStoreMemory.actor.cpp index bd46d08e1c..284257973f 100644 --- a/fdbserver/KeyValueStoreMemory.actor.cpp +++ b/fdbserver/KeyValueStoreMemory.actor.cpp @@ -743,7 +743,7 @@ private: // Get the common prefix between this key and the previous one, or 0 if there was no previous one. int commonPrefix; - if(useDelta) { + if(useDelta && SERVER_KNOBS->PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS) { commonPrefix = commonPrefixLength(lastSnapshotKeyA, lastSnapshotKeyB); } else { diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 281b902213..f1b1b26034 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -549,6 +549,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( MIN_TAG_PAGES_READ_RATE, 1.0e4 ); if( randomize && BUGGIFY ) MIN_TAG_PAGES_READ_RATE = 0; init( READ_TAG_MEASUREMENT_INTERVAL, 30.0 ); if( randomize && BUGGIFY ) READ_TAG_MEASUREMENT_INTERVAL = 1.0; init( OPERATION_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) OPERATION_COST_BYTE_FACTOR = 4096; + init( PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS, false ); if( randomize && BUGGIFY ) PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS = true; //Wait Failure init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2; diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 3fce535859..0569660c40 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -478,6 +478,7 @@ public: int64_t MIN_TAG_PAGES_READ_RATE; double READ_TAG_MEASUREMENT_INTERVAL; int64_t OPERATION_COST_BYTE_FACTOR; + bool PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS; //Wait Failure int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS; From 634c9880592a4943374a32e4df8530411c07d11f Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 12 May 2020 18:09:43 -0700 Subject: [PATCH 19/25] Tag throttles reacted poorly to the rate being set to max --- fdbclient/DatabaseContext.h | 10 +++++++++- fdbserver/Ratekeeper.actor.cpp | 28 +++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 50b76b6206..97c89b12bc 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -52,6 +52,7 @@ private: double tpsRate; double expiration; double lastCheck; + bool rateSet = false; Smoother smoothRate; Smoother smoothReleased; @@ -68,7 +69,14 @@ public: void update(ClientTagThrottleLimits const& limits) { ASSERT(limits.tpsRate >= 0); this->tpsRate = limits.tpsRate; - smoothRate.setTotal(limits.tpsRate); + + if(!rateSet || expired()) { + rateSet = true; + smoothRate.reset(limits.tpsRate); + } + else { + smoothRate.setTotal(limits.tpsRate); + } expiration = limits.expiration; } diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 5c0fadf323..99ff36b20b 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -141,8 +141,9 @@ private: // Only used by auto-throttles double created = now(); - double lastUpdated = now(); + double lastUpdated = 0; double lastReduced = now(); + bool rateSet = false; RkTagThrottleData() : clientRate(CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW) {} @@ -157,13 +158,26 @@ private: Optional updateAndGetClientRate(Optional requestRate) { if(limits.expiration > now()) { - clientRate.setTotal(getTargetRate(requestRate)); + double targetRate = getTargetRate(requestRate); + if(targetRate == std::numeric_limits::max()) { + rateSet = false; + return Optional(); + } + if(!rateSet) { + rateSet = true; + clientRate.reset(targetRate); + } + else { + clientRate.setTotal(targetRate); + } + double rate = clientRate.smoothTotal(); ASSERT(rate >= 0); return rate; } else { TEST(true); // Get throttle rate for expired throttle + rateSet = false; return Optional(); } } @@ -246,10 +260,14 @@ public: throttle.limits.tpsRate = tpsRate.get(); throttle.limits.expiration = expiration.get(); - Optional clientRate = throttle.updateAndGetClientRate(getRequestRate(tag)); - ASSERT(clientRate.present()); + throttle.updateAndGetClientRate(getRequestRate(tag)); - return tpsRate.get(); + if(tpsRate.get() != std::numeric_limits::max()) { + return tpsRate.get(); + } + else { + return Optional(); + } } void manualThrottleTag(TransactionTag const& tag, TransactionPriority priority, double tpsRate, double expiration, Optional const& oldLimits) { From f5aef706f66ba4cb37198e9d3e640d10bd94d747 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 12 May 2020 16:38:43 -0700 Subject: [PATCH 20/25] FastRestore:Delay leader election until restore requests are set --- fdbserver/RestoreApplier.actor.cpp | 4 ++-- fdbserver/RestoreMaster.actor.cpp | 34 ++++++++++++++--------------- fdbserver/RestoreWorker.actor.cpp | 35 ++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index ccedec991f..42d1c6511b 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -235,8 +235,8 @@ ACTOR static Future getAndComputeStagingKeys( wait(waitForAll(fValues)); break; } catch (Error& e) { - if (retries++ > 10) { - TraceEvent(SevError, "FastRestoreApplierGetAndComputeStagingKeysGetKeysStuck", applierID) + if (retries++ > 10) { // TODO: Can we stop retry at the first error? + TraceEvent(SevWarn, "FastRestoreApplierGetAndComputeStagingKeysGetKeysStuck", applierID) .detail("BatchIndex", batchIndex) .detail("GetKeys", incompleteStagingKeys.size()) .error(e); diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index e37f6869d3..5c0693a9f4 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -633,32 +633,32 @@ ACTOR static Future>> collectRestoreRequest state Future watch4RestoreRequest; state ReadYourWritesTransaction tr(cx); - // wait for the restoreRequestTriggerKey to be set by the client/test workload + // restoreRequestTriggerKey should already been set loop { try { TraceEvent("FastRestoreMasterPhaseCollectRestoreRequestsWait"); tr.reset(); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::LOCK_AWARE); + + // Sanity check Optional numRequests = wait(tr.get(restoreRequestTriggerKey)); - if (!numRequests.present()) { - watch4RestoreRequest = tr.watch(restoreRequestTriggerKey); - wait(tr.commit()); - wait(watch4RestoreRequest); - } else { - Standalone restoreRequestValues = - wait(tr.getRange(restoreRequestKeys, CLIENT_KNOBS->TOO_MANY)); - ASSERT(!restoreRequestValues.more); - if (restoreRequestValues.size()) { - for (auto& it : restoreRequestValues) { - restoreRequests.push_back(restoreRequests.arena(), decodeRestoreRequestValue(it.value)); - TraceEvent("FastRestoreMasterPhaseCollectRestoreRequests") - .detail("RestoreRequest", restoreRequests.back().toString()); - } - } else { - TraceEvent(SevWarnAlways, "FastRestoreMasterPhaseCollectRestoreRequestsEmptyRequests"); + ASSERT(numRequests.present()); + + Standalone restoreRequestValues = + wait(tr.getRange(restoreRequestKeys, CLIENT_KNOBS->TOO_MANY)); + ASSERT(!restoreRequestValues.more); + if (restoreRequestValues.size()) { + for (auto& it : restoreRequestValues) { + restoreRequests.push_back(restoreRequests.arena(), decodeRestoreRequestValue(it.value)); + TraceEvent("FastRestoreMasterPhaseCollectRestoreRequests") + .detail("RestoreRequest", restoreRequests.back().toString()); } break; + } else { + // TODO: Change this to SevError + TraceEvent(SevWarnAlways, "FastRestoreMasterPhaseCollectRestoreRequestsEmptyRequests"); + wait(delay(5.0)); } } catch (Error& e) { wait(tr.onError(e)); diff --git a/fdbserver/RestoreWorker.actor.cpp b/fdbserver/RestoreWorker.actor.cpp index c718751b9c..1896c8c6b5 100644 --- a/fdbserver/RestoreWorker.actor.cpp +++ b/fdbserver/RestoreWorker.actor.cpp @@ -248,6 +248,39 @@ ACTOR Future startRestoreWorker(Reference self, Restore return Void(); } +ACTOR static Future waitOnRestoreRequests(Database cx, UID nodeID = UID()) { + state Future watch4RestoreRequest; + state ReadYourWritesTransaction tr(cx); + state Optional numRequests; + + // wait for the restoreRequestTriggerKey to be set by the client/test workload + TraceEvent("FastRestoreWaitOnRestoreRequest", nodeID); + loop { + try { + tr.reset(); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + Optional _numRequests = wait(tr.get(restoreRequestTriggerKey)); + numRequests = _numRequests; + if (!numRequests.present()) { + watch4RestoreRequest = tr.watch(restoreRequestTriggerKey); + wait(tr.commit()); + TraceEvent(SevInfo, "FastRestoreWaitOnRestoreRequestTriggerKey", nodeID); + wait(watch4RestoreRequest); + TraceEvent(SevInfo, "FastRestoreDetectRestoreRequestTriggerKeyChanged", nodeID); + } else { + TraceEvent(SevInfo, "FastRestoreRestoreRequestTriggerKey", nodeID) + .detail("TriggerKey", numRequests.get().toString()); + break; + } + } catch (Error& e) { + wait(tr.onError(e)); + } + } + + return Void(); +} + // RestoreMaster is the leader ACTOR Future monitorleader(Reference> leader, Database cx, RestoreWorkerInterface myWorkerInterf) { @@ -331,6 +364,8 @@ ACTOR Future _restoreWorker(Database cx, LocalityData locality) { .detail("TxnBatchSize", SERVER_KNOBS->FASTRESTORE_TXN_BATCH_MAX_BYTES) .detail("VersionBatchSize", SERVER_KNOBS->FASTRESTORE_VERSIONBATCH_MAX_BYTES); + wait(waitOnRestoreRequests(cx, myWorkerInterf.id())); + wait(monitorleader(leader, cx, myWorkerInterf)); TraceEvent("FastRestoreWorker", myWorkerInterf.id()).detail("LeaderElection", "WaitForLeader"); From 0a49dbd6d09e900f32890588382d009bb7fe9d84 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 12 May 2020 19:27:46 -0700 Subject: [PATCH 21/25] FastRestore:Check restore request must have been set when master needs it --- fdbserver/RestoreMaster.actor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index 5c0693a9f4..de1fc909f2 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -656,8 +656,7 @@ ACTOR static Future>> collectRestoreRequest } break; } else { - // TODO: Change this to SevError - TraceEvent(SevWarnAlways, "FastRestoreMasterPhaseCollectRestoreRequestsEmptyRequests"); + TraceEvent(SevError, "FastRestoreMasterPhaseCollectRestoreRequestsEmptyRequests"); wait(delay(5.0)); } } catch (Error& e) { From 31c298c5de0bf0ff89af18a34a8ea1af8f243858 Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Tue, 12 May 2020 23:02:51 -0700 Subject: [PATCH 22/25] Integrated git repository with others --- build/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/Dockerfile b/build/Dockerfile index de7db14506..928f2924dc 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -4,14 +4,14 @@ FROM centos:6 # documentation, actorcompiler, and packaging tools\ RUN yum install -y yum-utils &&\ yum-config-manager --enable rhel-server-rhscl-7-rpms &&\ - yum -y install centos-release-scl epel-release &&\ + yum -y install centos-release-scl epel-release \ + http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\ yum -y install devtoolset-8-8.1-1.el6 java-1.8.0-openjdk-devel \ devtoolset-8-gcc-8.3.1 devtoolset-8-gcc-c++-8.3.1 \ devtoolset-8-libubsan-devel devtoolset-8-valgrind-devel \ rh-python36-python-devel rh-ruby24 golang python27 rpm-build \ mono-core debbuild python-pip dos2unix valgrind-devel ccache \ - distcc wget &&\ - yum install -y http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\ + distcc wget git &&\ pip install boto3==1.1.1 USER root From c85b302e59f834d0f9999c6f6ed6d5c7a380929a Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Wed, 29 Apr 2020 10:35:07 -0700 Subject: [PATCH 23/25] Added Dockerfile used to build a development docker image --- build/Dockerfile.devel | 70 ++++++++++++++++++++++++++++++++++++++++ build/artifacts/.gitkeep | 0 2 files changed, 70 insertions(+) create mode 100644 build/Dockerfile.devel create mode 100644 build/artifacts/.gitkeep diff --git a/build/Dockerfile.devel b/build/Dockerfile.devel new file mode 100644 index 0000000000..1181b823dd --- /dev/null +++ b/build/Dockerfile.devel @@ -0,0 +1,70 @@ +FROM foundationdb/foundationdb-build:0.1.12 + +USER root + +ARG FDB_ARTIFACTSURL=http://foundationdb.org +ADD artifacts /mnt/artifacts + +# Install newer version of git and build tools for building via make +RUN \ + yum install -y http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\ + yum install -y git distcc-server gperf rubygems python34 libmpc-devel npm wget + +# Download and install llvm-10.0.0 +RUN cd / &&\ + curl -L $FDB_ARTIFACTSURL/downloads/docker/foundationdb-dev/LLVM-10.0.0-Linux.rpm > /mnt/artifacts/LLVM-10.0.0-Linux.rpm &&\ + yum install -y /mnt/artifacts/LLVM-10.0.0-Linux.rpm &&\ + curl -L $FDB_ARTIFACTSURL/downloads/docker/foundationdb-dev/gcc910.conf > /etc/ld.so.conf.d/gcc910.conf + +# Download and install gcc-9.3.0 +RUN cd / &&\ + curl -L $FDB_ARTIFACTSURL/downloads/docker/foundationdb-dev/gcc-9.3.0.tar.gz | tar -xvz + +# Download and install distcc 3.3.2 new centos binaries +RUN cd / &&\ + curl -L $FDB_ARTIFACTSURL/downloads/docker/foundationdb-dev/distcc-3.3.2-centos.tar.gz | tar -xvz &&\ + mkdir -p /usr/lib/gcc-cross &&\ + update-distcc-symlinks &&\ + mv -iv /usr/bin/distcc /usr/bin/distcc.orig &&\ + mv -iv /usr/bin/distccd /usr/bin/distccd.orig &&\ + mv -iv /usr/bin/distccmon-text /usr/bin/distccmon-text.orig + +# Replace the clang and gcc links with dereferenced file +# Add md5sum links to compilers to allow unique id of binary +# Copy new devtoolset tools to /usr/local/bin +RUN cp -iv /usr/local/bin/clang++ /usr/local/bin/clang++.deref &&\ + mv -iv /usr/local/bin/clang++ /usr/local/bin/clang++.lnk &&\ + mv -iv /usr/local/bin/clang++.deref /usr/local/bin/clang++ &&\ + cp -iv /usr/local/bin/clang /usr/local/bin/clang.deref &&\ + mv -iv /usr/local/bin/clang /usr/local/bin/clang.lnk &&\ + mv -iv /usr/local/bin/clang.deref /usr/local/bin/clang &&\ + cp -iv /usr/local/bin/g++ /usr/local/bin/g++.deref &&\ + mv -iv /usr/local/bin/g++ /usr/local/bin/g++.lnk &&\ + mv -iv /usr/local/bin/g++.deref /usr/local/bin/g++ &&\ + cp -iv /usr/local/bin/gcc /usr/local/bin/gcc.deref &&\ + mv -iv /usr/local/bin/gcc /usr/local/bin/gcc.lnk &&\ + mv -iv /usr/local/bin/gcc.deref /usr/local/bin/gcc &&\ + for compiler in /usr/local/bin/gcc /usr/local/bin/g++ /opt/rh/devtoolset-8/root/usr/bin/g++ /opt/rh/devtoolset-8/root/usr/bin/gcc /usr/local/bin/clang /usr/local/bin/clang++; do md5file=$(md5sum "${compiler}" | cut -d\ -f1) && ln -sv "${compiler##*\/}" "${compiler}.${md5file:0:8}"; done &&\ + for toolexe in addr2line ar as ld gdb valgrind; do cp -iv "/opt/rh/devtoolset-8/root/usr/bin/${toolexe}" "/usr/local/bin/${toolexe}"; done &&\ + ldconfig &&\ + rm -rf /mnt/artifacts + +LABEL version=0.11.4 +ENV DOCKER_IMAGEVER=0.11.4 + +ENV CLANGCC=/usr/local/bin/clang.457a3d3d +ENV CLANGCXX=/usr/local/bin/clang++.457a3d3d +ENV GCC80CC=/opt/rh/devtoolset-8/root/usr/bin/gcc.b4f14270 +ENV GCC80CXX=/opt/rh/devtoolset-8/root/usr/bin/g++.0bea8e1c +ENV GCC93CC=/usr/local/bin/gcc.04edd07a +ENV GCC93CXX=/usr/local/bin/g++.b058d8c5 +ENV CC=/usr/local/bin/clang.457a3d3d +ENV CXX=/usr/local/bin/clang++.457a3d3d +ENV USE_LD=LLD +ENV USE_CCACHE=1 +ENV USE_LIBCXX=1 +ENV CCACHE_NOHASHDIR=true +ENV CCACHE_UMASK=0000 +ENV CCACHE_SLOPPINESS="file_macro,time_macros,include_file_mtime,include_file_ctime,file_stat_matches" + +CMD scl enable devtoolset-8 rh-python36 rh-ruby24 -- bash diff --git a/build/artifacts/.gitkeep b/build/artifacts/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 From afec69101b17c262e1fd049e29026fe6f810d666 Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Mon, 4 May 2020 05:31:22 -0700 Subject: [PATCH 24/25] Updated the version --- build/Dockerfile.devel | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/build/Dockerfile.devel b/build/Dockerfile.devel index 1181b823dd..22cd9a83e9 100644 --- a/build/Dockerfile.devel +++ b/build/Dockerfile.devel @@ -49,19 +49,18 @@ RUN cp -iv /usr/local/bin/clang++ /usr/local/bin/clang++.deref &&\ ldconfig &&\ rm -rf /mnt/artifacts -LABEL version=0.11.4 -ENV DOCKER_IMAGEVER=0.11.4 +LABEL version=0.11.5 +ENV DOCKER_IMAGEVER=0.11.5 -ENV CLANGCC=/usr/local/bin/clang.457a3d3d -ENV CLANGCXX=/usr/local/bin/clang++.457a3d3d +ENV CLANGCC=/usr/local/bin/clang.de8a65ef +ENV CLANGCXX=/usr/local/bin/clang++.de8a65ef ENV GCC80CC=/opt/rh/devtoolset-8/root/usr/bin/gcc.b4f14270 ENV GCC80CXX=/opt/rh/devtoolset-8/root/usr/bin/g++.0bea8e1c ENV GCC93CC=/usr/local/bin/gcc.04edd07a ENV GCC93CXX=/usr/local/bin/g++.b058d8c5 -ENV CC=/usr/local/bin/clang.457a3d3d -ENV CXX=/usr/local/bin/clang++.457a3d3d +ENV CC=/usr/local/bin/clang.de8a65ef +ENV CXX=/usr/local/bin/clang++.de8a65ef ENV USE_LD=LLD -ENV USE_CCACHE=1 ENV USE_LIBCXX=1 ENV CCACHE_NOHASHDIR=true ENV CCACHE_UMASK=0000 From e10e09f9fae3714b5b10a0d0cd704212f93e76c3 Mon Sep 17 00:00:00 2001 From: Alvin Moore Date: Thu, 14 May 2020 01:45:25 -0700 Subject: [PATCH 25/25] Removed the git installation since in inheritted docker image --- build/Dockerfile.devel | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/build/Dockerfile.devel b/build/Dockerfile.devel index 22cd9a83e9..339782edb0 100644 --- a/build/Dockerfile.devel +++ b/build/Dockerfile.devel @@ -1,14 +1,13 @@ -FROM foundationdb/foundationdb-build:0.1.12 +FROM foundationdb/foundationdb-build:0.1.13 USER root ARG FDB_ARTIFACTSURL=http://foundationdb.org ADD artifacts /mnt/artifacts -# Install newer version of git and build tools for building via make +# Install build tools for building via make RUN \ - yum install -y http://opensource.wandisco.com/centos/6/git/x86_64/wandisco-git-release-6-1.noarch.rpm &&\ - yum install -y git distcc-server gperf rubygems python34 libmpc-devel npm wget + yum install -y distcc-server gperf rubygems python34 libmpc-devel npm # Download and install llvm-10.0.0 RUN cd / &&\ @@ -49,13 +48,13 @@ RUN cp -iv /usr/local/bin/clang++ /usr/local/bin/clang++.deref &&\ ldconfig &&\ rm -rf /mnt/artifacts -LABEL version=0.11.5 -ENV DOCKER_IMAGEVER=0.11.5 +LABEL version=0.11.6 +ENV DOCKER_IMAGEVER=0.11.6 ENV CLANGCC=/usr/local/bin/clang.de8a65ef ENV CLANGCXX=/usr/local/bin/clang++.de8a65ef -ENV GCC80CC=/opt/rh/devtoolset-8/root/usr/bin/gcc.b4f14270 -ENV GCC80CXX=/opt/rh/devtoolset-8/root/usr/bin/g++.0bea8e1c +ENV GCC80CC=/opt/rh/devtoolset-8/root/usr/bin/gcc.00f99754 +ENV GCC80CXX=/opt/rh/devtoolset-8/root/usr/bin/g++.12c01dd6 ENV GCC93CC=/usr/local/bin/gcc.04edd07a ENV GCC93CXX=/usr/local/bin/g++.b058d8c5 ENV CC=/usr/local/bin/clang.de8a65ef